In [None]:
import praw
import math
import json
import re
import os
import urllib.request
import numpy as np
import scipy as sp
import pandas as pd
from datetime import datetime
from textblob import TextBlob
from spellchecker import SpellChecker
from collections import defaultdict
from praw.models import MoreComments, Redditor, Submission
from prawcore.exceptions import *
from praw.reddit import models
from typing import List, Dict
from pandas import DataFrame as Df
from collections import Counter
from Analysis.FeatureBuilder import *
from Ingest.pushShift import *
from Ingest.Reddit import *
from Analysis import Common
import plotly.graph_objs as go
from tqdm import tqdm
import plotly.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly import offline
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from xgboost import XGBClassifier
from xgboost import plot_tree
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
init_notebook_mode(connected=True)
%load_ext autoreload
%autoreload
tqdm.pandas()



In [None]:
# Key handed to the project loader; 'combined' presumably selects the merged
# multi-post dataset — confirm against Common.load_dataset_for_post.
post_id = 'combined'
post_dataset = Common.load_dataset_for_post(post_id)
# Quick size check of what was loaded.
print(len(post_dataset))

In [None]:
# Columns kept in the modelling frame. 'comment_id' and 'post_id' lead the list
# so they can be sliced off positionally; everything after them is a feature.
training_data_features = ['comment_id', 'post_id', 'golds', 'comment_char_count',
       'comment_text_polarity',
       'network_comment_thread_max_depth',
       'network_user_total_comment_count', 'news_subreddit_comment_karma',
       'politics_subreddit_post_karma', 'left_subreddit_comment_count',
       'news_subreddit_comment_count', 'comment_has_user_ref',
       'user_email_verified', 'user_total_comment_count',
       'comment_spelling_error_count', 'right_subreddit_comment_count',
       'network_user_thread_comment_count', 'user_account_age_seconds',
       'center_subreddit_comment_karma', 'post_comment_timedelta_seconds',
       'right_subreddit_post_karma', 'comment_url_refer_count',
       'center_subreddit_comment_count', 'comment_has_citation',
       'center_subreddit_post_karma', 'politics_subreddit_comment_count',
       'politics_subreddit_post_count', 'user_total_post_karma',
       'politics_subreddit_comment_karma',
       'left_subreddit_comment_karma', 'user_total_post_count',
       'comment_text_profanity', 'network_comment_thread_size',
       'user_total_comment_karma', 'network_comment_thread_top_level_count',
       'network_user_top_level_comment_count', 'left_subreddit_post_count',
       'news_subreddit_post_count', 'right_subreddit_comment_karma',
       'left_subreddit_post_karma', 'comment_text_subjectivity',
       'right_subreddit_post_count', 'center_subreddit_post_count',
       'news_subreddit_post_karma']


def gen_model_for(dataset: Df, features=None, low_score=-10, high_score=30) -> Df:
    """Build the labelled modelling frame from a scored comment dataset.

    Rows whose ``score`` is below ``low_score`` are labelled
    ``misinformation = 1``; rows whose ``score`` is above ``high_score`` are
    labelled ``misinformation = 0``. Rows in between are dropped.

    Args:
        dataset: frame holding a ``score`` column plus every requested feature
            column. It is not modified.
        features: columns to keep, in order. Defaults to
            ``training_data_features``.
        low_score: exclusive upper bound on ``score`` for the positive class.
        high_score: exclusive lower bound on ``score`` for the negative class.

    Returns:
        A new frame with a fresh 0..n-1 index containing ``features`` followed
        by ``misinformation``. Positive rows come first; each group is ordered
        by ascending score.
    """
    if features is None:
        features = training_data_features
    # Sort a copy: the original sorted in place and silently reordered the
    # caller's frame.
    ranked = dataset.sort_values(by=['score'])
    # .assign returns new frames, so no column is written onto a filtered slice
    # (the source of pandas' SettingWithCopyWarning).
    top = ranked[ranked.score < low_score].assign(misinformation=1)      # Dependent variable
    print(len(top))
    bottom = ranked[ranked.score > high_score].assign(misinformation=0)  # Dependent variable
    print(len(bottom))
    labelled = pd.concat([top, bottom]).reset_index(drop=True)
    return labelled[list(features) + ['misinformation']]


### Splitting the Dataset and Feature Scaling

In [None]:
# Build the labelled frame, then slice positionally: columns 0-1 are the
# comment_id/post_id identifiers and the last column is the misinformation label.
X_frame  = gen_model_for(post_dataset)
X = X_frame.iloc[:, 2:-1].values
y = X_frame.iloc[:, -1].values
# Hold out 20% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# The scaler is fitted on the training split only and then applied to the test
# split, so no test statistics reach the fit.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
print(f'Training dataset is: {len(X_train)} rows')
# Aliases that the PCA / KernelPCA cells overwrite with projected data.
train_X, test_X = X_train, X_test

### Dimensionality Reduction

In [None]:
def do_lda(train_X, train_Y, test_X, n_components=2):
    """Project train and test features onto linear discriminant axes.

    The projection is fitted on the training data only and then applied to
    the test data.

    Args:
        train_X: 2-D array-like of training features (rows are samples).
        train_Y: class labels aligned with ``train_X``.
        test_X: 2-D array-like of test features with the same columns.
        n_components: upper bound on the number of discriminant axes to keep.

    Returns:
        Tuple ``(train_projected, test_projected)``.
    """
    n_features = np.asarray(train_X).shape[1]
    n_classes = np.unique(train_Y).size
    # LDA can produce at most min(n_features, n_classes - 1) axes. With a
    # binary label that is 1, so a fixed request for 2 is not satisfiable;
    # clamp the request instead of passing an invalid value through.
    n_components = min(n_components, n_features, n_classes - 1)
    lda = LinearDiscriminantAnalysis(n_components=n_components)
    train_X = lda.fit_transform(train_X, train_Y)
    test_X = lda.transform(test_X)
    return train_X, test_X

In [None]:
# n_components=None keeps every component, so this re-expresses the scaled
# features on the principal axes without discarding any dimension.
pca = PCA(n_components=None)  
# Fit on the training split only; the test split reuses the fitted axes.
train_X = pca.fit_transform(X_train)
test_X = pca.transform(X_test)
# Disabled: cumulative explained-variance curve for choosing a component count.
# cumsum = np.cumsum(pca.explained_variance_ratio_)
# trace = go.Scatter(x=[i for i in range(len(cumsum))], y=cumsum,
#                      marker=dict(color='rgb(150, 25, 120)'))
# fig = go.Figure(data=[trace])
# plot(fig, filename='cdf-dataset')

In [None]:
# RBF-kernel PCA keeping three components. Running this cell replaces the
# train_X / test_X produced by the linear PCA cell.
kpca = KernelPCA(kernel='rbf', n_components=3)  
train_X = kpca.fit_transform(X_train)
test_X = kpca.transform(X_test)


### Visualizing the Principal Components

In [None]:
def visualize_principal_axis(X, Y, title):
    """Build a 2-D scatter figure of the first two columns of X, split by label.

    Args:
        X: sequence of rows; only elements 0 and 1 of each row are plotted.
        Y: labels aligned row-for-row with ``X``. 0 is the non-misinformation
            (high-score) class; any other value is treated as misinformation.
        title: figure title.

    Returns:
        dict with ``data`` (two ``go.Scatter`` traces, class 0 first) and
        ``layout`` keys, ready to hand to plotly's ``plot``/``iplot``.
    """
    credible_x, credible_y = [], []
    not_credible_x, not_credible_y = [], []
    for point, label in zip(X, Y):
        if label == 0:
            credible_x.append(point[0])
            credible_y.append(point[1])
        else:
            not_credible_x.append(point[0])
            not_credible_y.append(point[1])
    # Label 0 is assigned to high-score comments and 1 to misinformation, so
    # class 0 is the credible trace; the legend names used to be swapped.
    trace0 = go.Scatter(
        x=credible_x,
        y=credible_y,
        mode='markers',
        name="Credible"
    )
    trace1 = go.Scatter(
        x=not_credible_x,
        y=not_credible_y,
        mode='markers',
        name="Not Credible"
    )
    layout = dict(
        title=title,
    )
    return dict(data=[trace0, trace1], layout=layout)

# Plot projected test rows against their own labels. The previous call paired
# X_test with y_train, so points were coloured by labels from different rows,
# and it plotted the scaled raw features rather than the projection.
# test_X holds the output of whichever reduction cell ran last.
fig = visualize_principal_axis(test_X, y_test, "PCA")
plot(fig, filename='pca')

### Performance Metrics

In [None]:
def run_metrics(estimator, trainX, testX, trainY, testY, cv=10):
    """Print confusion matrices and k-fold cross-validation accuracy.

    Args:
        estimator: an already fitted classifier exposing ``predict``; it is
            also handed to ``cross_val_score``, which fits clones of it.
        trainX: training features.
        testX: held-out test features.
        trainY: training labels.
        testY: held-out test labels.
        cv: number of cross-validation folds run on the training data.

    Returns:
        None. All results are printed.
    """
    # Performance Evaluation - Confusion Matrix Test Evaluation
    pred_test_y = estimator.predict(testX)
    cm = confusion_matrix(testY, pred_test_y)
    print(f'Test Data Confusion Matrix: {cm}')
    # Performance Evaluation - Confusion Matrix Train Evaluation
    pred_train_y = estimator.predict(trainX)
    cm = confusion_matrix(trainY, pred_train_y)
    print(f'Train Data Confusion Matrix: {cm}')
    # Performance Evaluation - k-fold cross-validation on the training data.
    # The labels used to say "K-Means" (a clustering algorithm) and "Variance"
    # although the value printed is the standard deviation of fold accuracies.
    accuracies = cross_val_score(estimator=estimator, X=trainX, y=trainY, cv=cv)
    print(f'{cv}-Fold CV Accuracy Mean: {accuracies.mean()}')
    print(f'{cv}-Fold CV Accuracy Std Dev: {accuracies.std()}')

### SVM Testbench

In [None]:
# RBF-kernel SVM; probability=True is what makes predict_proba available.
# C and gamma presumably come from the grid search below — confirm.
classifier = SVC(kernel = 'rbf', C=2.1, gamma= 0.04, probability=True)
classifier.fit(X_train, y_train)
run_metrics(classifier, X_train, X_test, y_train, y_test)

#### Hyperparameter Tuning with Grid Search: SVM

In [None]:
# C_vals = [0.1 * i+1 for i in range(20)]
def num_generator(start_num, intervals, interval_size):
    """Return an evenly stepped list of ``intervals + 1`` numbers.

    The list starts at ``start_num``; each later entry is the previous entry
    plus ``interval_size``. A non-positive ``intervals`` yields ``[start_num]``.
    """
    values = [start_num]
    for _ in range(intervals):
        # Step from the previous entry rather than multiplying from the start.
        values.append(values[-1] + interval_size)
    return values

In [None]:
C_vals = num_generator(0.1, 20, 0.1)
# Sweep gamma over 0.01 .. 0.10. The sweep used to start at 0, which SVC either
# rejects or turns into a degenerate constant kernel, depending on the
# scikit-learn version.
gamma = num_generator(0.01, 9, 0.01)
parameters = [
#     {
#         'C': C_vals,
#         'kernel': ['linear']
#     },
    {
        'C': C_vals,
        'kernel': [
            'rbf',
            'poly',
            'sigmoid',
        ],
        'gamma': gamma
    }
]
# Search over a fresh SVC instead of whatever `classifier` currently holds in
# the kernel: after later cells have run it is a tree model that does not
# accept these parameters. probability=True is also left off because it only
# adds calibration cost and does not change accuracy scoring.
grid_search = GridSearchCV(estimator=SVC(), param_grid=parameters, scoring='accuracy', cv=10, n_jobs=-1, verbose=10)
grid_search = grid_search.fit(X_train, y_train)
print(grid_search.best_score_, grid_search.best_params_)

### Random Forest

#### Hyperparameter Tuning: Random Forest

In [None]:
# Sweep the number of trees from 1 to 99 with the entropy split criterion.
parameters = [
    {
        'n_estimators': list(range(1, 100)),
        'criterion': ['entropy']
    }
]
# Seed the forest so the search is reproducible and comparable with the final
# model, which is built with random_state=0.
classifier = RandomForestClassifier(random_state=0)
grid_search = GridSearchCV(estimator=classifier, param_grid=parameters, scoring='accuracy', cv=10, n_jobs=-1, verbose=10)
grid_search = grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
print(grid_search.best_score_, grid_search.best_params_)

In [None]:
# Seeded random forest; n_estimators=47 presumably comes from the grid search
# above — confirm.
classifier = RandomForestClassifier(n_estimators = 47, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)
run_metrics(classifier, X_train, X_test, y_train, y_test)

### XGBoost

In [None]:
# XGBoost with hand-set hyperparameters (presumably the winners of the grid
# search further down — confirm).
classifier = XGBClassifier(learning_rate=0.21, max_depth=4, n_estimators=138)
classifier.fit(X_train, y_train)
run_metrics(classifier, X_train, X_test, y_train, y_test)

In [None]:
# Baseline XGBoost with library defaults. This rebinds `classifier`, so the
# cells that follow inspect this model rather than the tuned one above.
classifier = XGBClassifier()
classifier.fit(X_train, y_train)
run_metrics(classifier, X_train, X_test, y_train, y_test)

In [None]:
# Pair each feature column (identifiers and label sliced off, matching how X
# was built) with the importance reported by the current `classifier`.
importances = Df({'importance': classifier.feature_importances_, 'col_names': list(X_frame.columns[2:-1])})
# Displayed only: the sorted copy is not assigned back to `importances`.
importances.sort_values(by=['importance'], ascending=False)

#### Visualize Boosted Tree

In [None]:
# Draw a tree from the current `classifier` onto matplotlib's active figure.
plot_tree(classifier)
# Grab that figure and enlarge it before saving.
fig = plt.gcf()
fig.set_size_inches(40, 40)
# Written relative to the working directory; a later cell saves to the same name.
fig.savefig('tree.png')

### Feature Importances in Boosted Tree

In [None]:
# Same feature/importance pairing as the earlier `importances` cell, under
# different column names.
feature_importances = Df({'columns': X_frame.columns[2:-1], 'importances': classifier.feature_importances_})
# Displayed only: the sorted copy is not stored.
feature_importances.sort_values(by=['importances'], ascending=False)

#### Hyperparameter Tuning: XGBoost

In [None]:
# 21 learning rates stepping by 0.01 from 0.1 (values carry float rounding).
learning_rates = num_generator(0.1, 20, 0.01)
# 21 tree counts: 90, 93, ..., 150.
num_estimators = num_generator(90, 20, 3)
# Echo both grids as the cell output.
learning_rates, num_estimators

In [None]:
parameters = [
    {
        # Depths 1-4. range(5) also produced 0, which XGBoost does not treat
        # as a shallow tree: it means "no depth limit" and is rejected by the
        # exact tree method.
        'max_depth': list(range(1, 5)),
        'learning_rate': learning_rates,
        'n_estimators': num_estimators,
    }
]
classifier = XGBClassifier()
grid_search = GridSearchCV(estimator=classifier, param_grid=parameters, scoring='accuracy', cv=10, n_jobs=-1, verbose=10)
grid_search = grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
print(grid_search.best_score_, grid_search.best_params_)

### CAP Curve

In [None]:
# Unfitted models carrying the same hyperparameters as the tuned cells above;
# they are fitted later, when the CAP curves are built.
best_xgb = XGBClassifier(learning_rate=0.21, max_depth=4, n_estimators=138)
# probability=True is needed because the CAP curve ranks rows by predict_proba.
best_svm = SVC(kernel = 'rbf', C=2.1, gamma= 0.04, probability=True)
best_random_forest = RandomForestClassifier(n_estimators = 47, criterion = 'entropy', random_state = 0)

In [None]:
def get_model_for_classifier(classifier, name):
    """Fit ``classifier`` and return its CAP curve as a plotly line trace.

    The classifier is fitted on the notebook-level ``X_train`` / ``y_train``.
    Test rows are then ranked by predicted probability of class 1, highest
    first, and the running total of actual labels along that ranking is
    plotted against the number of rows examined.

    Args:
        classifier: estimator exposing ``fit`` and ``predict_proba``.
        name: legend label for the trace.

    Returns:
        ``go.Scatter`` with ``len(X_test) + 1`` points, starting at (0, 0).
    """
    classifier.fit(X_train, y_train)

    scored = Df(X_test)
    n_rows = len(scored)
    scored['actual'] = y_test
    # Column 1 of predict_proba is the probability of the positive class.
    scored['predicted'] = classifier.predict_proba(X_test)[:, 1]

    ranked_actuals = list(
        scored.sort_values(by=['predicted'], ascending=False).actual
    )
    # Prepend 0 so the curve starts at the origin.
    cumulative_hits = np.append([0], np.cumsum(ranked_actuals))
    return go.Scatter(
        x=np.arange(0, n_rows + 1),
        y=cumulative_hits,
        name=name,
        mode='lines',
    )


def get_model_random(labels=None):
    """Return the plotly line shape for the random-model CAP baseline.

    The baseline is the straight line from the origin to
    (number of rows, number of rows labelled 1).

    Args:
        labels: array-like of class labels. Defaults to the notebook-level
            ``y_test``.

    Returns:
        dict describing a dotted red plotly ``line`` shape.
    """
    if labels is None:
        labels = y_test
    labels = np.asarray(labels)
    # Count directly instead of indexing a {label: count} dict, which raised
    # KeyError when the split contained no class-1 rows.
    class_1_count = int(np.count_nonzero(labels == 1))
    return {
        "type": "line",
        "x0": 0,
        "x1": len(labels),
        "y0": 0,
        "y1": class_1_count,
        "line": {
            # RGB channels range over 0-255; 256 was out of range.
            "color": 'rgb(255, 0, 0)',
            "width": 4,
            "dash": "dot"
        }
    }


def get_model_perfect(labels=None):
    """Return the CAP curve of a perfect ranker as a plotly line trace.

    A perfect model finds one class-1 row per row examined until all of them
    are found, then stays flat: (0, 0) -> (k, k) -> (n, k), where k is the
    number of class-1 rows and n the number of rows.

    Args:
        labels: array-like of class labels. Defaults to the notebook-level
            ``y_test``.

    Returns:
        ``go.Scatter`` line trace named 'Perfect Model'.
    """
    if labels is None:
        labels = y_test
    labels = np.asarray(labels)
    # Count directly instead of indexing a {label: count} dict, which raised
    # KeyError when the split contained no class-1 rows.
    class_1_count = int(np.count_nonzero(labels == 1))
    return go.Scatter(
        x=[0, class_1_count, len(labels)],
        y=[0, class_1_count, class_1_count],
        mode='lines',
        name='Perfect Model'
    )

def do_plot():
    """Fit the three tuned models and plot their CAP curves with plotly.

    Draws one curve per tuned model plus the perfect-model curve, and overlays
    the random-model baseline as a layout shape. Building the curves fits
    ``best_xgb``, ``best_svm`` and ``best_random_forest`` as a side effect.
    """
    # No matplotlib figure is created here: the chart is rendered by plotly,
    # and the plt.figure(...) call that used to sit here only produced an
    # empty, unused figure.
    fig = {
        "data": [
            get_model_for_classifier(best_xgb, 'Tuned XGBoost'),
            get_model_for_classifier(best_svm, 'Tuned SVM'),
            get_model_for_classifier(best_random_forest, 'Tuned Random Forest'),
            get_model_perfect()
        ],
        "layout": go.Layout(
            title=go.layout.Title(text="CAP Curves"),
            xaxis=go.layout.XAxis(title=go.layout.xaxis.Title(text='Observations')),
            yaxis=go.layout.YAxis(title=go.layout.yaxis.Title(text='Misinformative Observations')),
            shapes=[get_model_random()]
        )
    }
    plot(fig, filename='CAP Curves of Various Models')
#     offline.plot(fig, image='png', image_filename='CAP Curves', output_type='file')
do_plot()

In [None]:
def plot_error_rates():
    """Plot a grouped bar chart of type 1 / type 2 error percentages per model.

    The percentages are hard-coded (presumably transcribed from earlier runs
    of the models — confirm before reusing the chart).
    """
    labels = ['Kernel SVM', 'Tuned SVM', 'XGB', 'Tuned XGB', 'Random Forest', 'Tuned Random Forest']
    false_positives = [6.1, 4.83, 0.7, 0.2, 0.5, 0.5]
    misinfo_missed = [32.8, 27.14, 14.1, 7.8, 12, 9.2]

    def _bar(values, name, fill):
        # One bar series: values double as the on-bar text, all bars share the
        # same outline colour and width.
        outline = dict(color='rgb(8,48,107)', width=1.5)
        return go.Bar(
            x=labels,
            y=values,
            text=values,
            textposition='auto',
            marker=dict(color=fill, line=outline),
            name=name,
        )

    layout = go.Layout(
        title=go.layout.Title(text="Error Types and Counts across Models"),
        xaxis=go.layout.XAxis(title=go.layout.xaxis.Title(text='Model Type')),
        yaxis=go.layout.YAxis(title=go.layout.yaxis.Title(text='Error %')),
    )
    bars = [
        _bar(false_positives, 'False Positives / Type 1 Error', 'rgb(158,202,225)'),
        _bar(misinfo_missed, 'False Negatives / Type 2 Error', 'rgb(58,200,225)'),
    ]
    fig = {'data': bars, 'layout': layout}
    plot(fig, filename='Error Rate Bar Chart')
#     offline.plot(fig, image='png', image_filename='error_rate_bar', output_type='file')
plot_error_rates()

In [None]:
# Draw a tree from the tuned XGBoost model; best_xgb is fitted when the CAP
# curves are built, so that cell has to run first.
plot_tree(best_xgb)
fig = plt.gcf()
fig.set_size_inches(40, 40)
# NOTE(review): same file name as the earlier tree cell, so this overwrites it.
fig.savefig('tree.png')

In [None]:
# Pair each feature column (identifiers and label sliced off) with the
# importance reported by the tuned XGBoost model.
feature_importances = Df({'columns': X_frame.columns[2:-1], 'importances': best_xgb.feature_importances_})
# `.head` without parentheses displayed the bound method rather than the
# table; call it so the top rows are shown.
feature_importances.sort_values(by=['importances'], ascending=False).head()