In this notebook I will try out 3 different classification models (logisitic regression, k-nearest neighbors, and a random forest) on several different types of tokenized matrices (tfidf tokenized and svd) in the attempt to create model that can accurately classify which subreddit a post came frome. If I can achieve this, then this will imply that different discussion are occuring on the Democrat and Republican subreddits.

In [71]:
import requests
import json
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.decomposition import TruncatedSVD
from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.feature_selection import SelectKBest, VarianceThreshold, f_regression
from sklearn.ensemble import RandomForestClassifier
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pickle


# Reading in my data and filling null values

In [72]:
reddit = pd.read_csv('../Data/reddit.csv')
X_train = pd.read_csv('../Data/X_train.csv', header = None)
X_test = pd.read_csv('../Data/X_test.csv', header = None)
y_train = pd.read_csv('../Data/y_train.csv', header = None)
y_test = pd.read_csv('../Data/y_test.csv', header = None)
reddit.drop(columns = 'Unnamed: 0', axis = 1, inplace=True)
reddit.fillna('', inplace=True)

# Using TFIDF to create features for my term matrix


In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import stop_words
custom_stopwords = list(stop_words.ENGLISH_STOP_WORDS)
custom_stopwords.extend(['10','000','https', 'com'])

tfidf = TfidfVectorizer(stop_words=custom_stopwords, min_df=5, max_df=1.0)
term_mat = tfidf.fit_transform(reddit['text_title'])

### Creating a sparse data frame from my term matrix

In [74]:
term_df = pd.SparseDataFrame(term_mat, columns=tfidf.get_feature_names())
term_df.fillna(0, inplace=True)
term_df.head()

Unnamed: 0,12,13,15,18,200,2014,2016,2017,2018,2020,...,working,world,wrong,www,year,years,york,young,youtu,youtube
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


This TFIDF matrix will be used later on to create an SVD matrix

# Logistic Regression

### Creating a pipeline and running a gridsearch to fine tune my model's hyperparameters

In [75]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=custom_stopwords)),
    ('logreg', LogisticRegression())
])

In [76]:
params = {
    'tfidf__min_df': [1,3,5,7,10],
    'tfidf__max_df': [.85,.9,.95,1.0],
    'tfidf__norm': ['l1','l2'],
    'logreg__C': [1,2,4,8,(1/10),(1/5)],
    'logreg__penalty': ['l1', 'l2']
}

In [77]:
gs = GridSearchCV(pipe, params)

In [78]:
gs.fit(X_train[1], y_train[1])

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'tfidf__min_df': [1, 3, 5, 7, 10], 'tfidf__max_df': [0.85, 0.9, 0.95, 1.0], 'tfidf__norm': ['l1', 'l2'], 'logreg__C': [1, 2, 4, 8, 0.1, 0.2], 'logreg__penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

### Scoring my model

In [79]:
gs.score(X_train[1],y_train[1])

0.7183098591549296

In [80]:
gs.score(X_test[1],y_test[1])

0.6591549295774648

### Looking at my best parameters from my grid search

In [81]:
gs.best_params_

{'logreg__C': 1,
 'logreg__penalty': 'l1',
 'tfidf__max_df': 0.85,
 'tfidf__min_df': 5,
 'tfidf__norm': 'l2'}

# K Nearest Neighbors on an SVD data frame

### Creating a SVD data frame

Creating an SVD data frame will reduce the number of features we have and potentially help us with the problem of overfitting

In [82]:
SVD = TruncatedSVD(n_components=100)
svd_matrix = SVD.fit_transform(term_df)
svd_matrix.shape

(1420, 100)

In [83]:
component_names = ["component_"+str(i+1) for i in range(100)]
svd_df = pd.DataFrame(svd_matrix,
                      columns=component_names)
svd_df.head()

Unnamed: 0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10,...,component_91,component_92,component_93,component_94,component_95,component_96,component_97,component_98,component_99,component_100
0,0.043018,0.033061,-0.036783,0.01296,9.1e-05,-0.083168,-0.049686,-0.13394,0.073176,-0.126207,...,-0.02035,-0.002591,-0.009475,0.037951,-0.006155,-0.026287,-0.005525,0.037521,0.04071,0.011197
1,0.054018,0.063484,-0.058336,0.018481,-0.050631,-0.188665,-0.039902,-0.216124,0.254025,0.022839,...,-0.033701,-0.080035,0.006454,0.024757,-0.019388,-0.02012,0.075663,-0.058197,-0.014386,-0.032351
2,0.037082,0.026774,-0.042797,0.002255,0.025214,-0.038031,-0.003047,-0.031455,0.023449,0.030475,...,0.039686,-0.008353,-0.022161,-0.074135,-0.012417,0.056872,0.052923,-0.051099,0.042788,-0.078246
3,0.033363,0.021766,-0.02455,-0.005295,-0.022273,-0.038249,-0.027995,-0.015521,-0.051013,0.079409,...,-0.027051,0.024252,0.116847,-0.047493,-0.093877,0.03615,-0.03469,-0.032609,-0.015141,0.044326
4,0.206326,-0.064116,0.025296,0.033071,-0.037846,0.00899,-0.020488,-0.024805,-0.012017,-0.027768,...,-0.048827,0.079648,-0.083823,-0.064862,0.004644,-0.062738,-0.090752,0.078274,0.012607,-0.067073


### Creating a loadings dataframe

In [84]:
loadings = pd.DataFrame(SVD.components_,
                        index=component_names,
                        columns=term_df.columns).T
loadings.head()

Unnamed: 0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10,...,component_91,component_92,component_93,component_94,component_95,component_96,component_97,component_98,component_99,component_100
12,0.003983,0.005542,-0.006467,-0.001924,-0.005483,-0.012465,-0.003019,-0.008902,-0.00578,-0.000902,...,-0.015131,-0.019883,-0.035503,-0.006753,0.000861,0.008505,-0.022801,0.022758,-0.018737,-0.013195
13,0.006718,0.007933,-0.014743,0.000964,0.009692,-0.015561,0.007353,-0.016042,-0.007856,-0.012277,...,-0.020594,-0.001662,-0.012559,0.00463,-0.002678,-0.001203,0.034939,-0.019291,-0.021779,-0.009408
15,0.006876,0.01894,0.004958,-0.001236,-0.002661,-0.010319,0.002397,-0.002006,-0.004766,0.017611,...,-0.00333,-0.030404,-0.036797,0.035783,0.004531,-0.016749,-0.03826,0.041411,0.002083,0.024454
18,0.007043,0.003536,-0.005557,0.004425,-0.002311,-0.009057,-0.002168,0.005133,-0.001145,-0.001496,...,-0.009908,0.008793,-0.007724,0.026158,-0.00965,-0.009395,0.001891,0.014803,-0.007107,-0.008969
200,0.005243,0.00187,-0.004289,-0.0002,-0.003835,-0.004975,-0.005636,-0.007253,0.004634,0.011727,...,-0.018171,-0.01884,0.001201,0.009986,-0.02315,0.008273,-0.009317,-0.009485,0.007634,0.003619


### Examining the first components

In [85]:
loadings['abs_component_1'] = np.abs(loadings.component_1)
loadings['abs_component_2'] = np.abs(loadings.component_2)

In [86]:
loadings.sort_values('abs_component_1',ascending=False).head(20)[['component_1']]

Unnamed: 0,component_1
trump,0.813781
democrats,0.216316
president,0.194949
obama,0.139298
donald,0.13624
like,0.098331
supporters,0.076813
party,0.075294
cnn,0.072661
new,0.071953


### Train test split our SVD data frame

In [87]:
y = reddit['subreddit']

In [88]:
X_train_svd, X_test_svd, y_train_svd, y_test_svd = train_test_split(svd_df, y, stratify=y)

### Creating a pipeline and running a grid search to fine tune my model's hyperparemeters

In [89]:
pipe = Pipeline([
    ('knn', KNeighborsClassifier())
])

In [90]:
params = {
    'knn__n_neighbors': [5, 10, 20, 25, 30,40,50],
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['minkowski', 'euclidean', 'manhattan']
}

In [91]:
gs = GridSearchCV(pipe, params)

In [92]:
gs.fit(X_train_svd,y_train_svd)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'knn__n_neighbors': [5, 10, 20, 25, 30, 40, 50], 'knn__weights': ['uniform', 'distance'], 'knn__metric': ['minkowski', 'euclidean', 'manhattan']},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

### Scoring my model

In [93]:
gs.score(X_train_svd, y_train_svd)

0.7511737089201878

In [94]:
gs.score(X_test_svd, y_test_svd)

0.6112676056338028

### Looking at the best parameters from my grid search

In [95]:
gs.best_params_

{'knn__metric': 'minkowski', 'knn__n_neighbors': 5, 'knn__weights': 'uniform'}

# Random Forest with Sentiment Analysis

Let's see if including features for sentiment help our model's accuracey in predicing the subreddits

### Instantiating my sentiment analyzer

In [96]:
sid = SentimentIntensityAnalyzer()

### Adding columns for sentiment to my data frame

In [97]:
reddit['positive'] = reddit['text_title'].map(lambda x: sid.polarity_scores(x)['pos'])

In [98]:
reddit['negative'] = reddit['text_title'].map(lambda x: sid.polarity_scores(x)['neg'])

In [99]:
reddit['neutral'] = reddit['text_title'].map(lambda x: sid.polarity_scores(x)['neu'])

In [100]:
reddit.head()

Unnamed: 0,name,text,title,subreddit,text_title,positive,negative,neutral
0,t3_9abkdq,,"John McCain, War Hero, Senator, Presidential C...",1,"John McCain, War Hero, Senator, Presidential C...",0.319,0.244,0.438
1,t3_9b99kz,,Martha McSally wins GOP nomination for Arizona...,1,Martha McSally wins GOP nomination for Arizona...,0.291,0.0,0.709
2,t3_9bcm5x,,WV State Police seize $10K from couple without...,1,WV State Police seize $10K from couple without...,0.0,0.241,0.759
3,t3_9b0480,,Student Who Snatched Classmate’s MAGA Hat Faci...,1,Student Who Snatched Classmate’s MAGA Hat Faci...,0.0,0.123,0.877
4,t3_9baibi,,Socialism vs. Trump after Massive Turnout in F...,1,Socialism vs. Trump after Massive Turnout in F...,0.0,0.0,1.0


### Adding sentiment columns to my SVD data frame

In [101]:
svd_df['positive'] = reddit['positive']

In [102]:
svd_df['negative'] = reddit['negative']

In [103]:
svd_df['neutral'] = reddit['neutral']

In [104]:
y = reddit['subreddit']

### Train, test split my new SVD data frame

In [105]:
X_train_sen, X_test_sen, y_train_sen, y_test_sen = train_test_split(svd_df, y, stratify=y)

### Setting parameters for my grid search

After extensive hyperparamter tuning, these are the best ranges for paremeters I've found

In [106]:
params = {
    'n_estimators': [70,80,90,100,110],
    'max_features': ['auto', 'log2', 'sqrt'],
    'min_samples_split': [4,5,6,7],
    'min_samples_leaf': [1,2,3,4,5]
    
}

### Running a gridsearch to find the best hyperparameters for my model

In [107]:
gs = GridSearchCV(estimator = RandomForestClassifier(), param_grid = params)

In [108]:
gs.fit(X_train_sen, y_train_sen)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [70, 80, 90, 100, 110], 'max_features': ['auto', 'log2', 'sqrt'], 'min_samples_split': [4, 5, 6, 7], 'min_samples_leaf': [1, 2, 3, 4, 5]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

### Scoring my model

In [109]:
gs.score(X_train_sen, y_train_sen)

0.9755868544600939

In [110]:
gs.score(X_test_sen, y_test_sen)

0.6

### Looking at the best parameters from my grid search

In [111]:
gs.best_params_

{'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 6,
 'n_estimators': 100}

### Ranking components by their importance

In [112]:
pd.DataFrame(gs.best_estimator_.feature_importances_, svd_df.columns).sort_values(0, ascending=False).head()

Unnamed: 0,0
component_3,0.03731
component_6,0.026992
component_4,0.025783
component_5,0.02388
component_75,0.016018


### Examining my most important component

In [113]:
loadings['abs_component_3'] = np.abs(loadings.component_3)

In [114]:
loadings.sort_values('abs_component_3',ascending=False).head(20)[['component_3']]

Unnamed: 0,component_3
democrats,0.647597
obama,-0.276183
clinton,-0.245928
hillary,-0.242241
party,-0.226923
democratic,-0.192489
new,-0.176672
president,-0.174064
democrat,-0.155903
trump,0.149611


# Conclusion

To reiterate, here are the three scores for my models: 

Logistic Regression: $65.9\%$

K-nearest neighbors: $61.1\%$

Random forest: $60.0\%$


None of my models achieved an accuracy above $65%$, implying that democrats and republicans discuss similar topics and with similar sentiments. My best model turned out to be a simple logistic regression model on a TFIDF matrix, although this model performed only marginally better than my other two models. 