#### **Imports**

In [31]:
import pandas as pd
import numpy as np
import requests
import time
import nltk
import regex as re
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from nltk.corpus import stopwords
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text

##### **Read in .csv file**

In [32]:
# read in dataframe saved as .csv file
df = pd.read_csv('posts.csv')
# drop first column
df.drop(columns='Unnamed: 0', inplace=True)

In [33]:
df.head()

Unnamed: 0,title,author,created_utc,score,subreddit,text
0,Why do we vomit when we are too hot?,childloser,1627143368,1,1,Why do we vomit when we are too hot
1,How much CO2 is released by manufacturing a ki...,banmeyoucoward,1627143292,1,1,How much CO is released by manufacturing a ki...
2,Is the level of AI portrayed in movies (fully ...,SchoolThrow123,1627143240,1,1,Is the level of AI portrayed in movies fully ...
3,Why do some animals have moustache and whiskers,notowork,1627143114,1,1,Why do some animals have moustache and whiskers
4,What is the maximal theoretical size for an an...,Vantaie,1627142862,1,1,What is the maximal theoretical size for an an...


In [34]:
X= df['text']
y= df['subreddit']

X = [word.lower().split() for word in df['text']]
stops = set(stopwords.words('english'))
posts_list = []
for post in X:
    meaningful_words = [word for word in post if word not in stops]
    meaningful_post = " ".join(meaningful_words)
    posts_list.append(meaningful_post)
    
modeltext = pd.Series(posts_list)
df['modeltext'] = modeltext
X = df['modeltext']
y = df['subreddit']
X_train, X_test, y_train, y_test = train_test_split(X,
                                   y,
                                   train_size=0.75,
                                   stratify=y,
                                   random_state=42)

# cv = CountVectorizer()
# cv.fit(X_train)

# X_train_cv = cv.transform(X_train)
# X_test_cv = cv.transform(X_test)

##### **Baseline Accuracy**

In [35]:
y.value_counts(normalize=True)

1    0.655863
0    0.344137
Name: subreddit, dtype: float64

In [36]:
feature_names = cv.get_feature_names()

In [37]:
# text_transformed=pd.DataFrame(text_transformed.todense())
# text_transformed.columns = feature_names
# (text_transformed.sum().sort_values(ascending = False).head(15))
# text_transformed.sum().sort_values(ascending = False).head(15).plot(kind='barh');

In [40]:
pipe_cv = Pipeline([
    ('cv', CountVectorizer(stop_words='english', max_features=10000)),
    ('logreg', LogisticRegression())
]
)
params_cv = {
    'cv__max_features':[1_000, 4_000, 5_000],
    'cv__min_df'      : [2, 3, 5],
    'cv__max_df'      : [.90, .95],
    'cv__ngram_range' : [(1,1), (1,2)],
    'logreg__C'       : [.01, 1.0],
    'logreg__penalty' : ['l1', 'l2'],
    'logreg__solver'  : ['lbfgs', 'liblinear']
}

gs_cv = GridSearchCV(pipe_cv, params_cv, cv=5, verbose=1, n_jobs = -1)
gs_cv.fit(X_train, y_train)
print(gs_cv.best_params_)
print(gs_cv.score(X_train, y_train))
print(gs_cv.score(X_test, y_test))

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


 0.92719977 0.92722455        nan 0.83209866 0.87526329 0.87692346
        nan 0.92804223 0.92655551 0.92648118        nan 0.83212344
 0.87355352 0.87518893        nan 0.92836436 0.92732366 0.92732366
        nan 0.83209866 0.87531285 0.8768739         nan 0.92799267
 0.92672896 0.92672896        nan 0.83209866 0.87347919 0.8751146
        nan 0.92838914 0.92712543 0.92715021        nan 0.83209866
 0.87538718 0.87689868        nan 0.92811656 0.92650597 0.92645641
        nan 0.83212344 0.87672521 0.87870749        nan 0.93721042
 0.93743343 0.93745821        nan 0.83209866 0.87959955 0.88091282
        nan 0.93706174 0.93738383 0.93733427        nan 0.83209866
 0.87677477 0.87873227        nan 0.93728475 0.9374582  0.93743342
        nan 0.83209866 0.87957477 0.88091282        nan 0.9371113
 0.93745816 0.93750772        nan 0.83209866 0.87672521 0.87868271
        nan 0.93725998 0.93740865 0.93745821        nan 0.83212344
 0.87957477 0.8809376         nan 0.93703695 0.93730949 0.937309

{'cv__max_df': 0.9, 'cv__max_features': 5000, 'cv__min_df': 5, 'cv__ngram_range': (1, 1), 'logreg__C': 1.0, 'logreg__penalty': 'l2', 'logreg__solver': 'liblinear'}
0.9613945536090394
0.9434326915929533


In [41]:
predictions = gs_cv.predict(X_test)
print(f'Accuracy Score: {accuracy_score(y_test, predictions)}')

Accuracy Score: 0.9434326915929533


In [146]:
%%time
pipe_tv = Pipeline([
    ('tv', TfidfVectorizer(stop_words='english')),
    ('logreg', LogisticRegression())
]
)
params_tv = {
    'tv__max_features':[4000, 5000, 10000],
    'tv__min_df'      : [1, 2, 5, 10],
    'tv__max_df'      : [.95, .98],
    'tv__ngram_range' : [(1,1), (1,2)]
}

gs_tv = GridSearchCV(pipe_tv, params_tv, cv=5, verbose=1, n_jobs = -1)
gs_tv.fit(X_train, y_train)
print(gs_tv.best_params_)
print(gs_tv.score(X_train, y_train))
print(gs_tv.score(X_test, y_test))

Fitting 5 folds for each of 108 candidates, totalling 540 fits
{'logreg__C': 1.0, 'tvec__max_df': 0.95, 'tvec__max_features': 10000, 'tvec__min_df': 2, 'tvec__ngram_range': (1, 2)}
0.9533414277572664
0.9407567085408459
Wall time: 5min 44s


In [147]:
%%time
preds_tv = gs_tv.predict(X_test)
print(f'Accuracy score: {accuracy_score(y_test, preds_tv)}')

Accuracy score: 0.9407567085408459
Wall time: 381 ms


In [145]:
confusion_matrix(y_test, preds)
cmatrix = ConfusionMatrixDisplay(confusion_matrix)

In [None]:
pipe