#### **Imports**

In [1]:
import pandas as pd
import numpy as np
import requests
import time
import nltk
import regex as re
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from nltk.corpus import stopwords
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text

#### **Read in dataframe saved as .csv file**

In [2]:
# Load the saved comments and discard the leftover 'Unnamed: 0' column
# that came along with the CSV.
df = pd.read_csv('comments.csv').drop(columns='Unnamed: 0')
df.head()

Unnamed: 0,author,body,score,created_utc,subreddit
0,RobusEtCeleritas,None of that means anything You clearly have ...,3,1627142891,1
1,hydroxypcp,The legal limit varies among countries It s c...,0,1627140964,1
2,AnthillOmbudsman,But magma chambers aren t hollow caverns I m...,1,1627140360,1
3,AWormDude,According to this study https www research...,5,1627139388,1
4,Trypanosoma_,Look up TH cellular response vs TH humora...,-4,1627136260,1


##### **X, y and train/test split**

In [None]:
# Clean the comment text, then build the train/test split.
#
# Each comment is lower-cased, split on whitespace, stripped of NLTK English
# stop words and re-joined with single spaces.
# NOTE: stopwords.words() needs the NLTK 'stopwords' corpus to be available
# locally.
stops = set(stopwords.words('english'))

# Empty fields are read back from CSV as NaN (a float), which has no
# .lower(); treat missing bodies as empty strings instead of crashing.
posts_list = [
    " ".join(word for word in post.lower().split() if word not in stops)
    for post in df['body'].fillna('').astype(str)
]

# Reuse df's index so the assignment lines up row-for-row even when the
# index is not the default 0..n-1 range.
modeltext = pd.Series(posts_list, index=df.index)
df['body'] = modeltext

X = df['body']
y = df['subreddit']

# 75/25 split, stratified on the target so both parts keep the same class
# balance; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    stratify=y,
    random_state=42,
)

##### **Baseline Accuracy**

In [None]:
# Baseline: relative frequency of each class in the target. The largest share
# is the accuracy obtained by always predicting the most common class.
y.value_counts(normalize=True)

##### **Pipeline, CountVectorizer, Logistic Regression, Grid Search**

In [None]:
%%time
pipe_cv = Pipeline([
    ('cv', CountVectorizer(stop_words='english', max_features=10000)),
    ('logreg', LogisticRegression())
]
)
params_cv = {
    'cv__max_features':[1_000, 4_000, 5_000],
    'cv__min_df'      : [2, 3, 5],
    'cv__max_df'      : [.90, .95],
    'cv__ngram_range' : [(1,1), (1,2)],
    'logreg__C'       : [.01, 1.0],
    'logreg__penalty' : ['l1', 'l2'],
    'logreg__solver'  : ['lbfgs', 'liblinear']
}

gs_cv = GridSearchCV(pipe_cv, params_cv, cv=5, verbose=1, n_jobs = -1)
gs_cv.fit(X_train, y_train)
print(gs_cv.best_params_)
print(gs_cv.score(X_train, y_train))
print(gs_cv.score(X_test, y_test))

##### **Test Predictions and Accuracy Score**

In [None]:
# Label the held-out comments with the tuned CountVectorizer model and report
# how often those labels match the true ones.
predictions = gs_cv.predict(X_test)
print(
    f'Accuracy Score: {accuracy_score(y_true=y_test, y_pred=predictions)}'
)

###

#### **TF-IDF Vectorizer**

In [None]:
%%time
pipe_tv = Pipeline([
    ('tv', TfidfVectorizer(stop_words='english')),
    ('logreg', LogisticRegression())
]
)
params_tv = {
    'tv__max_features':[4000, 5000, 10000],
    'tv__min_df'      : [1, 2, 5, 10],
    'tv__max_df'      : [.95, .98],
    'tv__ngram_range' : [(1,1), (1,2)]
}

gs_tv = GridSearchCV(pipe_tv, params_tv, cv=5, verbose=1, n_jobs = -1)
gs_tv.fit(X_train, y_train)
print(gs_tv.best_params_)
print(gs_tv.score(X_train, y_train))
print(gs_tv.score(X_test, y_test))