#### **Imports**

In [1]:
import pandas as pd
import numpy as np
import requests
import time
import nltk
import regex as re
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV
from nltk.corpus import stopwords
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier, BaggingClassifier
import warnings 
warnings.filterwarnings("ignore")
# skopt imports
from skopt.space import Integer, Real, Categorical
from skopt import BayesSearchCV

# distributions we'll need
from scipy.stats import uniform, loguniform

#### **Read in .csv file & drop first column**

In [3]:
# Load the comments file and discard its 'Unnamed: 0' column in one
# chained expression, so the cell builds `df` without in-place mutation.
df = pd.read_csv('comments.csv').drop(columns='Unnamed: 0')

# Preview the first rows.
df.head()

Unnamed: 0,author,body,score,created_utc,subreddit
0,RobusEtCeleritas,None of that means anything You clearly have ...,3,1627142891,1
1,hydroxypcp,The legal limit varies among countries It s c...,0,1627140964,1
2,AnthillOmbudsman,But magma chambers aren t hollow caverns I m...,1,1627140360,1
3,AWormDude,According to this study https www research...,5,1627139388,1
4,Trypanosoma_,Look up TH cellular response vs TH humora...,-4,1627136260,1


#### **X, y, train_test_split**

In [4]:
# NLTK's English stop-word list, held in a set for fast membership tests.
stops = set(stopwords.words('english'))

# Lower-case every comment, split it on whitespace, drop stop words and
# re-join the remaining tokens with single spaces.
# fillna('') keeps a missing body (pandas reads an empty CSV field as NaN)
# from raising AttributeError on .lower(); such rows become empty strings.
posts_list = [
    " ".join(word for word in post.lower().split() if word not in stops)
    for post in df['body'].fillna('')
]

# Build the cleaned column on df's own index: column assignment aligns on
# index labels, so a default RangeIndex would misplace rows if df were ever
# filtered or re-indexed before this cell.
modeltext = pd.Series(posts_list, index=df.index)
df['body'] = modeltext

# Features are the cleaned comment text; the target is the subreddit label.
X = df['body']
y = df['subreddit']

# 75/25 split, stratified on the target so both parts keep the same class
# proportions; the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=0.75,
                                                    stratify=y,
                                                    random_state=42)

#### **Baseline Accuracy**

In [5]:
# Share of each class in the target; the larger share is the accuracy
# obtained by always predicting the majority class.
y.value_counts(normalize=True)

0    0.699791
1    0.300209
Name: subreddit, dtype: float64

#### **CountVectorizer SVC**

In [None]:
%%time
# Token counts (with scikit-learn's English stop-word list) feeding a
# support vector classifier.
pipe_svc = Pipeline([
    ('cv', CountVectorizer(stop_words='english')),
    ('svc', SVC(random_state = 42))
])

# Search space for the randomized search: lists are sampled uniformly,
# scipy.stats distributions are sampled through their rvs() method.
params_svc = {
    # Vectorizer options currently excluded from the search:
#     'cv__max_features':[2_000, 5_000],
#     'cv__min_df'        : [2, 3],
#     'cv__max_df'        : [.90, .95],
#     'cv__ngram_range'   : [(1,1)],
    'svc__C': loguniform(1e-5,1e+2),
    'svc__kernel': ['poly','rbf'],
    'svc__gamma': ['scale','auto'],
    # SVC requires an integer degree; np.linspace(2,10,9) yields floats
    # (2.0 ... 10.0), which scikit-learn's parameter validation rejects.
    'svc__degree': list(range(2, 11)),
    'svc__coef0': uniform(0,1),
    'svc__shrinking': [True, False]
}

# random_state fixes the sampled candidates so that the reported best
# parameters and scores can be reproduced on a re-run.
rs_svc = RandomizedSearchCV(estimator = pipe_svc,
                     param_distributions = params_svc,
                     scoring = 'f1_weighted',
                     n_iter = 2000,
                     n_jobs = -2,
                     cv = 5,
                     verbose = 1,
                     random_state = 42)

rs_svc.fit(X_train, y_train)
print(rs_svc.best_params_)
# score() uses the search's scoring metric (weighted F1), not accuracy.
print(rs_svc.score(X_train, y_train))
print(rs_svc.score(X_test, y_test))

In [None]:
# Predict the held-out comments with the refit best pipeline.
preds_rs_svc = rs_svc.predict(X_test)

# Mean cross-validated score of the best candidate (search scoring metric).
cv_score_rs_svc = rs_svc.best_score_
print(f'Best Score: {cv_score_rs_svc}')

# Plain accuracy on the test split.
acc_rs_svc = accuracy_score(y_test, preds_rs_svc)
print(f'Accuracy Score: {acc_rs_svc}')

#### **TfidfVectorizer SVC**

In [None]:
%%time
# TF-IDF weighted tokens (with scikit-learn's English stop-word list)
# feeding a support vector classifier. The SVC seed matches the one used
# by the CountVectorizer pipeline.
pipe_tvsvc = Pipeline([
    ('tv', TfidfVectorizer(stop_words='english')),
    ('svc', SVC(random_state = 42))
])

# Search space for the randomized search: lists are sampled uniformly,
# scipy.stats distributions are sampled through their rvs() method.
params_tvsvc = {
    'tv__max_features':[4000, 5000, 7000],
    'tv__min_df'      : [2, 3],
    'tv__max_df'      : [.85, .90],
    'tv__ngram_range' : [(1,1)],
    'svc__C': loguniform(1e-5,1e+2),
    'svc__kernel': ['poly','rbf'],
    'svc__gamma': ['scale','auto'],
    # SVC requires an integer degree; np.linspace(2,10,9) yields floats
    # (2.0 ... 10.0), which scikit-learn's parameter validation rejects.
    'svc__degree': list(range(2, 11)),
    'svc__coef0': uniform(0,1),
    'svc__shrinking': [True, False],
}

# random_state fixes the sampled candidates so that the reported best
# parameters and scores can be reproduced on a re-run.
rs_tvsvc = RandomizedSearchCV(estimator = pipe_tvsvc,
                     param_distributions = params_tvsvc,
                     scoring = 'f1_weighted',
                     n_iter = 2000,
                     n_jobs = -2,
                     cv = 5,
                     verbose = 1,
                     random_state = 42)

rs_tvsvc.fit(X_train, y_train)
print(rs_tvsvc.best_params_)
# score() uses the search's scoring metric (weighted F1), not accuracy.
print(rs_tvsvc.score(X_train, y_train))
print(rs_tvsvc.score(X_test, y_test))

In [None]:
# Predict the held-out comments with the refit best TF-IDF pipeline.
preds_rs_tvsvc = rs_tvsvc.predict(X_test)

# Mean cross-validated score of the best candidate (search scoring metric).
cv_score_rs_tvsvc = rs_tvsvc.best_score_
print(f'Best Score: {cv_score_rs_tvsvc}')

# Plain accuracy on the test split.
acc_rs_tvsvc = accuracy_score(y_test, preds_rs_tvsvc)
print(f'Accuracy Score: {acc_rs_tvsvc}')