In [3]:
import streamlit

In [4]:
# Show the installed Streamlit version as this cell's output.
# NOTE(review): presumably recorded so the app environment can pin the same release — confirm.
streamlit.__version__

'0.80.0'

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

## Read in the data

In [3]:
# Load the comments CSV into a DataFrame
comment = pd.read_csv(filepath_or_buffer='data/comments_combo_clean.csv')

In [4]:
# Preview the first five rows to see which columns were loaded
comment.head(n=5)

Unnamed: 0.1,Unnamed: 0,body,subreddit
0,0,Yes,1
1,1,"Add me, u/BoxmanWTF",1
2,2,"Sooo, what’s that mean?",1
3,3,Do you smell popcorn? Everyone will always tak...,1
4,4,I don’t know why I even put my money into anyt...,1


In [5]:
# Remove the stray 'Unnamed: 0' column (presumably an index saved into the CSV).
# Reassigning instead of mutating in place, together with errors='ignore',
# keeps this cell safe to re-run: an in-place drop raises KeyError on a second
# execution because the column is already gone.
comment = comment.drop(columns='Unnamed: 0', errors='ignore')

In [6]:
# Preview the first five rows to confirm 'Unnamed: 0' is no longer a column
comment.head(n=5)

Unnamed: 0,body,subreddit
0,Yes,1
1,"Add me, u/BoxmanWTF",1
2,"Sooo, what’s that mean?",1
3,Do you smell popcorn? Everyone will always tak...,1
4,I don’t know why I even put my money into anyt...,1


In [7]:
# Features are the comment text ('body'); the target is the 'subreddit' column
X, y = comment['body'], comment['subreddit']

In [8]:
# train-test-split: hold out 30% of the rows, stratified on y so both splits
# keep the same class proportions. A fixed random_state makes the shuffle
# (and therefore the fitted model and the pickled artifact) reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=.3, random_state=42
)

## Logistic Regression Modeling

### CountVectorizer + LogReg

In [9]:
# Two-step pipeline: token counts from CountVectorizer feed a logistic regression
pipe = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('logreg', LogisticRegression()),
])

# Search grid, keyed as <step name>__<parameter>. Every list holds a single
# candidate, so the search evaluates exactly one configuration.
pipe_params = dict(
    cvec__stop_words=[['just', 'like', 'stocks', 'stock']],
    cvec__min_df=[20],
    cvec__max_df=[0.8],
    cvec__ngram_range=[(1, 1)],
    logreg__penalty=['l2'],
    logreg__C=[0.3],
    logreg__solver=['liblinear'],
)

# 5-fold cross-validated grid search over the pipeline
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=5)

In [11]:
# Run the cross-validated search on the training split
gs.fit(X=X_train, y=y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('logreg', LogisticRegression())]),
             param_grid={'cvec__max_df': [0.8], 'cvec__min_df': [20],
                         'cvec__ngram_range': [(1, 1)],
                         'cvec__stop_words': [['just', 'like', 'stocks',
                                               'stock']],
                         'logreg__C': [0.3], 'logreg__penalty': ['l2'],
                         'logreg__solver': ['liblinear']})

In [13]:
# https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/
# Serialize the fitted grid search to disk. The context manager guarantees the
# file is flushed and closed even if pickling fails; a bare open() leaves the
# handle dangling until garbage collection.
# NOTE: pickle files can execute arbitrary code when loaded — only unpickle
# this artifact from a trusted location.
reddit_pipe = 'finalized_model.sav'
with open(reddit_pipe, 'wb') as model_file:
    pickle.dump(gs, model_file)