In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression

In [2]:
# Fixed seed for the train/test split. Without it every Restart & Run All
# draws a different partition, so the scores and the exported predictions
# produced further down could not be reproduced.
RANDOM_STATE = 42

# Keep only the two columns this notebook uses: the text and the label.
df = pd.read_csv('../datasets/main_final.csv')[['comp_stem', 'subreddit']].reset_index(drop=True)

X = df['comp_stem']   # text column that is fed to the vectorizer
y = df['subreddit']   # classification target

# Stratify on the label so both partitions keep the class proportions of `y`.
# The test size is left at scikit-learn's default.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=RANDOM_STATE
)

In [3]:
# Build TF-IDF features over unigrams and bigrams, ignoring terms that occur
# in fewer than 3 documents or in more than 90% of them. The vocabulary and
# IDF weights are learned from the training split only; the test split is
# merely transformed, so it cannot leak into the features.
tvec = TfidfVectorizer(min_df = 3, max_df = 0.9, ngram_range = (1,2))
train_matrix = tvec.fit_transform(X_train)
test_matrix = tvec.transform(X_test)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back only on older installations.
if hasattr(tvec, 'get_feature_names_out'):
    feature_names = tvec.get_feature_names_out()
else:
    feature_names = tvec.get_feature_names()

# `.toarray()` gives a plain ndarray, whereas `.todense()` returned the
# discouraged `np.matrix` type. NOTE: densifying a TF-IDF matrix is
# memory-hungry; the result is kept as a dense DataFrame only so that later
# cells receive the same kind of object as before.
X_train_tv = pd.DataFrame(train_matrix.toarray(), columns = feature_names)
X_test_tv = pd.DataFrame(test_matrix.toarray(), columns = feature_names)

In [4]:
# Base estimator, built with default settings; the grid below supplies the
# hyper-parameter values that the search will try.
logreg = LogisticRegression()

# Search grid with a single candidate: L2 penalty, C = 3, liblinear solver.
pipe_params = dict(
    penalty=['l2'],
    C=[3],
    solver=['liblinear'],
)

In [5]:
# 5-fold cross-validated search over `pipe_params`. No `scoring` argument is
# given, so the estimator's default scorer is used.
gs = GridSearchCV(
    estimator=logreg,
    param_grid=pipe_params,
    cv=5,
)

In [7]:
# Run the cross-validated search on the training features and labels.
gs.fit(X=X_train_tv, y=y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [3], 'penalty': ['l2'], 'solver': ['liblinear']})

In [8]:
# Mean cross-validated score of the best parameter combination found by the search.
gs.best_score_

0.838835399391993

In [10]:
# Score on the data the search was fitted on (training split).
gs.score(X=X_train_tv, y=y_train)

0.9755206025697829

In [11]:
# Score on the held-out test split, which was not used for fitting.
gs.score(X=X_test_tv, y=y_test)

0.8551495016611296

In [12]:
# Start the results table from the true test labels (one column, same index as `y_test`).
new_df = y_test.to_frame()

In [13]:
# Predict a label for every test row and store it next to the true label.
test_predictions = gs.predict(X=X_test_tv)
new_df['predictions'] = test_predictions

In [14]:
# Write the true labels and predictions to disk; the index is not written.
new_df.to_csv(
    path_or_buf='../datasets/logreg.csv',
    index=False,
)