In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression

In [None]:
# Colab-only step: attach Google Drive so the dataset stored there can be
# read through the local filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Read the CSV from the mounted Drive and keep only the label column and the
# text column used for modelling.
DATA_PATH = '/content/drive/MyDrive/down_sample.csv'
df = pd.read_csv(DATA_PATH).loc[:, ['Subreddit', 'unst_simp_lem']]

In [None]:
# Quick look at the first five rows to confirm the two expected columns loaded.
df.head(n=5)

Unnamed: 0,Subreddit,unst_simp_lem
0,0,i hate feel get interested someone recently go...
1,0,how make friend someone suggest something make...
2,0,so today birthday 9 11 hoping good day well iv...
3,0,having hard time need someone say this dont wa...
4,0,anti depressant inducing trip like state exper...


In [None]:
# Target is the subreddit label; every remaining column is a feature.
y = df['Subreddit']
X = df.drop('Subreddit', axis='columns')

# Stratify on the label so both splits keep the same class proportions, and
# fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [None]:
# Turn the text column into TF-IDF features over uni-, bi- and tri-grams.
# Terms that occur in fewer than 5 training documents, or in more than 80% of
# them, are dropped. The vocabulary and IDF weights are learned from the
# training split only; the test split is transformed with that fitted state.
tvec = TfidfVectorizer(min_df=5, max_df=0.80, ngram_range=(1,3))

# fit_transform learns the vocabulary and vectorizes in a single pass instead
# of tokenizing the training text twice (fit, then transform).
train_matrix = tvec.fit_transform(X_train['unst_simp_lem'])
test_matrix = tvec.transform(X_test['unst_simp_lem'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on older installations.
if hasattr(tvec, 'get_feature_names_out'):
    feature_names = tvec.get_feature_names_out()
else:
    feature_names = tvec.get_feature_names()

# NOTE(review): densifying an n-gram matrix can be very memory hungry. The
# dense DataFrames are kept so that code relying on named columns still works;
# the sparse matrices above can be passed to scikit-learn directly if not.
X_train_tv = pd.DataFrame(train_matrix.toarray(), columns=feature_names)
X_test_tv = pd.DataFrame(test_matrix.toarray(), columns=feature_names)

In [None]:
# Base estimator for the grid search; penalty, C and solver are supplied by
# the grid below. random_state pins the data shuffling done by the liblinear
# solver so repeated runs of the search give the same scores. Solver-level
# verbosity is left at its default because it only emits one "[LibLinear]" tag
# per fit, which floods the cell output without carrying any information.
lr = LogisticRegression(random_state=42)

# Hyper-parameter grid. C is the inverse of the regularization strength, so
# each value is spelled as 1 / alpha to keep the underlying strength visible.
pipe_params = {
    'penalty': ['l1', 'l2'],
    'C': [1 / alpha for alpha in (0.10, 0.075, 0.05, 0.025)],
    'solver': ['liblinear'],  # liblinear handles both l1 and l2 penalties
}

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive search over the grid in `pipe_params`, scored with 5-fold
# cross-validation using the estimator's default scorer.
gs = GridSearchCV(
    estimator=lr,
    param_grid=pipe_params,
    cv=5,
)

In [None]:
# Fit every parameter combination on each cross-validation fold of the
# training features, then refit the best combination on all training rows.
gs.fit(X=X_train_tv, y=y_train)

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=1,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [10.0, 13.333333333333334, 20.0, 40.0],
                         'penalty': ['l1', 'l2'], 'solver': ['liblinear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [None]:
# Mean cross-validated score of the best parameter combination found by the search.
gs.best_score_

0.782

In [None]:
# Score of the refit best estimator on the data it was trained on; compare
# with the held-out score to gauge how much the model overfits.
gs.score(X=X_train_tv, y=y_train)

0.9917777777777778

In [None]:
# Score of the refit best estimator on the held-out test split.
gs.score(X=X_test_tv, y=y_test)

0.7861333333333334

In [None]:
# Parameter combination that achieved the best mean cross-validated score.
gs.best_params_

{'C': 10.0, 'penalty': 'l2', 'solver': 'liblinear'}

In [None]:
# Fresh base estimator for a second grid search; penalty, C and solver are
# supplied by the grid below. random_state pins the data shuffling done by the
# liblinear solver so repeated runs give the same scores. Solver-level
# verbosity is left at its default because it only emits one "[LibLinear]" tag
# per fit, which floods the cell output without carrying any information.
lr = LogisticRegression(random_state=42)

# Grid restricted to the l2 penalty, with C values from 10 down to 5. C is the
# inverse of the regularization strength, so each value is spelled as
# 1 / alpha to keep the underlying strength visible.
pipe_params = {
    'penalty': ['l2'],
    'C': [1 / alpha for alpha in (0.10, 0.13, 0.15, 0.18, 0.20)],
    'solver': ['liblinear'],
}

In [None]:
# Rebuild the search object so it picks up the current `lr` and `pipe_params`;
# still 5-fold cross-validation with the estimator's default scorer.
gs = GridSearchCV(
    estimator=lr,
    param_grid=pipe_params,
    cv=5,
)

In [None]:
# Run the search on the training features: each parameter combination is fit
# on every fold, then the best one is refit on all training rows.
gs.fit(X=X_train_tv, y=y_train)

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=1,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [10.0, 7.692307692307692, 6.666666666666667,
                               5.555555555555555, 5.0],
                         'penalty': ['l2'], 'solver': ['liblinear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [None]:
# Parameter combination that achieved the best mean cross-validated score.
gs.best_params_

{'C': 5.0, 'penalty': 'l2', 'solver': 'liblinear'}

In [None]:
# Mean cross-validated score of the best parameter combination found by the search.
gs.best_score_

0.7870666666666667

In [None]:
# Score of the refit best estimator on the data it was trained on; compare
# with the held-out score to gauge how much the model overfits.
gs.score(X=X_train_tv, y=y_train)

0.9689333333333333

In [None]:
# Score of the refit best estimator on the held-out test split.
gs.score(X=X_test_tv, y=y_test)

0.7925333333333333

In [None]:
# Rebind `lr` to a LogisticRegression with all-default settings (no grid
# search) -- presumably an untuned baseline for the tuned models.
lr = LogisticRegression()

In [None]:
# 5-fold cross-validation of `lr` on the training features; the cell displays
# the mean of the per-fold scores.
baseline_cv_scores = cross_val_score(estimator=lr, X=X_train_tv, y=y_train, cv=5)
baseline_cv_scores.mean()

0.7885777777777777