# Modeling

In [205]:
#imports 

# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier,BaggingClassifier, VotingClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.compose import ColumnTransformer

import seaborn as sns


from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import re



In [156]:
df = pd.read_csv("../data/cleaned_df.csv")
df = df.drop(columns="Unnamed: 0")
df.head()



Unnamed: 0,post_id,selftext,subreddit,is_osr
0,1irnln3,HeroQuest is the perfect entry into OSR DND. ...,osr,1
1,1f3scds,"To be clear, it was a lot of work before the g...",osr,1
2,1di0qn6,"So, I know there was a thread discussing peopl...",osr,1
3,1g5ga0h,Really loving the booklet layout. Open up char...,osr,1
4,1grfhij,In this video I discuss why I consider Castles...,osr,1


In [157]:
sa = SentimentIntensityAnalyzer()

sentiment_data = [sa.polarity_scores(text) for text in df["selftext"]]
sentiment_df = pd.DataFrame.from_dict(sentiment_data)
df = pd.concat([df, sentiment_df],axis=1)
df.head()



ps = PorterStemmer()
df["selftext"] = df["selftext"].map(lambda entire_selftext:' '.join([ps.stem(re.sub(r'[^a-zA-Z\s]', '', token)) for token in entire_selftext.split()]))

df["text_len"] = df["selftext"].str.len()
df["word_count"] = df["selftext"].str.split().apply(len)

In [158]:
df.head()

Unnamed: 0,post_id,selftext,subreddit,is_osr,neg,neu,pos,compound,text_len,word_count
0,1irnln3,heroquest is the perfect entri into osr dnd my...,osr,1,0.118,0.708,0.174,0.5313,186,36
1,1f3scds,to be clear it wa a lot of work befor the game...,osr,1,0.094,0.76,0.147,0.9769,2119,441
2,1di0qn6,so i know there wa a thread discuss peopl disa...,osr,1,0.066,0.859,0.075,0.3694,782,164
3,1g5ga0h,realli love the booklet layout open up charact...,osr,1,0.0,0.758,0.242,0.9237,278,50
4,1grfhij,in thi video i discuss whi i consid castl cru...,osr,1,0.0,0.701,0.299,0.5719,75,15


In [159]:
print(len([score for score in df["compound"] if score > .050]))
print(len([score for score in df["compound"] if score < -.050]))
print(len([score for score in df["compound"] if score > -.050 and score < .050]))


1729
369
149


In [None]:
#X = df["selftext"]

#y = df["is_osr"]

#X_train, X_test, y_train, y_test = train_test_split(X,
#                                                    y, 
#                                                    random_state=123, 
#                                                    train_size=0.25) 

#cvec = CountVectorizer(
#    stop_words="english",
#    ngram_range=(1,2),
#    min_df=25,
#    max_features=100
#)

#X_train_cv = cvec.fit_transform(X_train)
#X_test_cv = cvec.transform(X_test)

#X_train_df_rpg = pd.DataFrame(
#    X_train_cv.toarray(), 
#    columns=cvec.get_feature_names_out()
#)

#X_train_df_rpg.sum().sort_values(ascending = False).head(15).plot(kind='barh');
#X_train_df_rpg.sum().sort_values(ascending = False).head(50)

## Model 1: Multinomial Naive Baye's



In [160]:
X = df["selftext"]

y = df["is_osr"]


X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y, 
                                                    random_state=123, 
                                                    train_size=0.25) 


In [161]:
#put a different stop word list in
pipe = Pipeline([
    ("cvec", CountVectorizer()),
    ("model", MultinomialNB())
])

In [162]:
#1
pipe_params = {
    'cvec__max_features': [500, 3000, 3800, 1000],
    'cvec__min_df': [20, 3],
    'cvec__max_df': [0.9, 0.95],
    'cvec__ngram_range': [(1,1), (1,2)]
}

# Instantiate GridSearchCV.
gs = GridSearchCV(
    pipe, # the thing to gridsearch over
    param_grid=pipe_params, # the hyperparameters to check
    cv=5 # 5-fold cross-validation
)

In [163]:
gs.fit(X_train, y_train)

In [164]:
gs.best_params_

{'cvec__max_df': 0.9,
 'cvec__max_features': 3800,
 'cvec__min_df': 3,
 'cvec__ngram_range': (1, 2)}

In [165]:
# Score model on training set.
gs.score(X_train, y_train)

0.9180035650623886

In [166]:
# Score model on testing set.
gs.score(X_test, y_test)

0.7200474495848161

In [167]:
# 2
pipe_params = {
    'cvec__max_features': [500, 3000, 3800, 1000],
    'cvec__min_df': [20, 3],
    'cvec__max_df': [0.9, 0.6],
    'cvec__ngram_range': [(1,1), (1,2)]
}
# Instantiate GridSearchCV.
gs = GridSearchCV(
    pipe, # the thing to gridsearch over
    param_grid=pipe_params, # the hyperparameters to check
    cv=10 # 5-fold cross-validation
)

gs.fit(X_train, y_train)

print(f"best params: {gs.best_params_}")
# Score model on training set.
print(f"train score:{gs.score(X_train, y_train)}")
# Score model on testing set.
print(f"train score:{gs.score(X_test, y_test)}")

best params: {'cvec__max_df': 0.9, 'cvec__max_features': 3800, 'cvec__min_df': 3, 'cvec__ngram_range': (1, 2)}
train score:0.9180035650623886
train score:0.7200474495848161


In [168]:
# 3 
pipe_params = {
    'cvec__max_features': [3000, 3100, 2800],
    'cvec__min_df': [1, 2, 3],
    'cvec__max_df': [0.9, 0.95, 0.8],
    'cvec__ngram_range': [(1,1), (1,2)]
}

# Instantiate GridSearchCV.
gs = GridSearchCV(
    pipe, # the thing to gridsearch over
    param_grid=pipe_params, # the hyperparameters to check
    cv=5 # 5-fold cross-validation
)

gs.fit(X_train, y_train)

print(f"best params: {gs.best_params_}")
# Score model on training set.
print(f"train score:{gs.score(X_train, y_train)}")
# Score model on testing set.
print(f"train score:{gs.score(X_test, y_test)}")

best params: {'cvec__max_df': 0.9, 'cvec__max_features': 3100, 'cvec__min_df': 1, 'cvec__ngram_range': (1, 1)}
train score:0.9090909090909091
train score:0.7366548042704626


In [169]:
#4
pipe_params = {
    'cvec__max_features': [3000, 3100],
    'cvec__min_df': [100, 30, 3],
    'cvec__max_df': [0.80, 0.88, 0.85],
    'cvec__ngram_range': [(1,1), (1,2)]
}

# Instantiate GridSearchCV.
gs = GridSearchCV(
    pipe, # the thing to gridsearch over
    param_grid=pipe_params, # the hyperparameters to check
    cv=5 # 5-fold cross-validation
)

gs.fit(X_train, y_train)

print(f"best params: {gs.best_params_}")
# Score model on training set.
print(f"train score:{gs.score(X_train, y_train)}")
# Score model on testing set.
print(f"train score:{gs.score(X_test, y_test)}")

best params: {'cvec__max_df': 0.85, 'cvec__max_features': 3000, 'cvec__min_df': 3, 'cvec__ngram_range': (1, 2)}
train score:0.9001782531194296
train score:0.7253855278766311


In [170]:
#cross_val_score(pipe, X_train, y_train, cv=3).mean() 

## Model 2: random forest

In [171]:
X = df.drop(columns=["is_osr","subreddit","post_id", "word_count","text_len"])

y = df["is_osr"]

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y, 
                                                    random_state=123, 
                                                    train_size=0.25, 
                                                   stratify=y) 

In [172]:
X["compound"].iloc[448]

np.float64(0.1516)

In [173]:
y_train


1236    1
924     0
1988    0
294     1
721     1
       ..
92      1
545     1
1828    0
550     1
1312    1
Name: is_osr, Length: 561, dtype: int64

In [174]:
#f_pipe = Pipeline([
#    ("cvec", CountVectorizer()),
#    ("model", RandomForestClassifier())
#])

In [175]:
text_transformer = Pipeline([
    ("tfidf", TfidfVectorizer())
])

# Numeric processing pipeline
numeric_transformer = Pipeline([
    ("scaler", StandardScaler())
])

# Full preprocessing pipeline
preprocessor = ColumnTransformer([
    ("text", text_transformer, "selftext"),
    ("sentiment", numeric_transformer, ["pos", "neu", "neg", "compound"])
])

# Full pipeline with RandomForest
f_pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier())
])

In [176]:
f_pipe.get_params()

{'memory': None,
 'steps': [('preprocessor',
   ColumnTransformer(transformers=[('text',
                                    Pipeline(steps=[('tfidf', TfidfVectorizer())]),
                                    'selftext'),
                                   ('sentiment',
                                    Pipeline(steps=[('scaler', StandardScaler())]),
                                    ['pos', 'neu', 'neg', 'compound'])])),
  ('model', RandomForestClassifier())],
 'transform_input': None,
 'verbose': False,
 'preprocessor': ColumnTransformer(transformers=[('text',
                                  Pipeline(steps=[('tfidf', TfidfVectorizer())]),
                                  'selftext'),
                                 ('sentiment',
                                  Pipeline(steps=[('scaler', StandardScaler())]),
                                  ['pos', 'neu', 'neg', 'compound'])]),
 'model': RandomForestClassifier(),
 'preprocessor__force_int_remainder_cols': True,
 'preprocess

In [177]:
f_pipe_params = {
    'preprocessor__text__tfidf__max_features': [3000, None],
    'preprocessor__text__tfidf__min_df': [1, 3],
    'preprocessor__text__tfidf__max_df': [0.80, 0.88, 0.85],
    'preprocessor__text__tfidf__ngram_range': [(1,1), (1,2)]
}

# Instantiate GridSearchCV.
gs = GridSearchCV(
    f_pipe, # the thing to gridsearch over
    param_grid=f_pipe_params, # the hyperparameters to check
    cv=5, # 5-fold cross-validation
    error_score='raise'
)

gs.fit(X_train, y_train)
print(f"best params: {gs.best_params_}")
# Score model on training set.
print(f"train score:{gs.score(X_train, y_train)}")
# Score model on testing set.
print(f"test score:{gs.score(X_test, y_test)}")

best params: {'preprocessor__text__tfidf__max_df': 0.8, 'preprocessor__text__tfidf__max_features': 3000, 'preprocessor__text__tfidf__min_df': 1, 'preprocessor__text__tfidf__ngram_range': (1, 2)}
train score:1.0
test score:0.7099644128113879


In [178]:
#chat
f_pipe_params = {
    'preprocessor__text__tfidf__max_features': [1000, 2000],
    'preprocessor__text__tfidf__ngram_range': [(1,1)],
    'preprocessor__text__tfidf__min_df': [2, 3],
    'model__n_estimators': [100],
    'model__max_depth': [10, 15],
    'model__min_samples_leaf': [2, 5],
    'model__max_features': ['sqrt']
}

# Instantiate GridSearchCV.
gs = GridSearchCV(
    f_pipe, # the thing to gridsearch over
    param_grid=f_pipe_params, # the hyperparameters to check
    cv=5, # 5-fold cross-validation
    error_score='raise'
)

gs.fit(X_train, y_train)
print(f"best params: {gs.best_params_}")
# Score model on training set.
print(f"train score:{gs.score(X_train, y_train)}")
# Score model on testing set.
print(f"test score:{gs.score(X_test, y_test)}")

best params: {'model__max_depth': 15, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 5, 'model__n_estimators': 100, 'preprocessor__text__tfidf__max_features': 1000, 'preprocessor__text__tfidf__min_df': 2, 'preprocessor__text__tfidf__ngram_range': (1, 1)}
train score:0.9607843137254902
test score:0.7241992882562278


In [179]:
#chat2
f_pipe_params = {
    'preprocessor__text__tfidf__max_features': [1000, 2000],
    'preprocessor__text__tfidf__ngram_range': [(1,1)],
    'preprocessor__text__tfidf__min_df': [2, 3],
    'model__n_estimators': [100],
    'model__max_depth': [10, 15],
    'model__min_samples_leaf': [2, 5],
    'model__max_features': ['sqrt']
}

# Instantiate GridSearchCV.
gs = GridSearchCV(
    f_pipe, # the thing to gridsearch over
    param_grid=f_pipe_params, # the hyperparameters to check
    cv=5, # 5-fold cross-validation
    error_score='raise'
)

gs.fit(X_train, y_train)
print(f"best params: {gs.best_params_}")
# Score model on training set.
print(f"train score:{gs.score(X_train, y_train)}")
# Score model on testing set.
print(f"test score:{gs.score(X_test, y_test)}")

best params: {'model__max_depth': 15, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 5, 'model__n_estimators': 100, 'preprocessor__text__tfidf__max_features': 1000, 'preprocessor__text__tfidf__min_df': 3, 'preprocessor__text__tfidf__ngram_range': (1, 1)}
train score:0.9536541889483066
test score:0.7176749703440095


In [181]:
#1plus
f_pipe_params = {
    'preprocessor__text__tfidf__min_df': [100, 30, 3],
    'preprocessor__text__tfidf__max_df': [0.80, 0.88, 0.85],
    'preprocessor__text__tfidf__ngram_range': [(1,1), (1,2)],
     "model__min_samples_leaf":[1, 2, 3]
}

# Instantiate GridSearchCV.
gs = GridSearchCV(
    f_pipe, # the thing to gridsearch over
    param_grid=f_pipe_params, # the hyperparameters to check
    cv=5 # 5-fold cross-validation
)

gs.fit(X_train, y_train)

print(f"best params: {gs.best_params_}")
# Score model on training set.
print(f"train score:{gs.score(X_train, y_train)}")
# Score model on testing set.
print(f"test score:{gs.score(X_test, y_test)}")

best params: {'model__min_samples_leaf': 3, 'preprocessor__text__tfidf__max_df': 0.85, 'preprocessor__text__tfidf__min_df': 30, 'preprocessor__text__tfidf__ngram_range': (1, 1)}
train score:0.9928698752228164
test score:0.7188612099644128


### Super overfit, but since mmax_depth was not set, that's not surprising

In [182]:
#2
f_pipe_params = {
    'preprocessor__text__tfidf__max_features': [3100, 3000],
    'preprocessor__text__tfidf__min_df': [30, 50],
    'preprocessor__text__tfidf__max_df': [0.70, 0.80],
    'preprocessor__text__tfidf__ngram_range': [(1,3)],
     'model__max_depth': [5, 4]
}

# Instantiate GridSearchCV.
gs = GridSearchCV(
    f_pipe, # the thing to gridsearch over
    param_grid=f_pipe_params, # the hyperparameters to check
    cv=10 # 5-fold cross-validation
)

gs.fit(X_train, y_train)

print(f"best params: {gs.best_params_}")
# Score model on training set.
print(f"train score:{gs.score(X_train, y_train)}")
# Score model on testing set.
print(f"test score:{gs.score(X_test, y_test)}")

best params: {'model__max_depth': 5, 'preprocessor__text__tfidf__max_df': 0.7, 'preprocessor__text__tfidf__max_features': 3100, 'preprocessor__text__tfidf__min_df': 30, 'preprocessor__text__tfidf__ngram_range': (1, 3)}
train score:0.9180035650623886
test score:0.7194543297746144


In [183]:
#2plus
f_pipe_params = {
    'preprocessor__text__tfidf__max_features': [3100, None],
    'preprocessor__text__tfidf__min_df': [20],
    'preprocessor__text__tfidf__max_df': [0.80, 0.85],
    'preprocessor__text__tfidf__ngram_range': [(1,3)],
     'model__max_depth': [4],
     'model__n_estimators': [100, 250, 500]
    
}

# Instantiate GridSearchCV.
gs = GridSearchCV(
    f_pipe, # the thing to gridsearch over
    param_grid=f_pipe_params, # the hyperparameters to check
    cv=5 # 5-fold cross-validation
)

gs.fit(X_train, y_train)

print(f"best params: {gs.best_params_}")
# Score model on training set.
print(f"train score:{gs.score(X_train, y_train)}")
# Score model on testing set.
print(f"test score:{gs.score(X_test, y_test)}")

best params: {'model__max_depth': 4, 'model__n_estimators': 250, 'preprocessor__text__tfidf__max_df': 0.85, 'preprocessor__text__tfidf__max_features': 3100, 'preprocessor__text__tfidf__min_df': 20, 'preprocessor__text__tfidf__ngram_range': (1, 3)}
train score:0.910873440285205
test score:0.7348754448398577


Tweaking 2 iteration of model checking at increasing estimators

In [186]:
#3 
f_pipe_params = {
    'preprocessor__text__tfidf__max_features': [3100, 4000],
    'preprocessor__text__tfidf__min_df': [30, 20],
    'preprocessor__text__tfidf__max_df': [0.80, 0.90],
    'preprocessor__text__tfidf__ngram_range': [(1,3)],
     'model__max_depth': [4, 5]
}

# Instantiate GridSearchCV.
gs = GridSearchCV(
    f_pipe, # the thing to gridsearch over
    param_grid=f_pipe_params, # the hyperparameters to check
    cv=5 # 5-fold cross-validation
)

gs.fit(X_train, y_train)

print(f"best params: {gs.best_params_}")
# Score model on training set.
print(f"train score:{gs.score(X_train, y_train)}")
# Score model on testing set.
print(f"test score:{gs.score(X_test, y_test)}")

best params: {'model__max_depth': 5, 'preprocessor__text__tfidf__max_df': 0.8, 'preprocessor__text__tfidf__max_features': 3100, 'preprocessor__text__tfidf__min_df': 20, 'preprocessor__text__tfidf__ngram_range': (1, 3)}
train score:0.9233511586452763
test score:0.7396204033214709


## even less overfit and the train also performed better
changing max_depth, max_features

In [187]:
#4
f_pipe_params = {
    'preprocessor__text__tfidf__max_features': [3100, None],
    'preprocessor__text__tfidf__min_df': [25, 20],
    'preprocessor__text__tfidf__max_df': [0.80, 0.90],
    'preprocessor__text__tfidf__ngram_range': [(1,3),(1,3)],
}

# Instantiate GridSearchCV.
gs = GridSearchCV(
    f_pipe, # the thing to gridsearch over
    param_grid=f_pipe_params, # the hyperparameters to check
    cv=5 # 5-fold cross-validation
)

gs.fit(X_train, y_train)

print(f"best params: {gs.best_params_}")
# Score model on training set.
print(f"train score:{gs.score(X_train, y_train)}")
# Score model on testing set.
print(f"train score:{gs.score(X_test, y_test)}")

best params: {'preprocessor__text__tfidf__max_df': 0.9, 'preprocessor__text__tfidf__max_features': 3100, 'preprocessor__text__tfidf__min_df': 25, 'preprocessor__text__tfidf__ngram_range': (1, 3)}
train score:1.0
train score:0.7319098457888493


### remove max features

In [188]:
#5
f_pipe_params = {
    'preprocessor__text__tfidf__min_df': [20],
    'preprocessor__text__tfidf__max_df': [0.90],
    'preprocessor__text__tfidf__ngram_range': [(1,3)],
     'model__max_depth': [4],
     'model__n_estimators': [100, 250, 500],

}

# Instantiate GridSearchCV.
gs = GridSearchCV(
    f_pipe, # the thing to gridsearch over
    param_grid=f_pipe_params, # the hyperparameters to check
    cv=5 # 5-fold cross-validation
)

gs.fit(X_train, y_train)

print(f"best params: {gs.best_params_}")
# Score model on training set.
print(f"train score:{gs.score(X_train, y_train)}")
# Score model on testing set.
print(f"test score:{gs.score(X_test, y_test)}")

best params: {'model__max_depth': 4, 'model__n_estimators': 500, 'preprocessor__text__tfidf__max_df': 0.9, 'preprocessor__text__tfidf__min_df': 20, 'preprocessor__text__tfidf__ngram_range': (1, 3)}
train score:0.8948306595365418
test score:0.736061684460261


In [189]:
#6
f_pipe_params = {
    'preprocessor__text__tfidf__min_df': [20],
    'preprocessor__text__tfidf__ngram_range': [(1,3)],
     'model__max_depth': [4],
     'model__n_estimators': [500],

}

# Instantiate GridSearchCV.
gs = GridSearchCV(
    f_pipe, # the thing to gridsearch over
    param_grid=f_pipe_params, # the hyperparameters to check
    cv=5 # 5-fold cross-validation
)

gs.fit(X_train, y_train)

print(f"best params: {gs.best_params_}")
# Score model on training set.
print(f"train score:{gs.score(X_train, y_train)}")
# Score model on testing set.
print(f"test score:{gs.score(X_test, y_test)}")

best params: {'model__max_depth': 4, 'model__n_estimators': 500, 'preprocessor__text__tfidf__min_df': 20, 'preprocessor__text__tfidf__ngram_range': (1, 3)}
train score:0.8894830659536542
test score:0.7301304863582444


In [190]:
#7
f_pipe_params = {
    'preprocessor__text__tfidf__min_df': [20],
    'preprocessor__text__tfidf__ngram_range': [(2,3)],
     'model__max_depth': [4],
     'model__n_estimators': [500],

}

# Instantiate GridSearchCV.
gs = GridSearchCV(
    f_pipe, # the thing to gridsearch over
    param_grid=f_pipe_params, # the hyperparameters to check
    cv=5 # 5-fold cross-validation
)

gs.fit(X_train, y_train)

print(f"best params: {gs.best_params_}")
# Score model on training set.
print(f"train score:{gs.score(X_train, y_train)}")
# Score model on testing set.
print(f"train score:{gs.score(X_test, y_test)}")

best params: {'model__max_depth': 4, 'model__n_estimators': 500, 'preprocessor__text__tfidf__min_df': 20, 'preprocessor__text__tfidf__ngram_range': (2, 3)}
train score:0.8235294117647058
train score:0.6530249110320284


In [191]:
#7plus
f_pipe_params = {
    'preprocessor__text__tfidf__min_df': [20],
    'preprocessor__text__tfidf__max_df': [0.70, 0.50],
    'preprocessor__text__tfidf__ngram_range': [(2,3)],
     'model__max_depth': [4],
     'model__n_estimators': [500]        

}

# Instantiate GridSearchCV.
gs = GridSearchCV(
    f_pipe, # the thing to gridsearch over
    param_grid=f_pipe_params, # the hyperparameters to check
    cv=10 # 5-fold cross-validation
)

gs.fit(X_train, y_train)

print(f"best params: {gs.best_params_}")
# Score model on training set.
print(f"train score:{gs.score(X_train, y_train)}")
# Score model on testing set.
print(f"train score:{gs.score(X_test, y_test)}")

best params: {'model__max_depth': 4, 'model__n_estimators': 500, 'preprocessor__text__tfidf__max_df': 0.7, 'preprocessor__text__tfidf__min_df': 20, 'preprocessor__text__tfidf__ngram_range': (2, 3)}
train score:0.8235294117647058
train score:0.6548042704626335


## Model 3: Logistic Regression (since model is trending toward overfittedness, let's simplify further)

In [195]:
text_transformer = Pipeline([
    ("tfidf", TfidfVectorizer())
])

# Numeric processing pipeline
numeric_transformer = Pipeline([
    ("scaler", StandardScaler())
])

# Full preprocessing pipeline
preprocessor = ColumnTransformer([
    ("text", text_transformer, "selftext"),
    ("sentiment", numeric_transformer, ["pos", "neu", "neg", "compound"])
])

# Full pipeline with RandomForest
pipe_tvec = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LogisticRegression())
])

In [196]:
pipe_tvec.get_params();

In [197]:
pipe_tvec_params = {
    'preprocessor__text__tfidf__max_features': [300, 500, 700, 1000],
    'preprocessor__text__tfidf__min_df': [2, 3, 4, 5],
    'preprocessor__text__tfidf__max_df': [0.75, 0.85, 0.90],
    'preprocessor__text__tfidf__ngram_range': [(1, 1)],
    'preprocessor__text__tfidf__stop_words': ['english'],
    'preprocessor__text__tfidf__sublinear_tf': [True],
    'model__C': [10, 1, 0.1, 0.01],
    "model__class_weight":['balanced']
}
gs_tvec = GridSearchCV(
    pipe_tvec,
    param_grid = pipe_tvec_params,
    cv = 5
)

gs_tvec.fit(X_train, y_train)

print(f"best params: {gs_tvec.best_params_}")
# Score model on training set.
print(f"train score:{gs_tvec.score(X_train, y_train)}")
# Score model on testing set.
print(f"test score:{gs_tvec.score(X_test, y_test)}")

best params: {'model__C': 10, 'model__class_weight': 'balanced', 'preprocessor__text__tfidf__max_df': 0.75, 'preprocessor__text__tfidf__max_features': 700, 'preprocessor__text__tfidf__min_df': 5, 'preprocessor__text__tfidf__ngram_range': (1, 1), 'preprocessor__text__tfidf__stop_words': 'english', 'preprocessor__text__tfidf__sublinear_tf': True}
train score:0.9893048128342246
test score:0.7390272835112692


# Winning model...

## Logistic Regression

though it is not perfectly interpretable, neither is the random forest model, which came the closest to the logistic regression in score. This model also had the least bad variance of all those tested with train score of 89% and a test score of 75%. 


In [198]:


#good params without text and word count
pipe_tvec_params = {
    'preprocessor__text__tfidf__max_features': [500],
    'preprocessor__text__tfidf__min_df': [2],
    'preprocessor__text__tfidf__max_df': [0.85],
    'preprocessor__text__tfidf__ngram_range': [(1, 1)],
    'preprocessor__text__tfidf__stop_words': ['english'],
    'preprocessor__text__tfidf__sublinear_tf': [True],
    'model__C': [1.0],
    "model__class_weight":['balanced']
}
gs_tvec = GridSearchCV(
    pipe_tvec,
    param_grid = pipe_tvec_params,
    cv = 5
)

gs_tvec.fit(X_train, y_train)

print(f"best params: {gs_tvec.best_params_}")
# Score model on training set.
print(f"train score:{gs_tvec.score(X_train, y_train)}")
# Score model on testing set.
print(f"test score:{gs_tvec.score(X_test, y_test)}")


best params: {'model__C': 1.0, 'model__class_weight': 'balanced', 'preprocessor__text__tfidf__max_df': 0.85, 'preprocessor__text__tfidf__max_features': 500, 'preprocessor__text__tfidf__min_df': 2, 'preprocessor__text__tfidf__ngram_range': (1, 1), 'preprocessor__text__tfidf__stop_words': 'english', 'preprocessor__text__tfidf__sublinear_tf': True}
train score:0.893048128342246
test score:0.7485172004744959


<bound method BaseSearchCV.predict_proba of GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('text',
                                                                         Pipeline(steps=[('tfidf',
                                                                                          TfidfVectorizer())]),
                                                                         'selftext'),
                                                                        ('sentiment',
                                                                         Pipeline(steps=[('scaler',
                                                                                          StandardScaler())]),
                                                                         ['pos',
                                                                          'neu',
                                            

In [210]:
sns.regplot(x=X_train, y=y_train, data=df, logistic=True, ci=None)

ValueError: regplot inputs must be 1d