In [9]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

# Import CountVectorizer and TFIDFVectorizer from feature_extraction.text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [10]:
# Set-up DataFrame for NLP
subreddits = pd.read_csv('../data/subreddits_clean.csv')

In [11]:
#Set-up X- and y-variables
# NOTE: cvec needs a vector(Series), not a matrix(DataFrame)
X = subreddits['submission_text']
y = subreddits['subreddit']

In [12]:
# Redefine training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y, # ALthough data is very well split b/w y-categories, ensure test/train evenly split
                                                    random_state=42)

In [6]:
# Stop Words Identified
%store -r my_stop_words
my_stop_words

['just', 'like', 'don', 've', 'things', 'in my', 'of my']

In [13]:
# Pipeline Set-Up:
pipe_tvec = Pipeline([ ('tvec', TfidfVectorizer()), # Stage 1. TF-IDF Vectorizer (Transformer)
                      #('cvec', CountVectorizer()),
                       ('nb',   MultinomialNB()  )  # Stage 2. Multinomial Naive Bayes (Estimator)
                    ])

In [14]:
# Hyperparameter Search:
pipe_tvec_params = {'tvec__max_features': [1000],# Max No. of features fit: 1K & 2K
                    #'tvec__stop_words': ['english'],    # No stop words and english stop words
                    'tvec__ngram_range': [(1,2)],       # Check (individual tokens) and also check (individual tokens and 2-grams)
                    'tvec__min_df' : [2],               # Min doc freq of 2
                    #'tvec__max_df' : [1.0, 0.90]        # Max doc freq. of 37.5%
                    }

In [None]:
# Hyperparameter Search:
pipe_cvec_params = {'cvec__max_features': [10000, 9000], # Max No. of features fit: 1K & 2K
                    'cvec__stop_words': ['english'],     # No stop words and english stop words
                    'cvec__ngram_range': [(1,2)],        # Check (individual tokens) and also check (individual tokens and 2-grams)
                    'cvec__min_df' : [2],
                    'cvec__max_df' : [1.0, 0.85]         # Max doc freq. of 37.5%
                    }

In [10]:
import os
os.cpu_count()

12

In [15]:
# Instantiate GridSearchCV.
gs_tvec = GridSearchCV(pipe_tvec,
                       pipe_tvec_params,
                       cv = 5,
                       n_jobs = 10,
                       verbose = 1)

In [16]:
# Fit GridSearch to training data.
gs_tvec.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   5 out of   5 | elapsed:   42.5s finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('nb', MultinomialNB())]),
             n_jobs=10,
             param_grid={'tvec__max_features': [1000], 'tvec__min_df': [2],
                         'tvec__ngram_range': [(1, 2)]},
             verbose=1)

In [35]:
# Transform the train corpus
X_train = gs_tvec.transform(X_train)

# Transform test corpus
X_test = gs_tvec.transform(X_test)

AttributeError: 'MultinomialNB' object has no attribute 'transform'

In [18]:
gs_tvec.grid_scores_

AttributeError: 'GridSearchCV' object has no attribute 'grid_scores_'

In [19]:
gs_tvec.best_params_

{'tvec__max_features': 1000, 'tvec__min_df': 2, 'tvec__ngram_range': (1, 2)}

In [20]:
gs_tvec.cv_results_;

In [21]:
# Score model on training set.
gs_tvec.best_score_


0.7072901678657074

In [22]:
# Training scores of Model
tvec_train_score = gs_tvec.score(X_train, y_train)
print(f'Training Scores:\n\tTF-IDF  = {round(tvec_train_score*100,3)}%')

Training Scores:
	TF-IDF  = 70.83%


In [23]:
# Test scores of Model
tvec_test_score = gs_tvec.score(X_test, y_test)
#\n\tCount-V = {round(cvec_train_score*100,3)}%
print(f'Training Scores:\n\tTF-IDF  = {round(tvec_test_score*100,3)}%')

Training Scores:
	TF-IDF  = 70.627%


In [24]:
# Get predictions
preds_tvec = gs_tvec.predict(X_test)

# Save confusion matrix values
tn, fp, fn, tp = confusion_matrix(y_test, preds_tvec).ravel()

In [25]:
# Calculate the sensitivity
print(tp / (tp + fn))

0.7110648957791748


In [26]:
# Calculate the specificity
print(tn / (tn + fp))

0.7013239875389408


In [27]:
# Calculate the precision
print(tp/ (tp+fp))

0.7102927289896128


In [29]:
# STOP WORDS?

# Convert training data to DataFrame
X_train_df = pd.DataFrame(X_train.todense(), 
                          columns=cvec.get_feature_names())

# plot top occuring words
X_train_df.sum().sort_values(ascending=False).head(40).plot(kind='barh',figsize=(10,13));
plt.axvline(x=X_train_df.sum().sort_values().quantile(q=0.964),c='r');

AttributeError: 'Series' object has no attribute 'todense'

In [32]:
# STOP WORDS?

# Convert training data to DataFrame
X_train_df = pd.DataFrame(X_train, 
                          columns=gs_tvec.get_feature_names())

# plot top occuring words
X_train_df.sum().sort_values(ascending=False).head(40).plot(kind='barh',figsize=(10,13));
plt.axvline(x=X_train_df.sum().sort_values().quantile(q=0.964),c='r');

AttributeError: 'GridSearchCV' object has no attribute 'get_feature_names'

In [None]:
# Let's look at sklearn's stopwords.
#print(CountVectorizer(stop_words = 'english').get_stop_words())

In [28]:
X_train

149521    I lost my mother a few months before the pande...
63408     I can’t do this I can’t do this I can’t do thi...
158229    I don't even know who I am anymore. It's hard ...
139054    I don't know how to feel sad and disappointed ...
130038    Finding out no one really likes you. You know ...
                                ...                        
52784     I'm scared of myself.. I have psychotic depres...
108481    Depression's coming back again. I'm starting t...
80932     I feel alone. Not enough people care about the...
42590     I feel lost without my suicidal thoughts. I ba...
146429    Am I depressed?. I think I’m becoming depresse...
Name: submission_text, Length: 125100, dtype: object