In [None]:
import time
import pickle
import warnings
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold
from drive.MyDrive.BE_Project_Data.utility_functions import model_evaluate

from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier

In [None]:
# Notebook-wide settings.
# NOTE(review): the "ignore" filter silences every warning for the rest of the session,
# including any library warnings raised by later cells — consider narrowing it.
warnings.filterwarnings("ignore")
# Show up to 150 characters per column so tweet text is readable in DataFrame previews.
pd.set_option("display.max_colwidth", 150)
# Use seaborn's "talk" context (unscaled fonts) for plot element sizing.
sns.set_context("talk", font_scale=1)

In [None]:
# Load the pre-processed training and validation tweet frames.
# NOTE(review): these paths are relative ("drive/...") while later cells write to
# "/content/drive/..." — the two only agree when the working directory is /content.
# pd.read_pickle unpickles arbitrary objects; only load files from a trusted source.
data = pd.read_pickle("drive/MyDrive/BE_Project_Data/processed_training_data.pkl")
val_data = pd.read_pickle("drive/MyDrive/BE_Project_Data/processed_validation_data.pkl")

In [None]:
# Preview the first three rows of the training frame.
data.head(3)

Unnamed: 0,Tweet,Depressive
0,severely depressed summer seriously thought haveis not depression anymore surpassed first holder new worse mental illness,1
1,like wake panic knowing thing get done today already dreading moment foot touch ground mentalhealthissues anxienty depression,1
2,bpd anxiety depression,1


In [None]:
# Preview the last three rows of the training frame.
data.tail(3)

Unnamed: 0,Tweet,Depressive
1081863,ready mojo makeover ask detail,0
1081864,happy th birthday boo time tupac amaru shakur,0
1081865,happy charitytuesday,0


In [None]:
# Preview the first three rows of the validation frame.
val_data.head(3)

Unnamed: 0,Tweet,Depressive
0,today selfcare beauty amp laugh kung fu panda wellness joy laughter selfcare therapist philadelphia,0
1,get spend new year home alone lonely,1
2,depressed lonely stuck deep never ending hole sad,1


In [None]:
# Preview the last three rows of the validation frame.
val_data.tail(3)

Unnamed: 0,Tweet,Depressive
3197,compact metal leaf grinder four layer,0
3198,first christmas year work not feel,0
3199,okay sus let hurt go hugging ya bitter as,0


In [None]:
# Merge the training and validation frames into a single dataset.
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it the
# validation rows keep their own labels (0..3199), which duplicate labels already used
# by the training rows and make label-based lookups (.loc) and the exported index
# column ambiguous.
training_data = pd.concat([data, val_data], ignore_index=True)
training_data.head(3)

Unnamed: 0,Tweet,Depressive
0,severely depressed summer seriously thought haveis not depression anymore surpassed first holder new worse mental illness,1
1,like wake panic knowing thing get done today already dreading moment foot touch ground mentalhealthissues anxienty depression,1
2,bpd anxiety depression,1


In [None]:
# Preview the last three rows of the combined frame.
training_data.tail(3)

Unnamed: 0,Tweet,Depressive
3197,compact metal leaf grinder four layer,0
3198,first christmas year work not feel,0
3199,okay sus let hurt go hugging ya bitter as,0


In [None]:
# (rows, columns) of the combined frame.
training_data.shape

(1085066, 2)

In [None]:
# Shuffle all rows. The fixed seed keeps the row order — and therefore the exported
# files and everything fitted on this frame afterwards — identical across runs.
training_data = training_data.sample(frac=1, random_state=23)

In [None]:
# Count rows per label value to check the class balance of the target column.
training_data["Depressive"].value_counts()

0    878509
1    206557
Name: Depressive, dtype: int64

In [None]:
# Persist the shuffled, combined dataset in both CSV and pickle form.
# to_csv is called with its defaults, so the DataFrame index is written as the first column.
# NOTE(review): absolute "/content/drive/..." paths here, relative "drive/..." paths when loading.
training_data.to_csv("/content/drive/MyDrive/BE_Project_Data/all_processed_tweets.csv")
training_data.to_pickle("/content/drive/MyDrive/BE_Project_Data/all_processed_tweets.pkl")

## **Stacked Generalization: Base Models**

In [None]:
# Base learners, keyed by display name. They are scored individually first and
# reused later as the level-0 estimators of the stacking ensemble.
models_dict = {}
models_dict["MultinomialNB"] = MultinomialNB(alpha=1.0)
models_dict["XGBoost"] = XGBClassifier(
    n_estimators=200,
    max_depth=10,
    gamma=0.1,
)

In [None]:
# Bag-of-words encoder: unigrams and bigrams, vocabulary capped at the 200,000 most
# frequent terms, with scikit-learn's built-in English stop-word list removed.
# NOTE(review): that stop-word list appears to include negations such as "not"/"never" —
# confirm this is intended, since tweets like "not feel" or "not depression" rely on them.
vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=200000, stop_words="english")

In [None]:
# Feature frame holding the tweet text (kept as a DataFrame; the text is read via X.Tweet).
X = training_data.loc[:, ["Tweet"]]
# Target as a 1-D Series. scikit-learn estimators expect y with shape (n_samples,);
# a single-column DataFrame of shape (n_samples, 1) triggers a DataConversionWarning
# and an implicit flatten inside the estimators.
y = training_data.loc[:, "Depressive"]

In [None]:
# Learn the vocabulary on every tweet and encode the corpus as a document-term count matrix.
# NOTE(review): the vocabulary is fitted on the full dataset before cross-validation, so
# held-out folds contribute to it; wrap the vectorizer and model in a Pipeline if strictly
# leak-free estimates are required.
X_vectorized = vectorizer.fit_transform(X.Tweet)

In [None]:
# (number of tweets, vocabulary size) of the encoded matrix.
X_vectorized.shape

(1085066, 200000)

In [None]:
# Shape of the target; its first dimension must match the number of rows in X_vectorized.
y.shape

(1085066, 1)

### Calculate Model Scores


In [None]:
# Display name -> scikit-learn scorer string, passed to cross_validate.
# Result keys come back prefixed with "test_", e.g. "test_F1-score".
# NOTE(review): "f1_macro" averages F1 over both classes, whereas "recall" and
# "precision" are the binary scorers for the positive class only — the reported F1
# is therefore not the harmonic mean of the reported precision and recall.
scoring_metrics = {
                    "Accuracy": "accuracy",
                    "F1-score": "f1_macro",
                    "Recall": "recall",
                    "Precision": "precision"
                  }

def get_model_scores(model, X, y, n_splits=5, n_repeats=1, random_state=23, scoring=None):
    """Cross-validate ``model`` with repeated stratified k-fold splitting.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible classifier.
    X : array-like or sparse matrix
        Feature matrix.
    y : array-like
        Class labels, used both for fitting and for stratifying the folds.
    n_splits : int, default=5
        Number of folds per repetition.
    n_repeats : int, default=1
        Number of times the k-fold split is repeated with a different shuffle.
    random_state : int, default=23
        Seed for the fold shuffling, so the same folds are used for every model.
    scoring : str, list or dict, optional
        Scorer specification forwarded to ``cross_validate``. Defaults to the
        notebook-level ``scoring_metrics`` dict.

    Returns
    -------
    dict
        The ``cross_validate`` result: fit/score times plus one ``test_<name>``
        array per metric, each with ``n_splits * n_repeats`` entries.
    """
    if scoring is None:
        # Resolved at call time so edits to the notebook-level dict are picked up.
        scoring = scoring_metrics
    kfold = RepeatedStratifiedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=random_state
    )
    return cross_validate(model, X, y, scoring=scoring, cv=kfold, verbose=3)

# Cross-validation results per model name. A plain dict is used on purpose:
# defaultdict() without a default factory behaves exactly like a dict (missing keys
# still raise KeyError) and only suggests a fallback that does not exist.
model_scores = {}

for name, model in models_dict.items():
    print("Evaluating {}".format(name))
    model_scores[name] = get_model_scores(model, X_vectorized, y)

Evaluating MultinomialNB
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  , Accuracy=0.936, F1-score=0.896, Precision=0.834, Recall=0.828, total=   0.6s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s


[CV]  , Accuracy=0.936, F1-score=0.896, Precision=0.837, Recall=0.827, total=   0.6s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.4s remaining:    0.0s


[CV]  , Accuracy=0.936, F1-score=0.895, Precision=0.835, Recall=0.825, total=   0.6s
[CV]  ................................................................
[CV]  , Accuracy=0.936, F1-score=0.895, Precision=0.834, Recall=0.827, total=   0.6s
[CV]  ................................................................
[CV]  , Accuracy=0.937, F1-score=0.897, Precision=0.837, Recall=0.827, total=   0.6s
Evaluating XGBoost
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  , Accuracy=0.957, F1-score=0.924, Precision=0.993, Recall=0.780, total= 9.9min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  9.9min remaining:    0.0s


[CV]  , Accuracy=0.957, F1-score=0.924, Precision=0.993, Recall=0.779, total= 8.6min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 18.5min remaining:    0.0s


[CV]  , Accuracy=0.957, F1-score=0.923, Precision=0.993, Recall=0.779, total= 8.5min
[CV]  ................................................................
[CV]  , Accuracy=0.957, F1-score=0.924, Precision=0.994, Recall=0.780, total= 8.8min
[CV]  ................................................................
[CV]  , Accuracy=0.957, F1-score=0.923, Precision=0.994, Recall=0.778, total= 8.7min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 44.5min finished


In [None]:
# Number of models that have been scored so far.
len(model_scores)

2

In [None]:
# Print the raw cross_validate result dicts (timings and per-fold metrics) of both base models.
print("MNB scores : ", model_scores["MultinomialNB"])
print("XGB scores : ", model_scores["XGBoost"])

MNB scores :  {'fit_time': array([0.30264306, 0.32928896, 0.30161333, 0.30597997, 0.31105494]), 'score_time': array([0.2908628 , 0.2960031 , 0.27243519, 0.28874063, 0.29564476]), 'test_Accuracy': array([0.93582442, 0.93627571, 0.9355246 , 0.93576882, 0.93653836]), 'test_F1-score': array([0.89563262, 0.89615231, 0.89494494, 0.89544724, 0.89657938]), 'test_Recall': array([0.82804028, 0.82655951, 0.82479243, 0.82672896, 0.82721727]), 'test_Precision': array([0.8337111 , 0.83670669, 0.83457026, 0.83434309, 0.83743384])}
XGB scores :  {'fit_time': array([589.16144323, 514.48528194, 507.8316319 , 524.29440546,
       518.93397188]), 'score_time': array([3.38918257, 3.40951538, 3.39116216, 3.45065546, 3.3677361 ]), 'test_Accuracy': array([0.95699356, 0.95687816, 0.956786  , 0.95725602, 0.95685051]), 'test_F1-score': array([0.92377046, 0.92352908, 0.92337722, 0.92422533, 0.92345526]), 'test_Recall': array([0.77960399, 0.77877563, 0.77870301, 0.78015541, 0.77832107]), 'test_Precision': array([0

In [None]:
def plot_results(model_scores, metric):
    """Show one box plot per model for a single cross-validation metric.

    Parameters
    ----------
    model_scores : dict
        Maps a model name to a mapping of metric name -> per-fold values
        (the notebook passes ``cross_validate`` results here).
    metric : str
        Key to plot for every model, e.g. ``"test_F1-score"``.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    # Collect every model's values up front; a missing metric raises KeyError here.
    per_model = [
        (label, fold_scores[metric]) for label, fold_scores in model_scores.items()
    ]

    # Appearance shared by every box: draw all fold points next to a thin box.
    box_style = dict(
        boxpoints="all",
        jitter=0.5,
        whiskerwidth=0.3,
        marker_size=3,
        line_width=1,
    )

    fig = go.Figure(layout=go.Layout(height=600, width=1000))
    for label, values in per_model:
        fig.add_trace(go.Box(y=values, name=label, **box_style))

    background = "rgb(243, 243, 243)"
    fig.update_layout(
        title="Comparison based on : " + metric,
        paper_bgcolor=background,
        plot_bgcolor=background,
        xaxis_title="Model",
        yaxis_title=metric,
        showlegend=False,
    )
    fig.show()
    

# One comparison figure per metric, in the same order as before.
for metric_key in ("test_F1-score", "test_Accuracy", "test_Recall", "test_Precision"):
    plot_results(model_scores, metric=metric_key)

## **Stacked Generalization**

In [None]:
# Reuse the estimator objects already defined for the individual evaluation.
MNB_clf, XGB_clf = models_dict["MultinomialNB"], models_dict["XGBoost"]

In [None]:
# Level-0 learners as (name, estimator) pairs, the form StackingClassifier expects.
base_models = [
    ("MultinomialNB", MNB_clf),
    ("XGBoost", XGB_clf),
]

# Level-1 learner that is trained on the base models' predictions.
meta_model = LogisticRegressionCV()

In [None]:
# Stacking ensemble:
#   cv=5             -> base-model predictions for the meta-learner come from 5-fold CV
#   passthrough=True -> the meta-learner also receives the original features
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    passthrough=True,
    verbose=3,
)

In [None]:
# Score the ensemble with the same cross-validation routine as the base models and
# store the result next to theirs so all three can be plotted together.
model_scores["Stacking"] = stacking_scores = get_model_scores(stacking_model, X_vectorized, y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  7.7min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 14.9min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 37.7min finished


[CV]  , Accuracy=0.961, F1-score=0.931, Precision=0.976, Recall=0.812, total=60.7min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 60.7min remaining:    0.0s
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  7.9min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 15.9min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 37.8min finished


[CV]  , Accuracy=0.961, F1-score=0.932, Precision=0.976, Recall=0.813, total=60.8min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 121.5min remaining:    0.0s
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  7.6min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 14.8min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 36.6min finished


[CV]  , Accuracy=0.960, F1-score=0.931, Precision=0.976, Recall=0.812, total=58.6min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  7.4min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 14.6min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 36.2min finished


[CV]  , Accuracy=0.961, F1-score=0.932, Precision=0.976, Recall=0.814, total=58.3min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  7.9min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 15.6min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 39.0min finished


[CV]  , Accuracy=0.960, F1-score=0.931, Precision=0.976, Recall=0.811, total=60.9min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 299.4min finished


In [None]:
# Print the raw cross_validate result dict (timings and per-fold metrics) of the ensemble.
print("Stacking model scores : ", model_scores["Stacking"])

Stacking model scores :  {'fit_time': array([3635.83363247, 3647.04435325, 3514.08161998, 3495.19924426,
       3653.10492635]), 'score_time': array([3.61448288, 3.63774705, 3.61885071, 3.63119841, 3.57309103]), 'test_Accuracy': array([0.96050485, 0.96055075, 0.96026966, 0.96090557, 0.96016368]), 'test_F1-score': array([0.93142652, 0.93150503, 0.93100907, 0.93217126, 0.93080845]), 'test_Recall': array([0.81242738, 0.81254387, 0.81167244, 0.81433516, 0.81109605]), 'test_Precision': array([0.97609423, 0.97623895, 0.97550403, 0.97637498, 0.97551603])}


In [None]:
# Redraw the per-metric comparisons; model_scores now also holds the "Stacking" entry.
for metric_key in ("test_F1-score", "test_Accuracy", "test_Recall", "test_Precision"):
    plot_results(model_scores, metric=metric_key)

In [None]:
# Fit the ensemble on the full dataset so it can be exported.
# `stacking_clf` is a second name for `stacking_model` (no copy is made), so this call
# fits that same object. The call is the cell's last expression, so the fitted
# estimator's repr is displayed.
stacking_clf = stacking_model
stacking_clf.fit(X_vectorized, y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  8.6min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 17.2min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 43.0min finished


StackingClassifier(cv=5,
                   estimators=[('MultinomialNB',
                                MultinomialNB(alpha=1.0, class_prior=None,
                                              fit_prior=True)),
                               ('XGBoost',
                                XGBClassifier(base_score=0.5, booster='gbtree',
                                              colsample_bylevel=1,
                                              colsample_bynode=1,
                                              colsample_bytree=1, gamma=0.1,
                                              learning_rate=0.1,
                                              max_delta_step=0, max_depth=10,
                                              min_child_weight=1, missing=None,
                                              n_estimators=200, n_jobs=1,
                                              nthread=...
                                              subsample=1, verbosity=1))],
                   final

In [None]:
# Serialize the fitted ensemble. The context manager guarantees the file is flushed
# and closed even if pickling fails part-way; a bare open() left the handle dangling.
with open("/content/drive/MyDrive/BE_Project_Data/stacking_clf.pkl", "wb") as model_file:
    pickle.dump(stacking_clf, model_file)

In [None]:
# Serialize the fitted vectorizer; it must be loaded together with the model so that
# new text is encoded with the same vocabulary. The context manager guarantees the
# file is flushed and closed; a bare open() left the handle dangling.
with open("/content/drive/MyDrive/BE_Project_Data/vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)