In [2]:
import time
import pickle
import warnings
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier

In [3]:
# Notebook-wide configuration: warning filter, pandas column width, seaborn context.
# NOTE(review): this installs a blanket "ignore" filter for every warning
# category, so diagnostics raised by later cells are hidden as well.
warnings.simplefilter("ignore")
pd.options.display.max_colwidth = 150
sns.set_context(context="talk", font_scale=1)

In [4]:
# Load the preprocessed tweets; later cells read the "Tweet" (text) and
# "Depressive" (label) columns of this frame.
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary
# code -- only load pickles from a trusted source.
training_data = pd.read_pickle("data/all_processed_tweets.pkl")
# Preview the first five rows.
training_data.head(n=5)

Unnamed: 0,Tweet,Depressive
805878,bhuki ladki got attracted word kaaju like moth flame,0
995414,saw lvatt ad,0
641489,ok need hand still think go samsung oled android goodness,0
595108,video look terrific natural front camera,0
19016,oh would fun hear citi field amazing but no not going friend tho hopefully take lot picture jealous still post concert depression kcon mx,1


In [5]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
training_data.shape

(1085066, 2)

In [6]:
# Shuffle all rows (frac=1 keeps every row, only the order changes).
# The seed is fixed so that Restart & Run All always yields the same row order:
# the stacking ensemble below is built with cv=5, and an integer cv produces
# unshuffled folds, so the row order decides which rows land in which fold.
# Note: re-running this cell on its own reshuffles the already-shuffled frame.
training_data = training_data.sample(frac=1, random_state=42)

In [7]:
# Count rows per label value to inspect the class balance.
training_data["Depressive"].value_counts()

0    878509
1    206557
Name: Depressive, dtype: int64

In [8]:
# Bag-of-words features: unigrams and bigrams, vocabulary capped at 200000
# entries, with scikit-learn's built-in English stop-word list applied.
# NOTE(review): the tweets look already preprocessed and still contain words
# such as "but", "no" and "not"; the built-in stop list may drop them --
# confirm that this is intended.
vectorizer = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    max_features=200000,
)

In [9]:
# Split the frame into features and target. Both are selected with a list of
# column names, so each is a one-column DataFrame (not a Series).
X = training_data[["Tweet"]]
y = training_data[["Depressive"]]

In [10]:
# Learn the vocabulary from the tweet text and build the document-term matrix
# in a single pass.
X_vectorized = vectorizer.fit_transform(X["Tweet"])

In [11]:
# Sanity check: (number of tweets, vocabulary size) of the document-term matrix.
X_vectorized.shape

(1085066, 200000)

In [12]:
# Sanity check: the target must have one row per tweet in X_vectorized.
y.shape

(1085066, 1)

In [13]:
# Registry of the base learners, keyed by the display name used for them in
# the stacking ensemble.
models_dict = dict(
    MultinomialNB=MultinomialNB(alpha=1.0),
    XGBoost=XGBClassifier(n_estimators=200, max_depth=10, gamma=0.1),
)

In [14]:
# Pull the two base learners out of the registry under short names.
MNB_clf, XGB_clf = models_dict["MultinomialNB"], models_dict["XGBoost"]

In [15]:
# Final (meta) estimator that combines the base learners' outputs.
meta_model = LogisticRegressionCV()

# (name, estimator) pairs in the form StackingClassifier expects.
base_models = [
    ("MultinomialNB", MNB_clf),
    ("XGBoost", XGB_clf),
]

In [16]:
# Stacked ensemble: the base learners' cross-validated predictions (5 folds)
# feed the meta model; passthrough=True additionally gives the meta model the
# original feature matrix. verbose=3 prints progress during fitting.
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    passthrough=True,
    verbose=3,
)

In [17]:
# Later cells refer to the fitted ensemble as `stacking_clf`; it is the same
# object as `stacking_model`, not a copy.
stacking_clf = stacking_model

# scikit-learn expects a 1-D target of shape (n_samples,). `y` was selected
# with a list of columns and is therefore a one-column DataFrame, which makes
# scikit-learn raise a DataConversionWarning (hidden by the global warning
# filter). Flatten it explicitly; `.values.ravel()` is a no-op for a target
# that is already 1-D.
# The fit call is the last expression so the fitted estimator is displayed.
stacking_clf.fit(X_vectorized, y.values.ravel())



[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.




[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.8min remaining:    0.0s




[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 11.9min remaining:    0.0s




[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 29.8min finished


StackingClassifier(cv=5,
                   estimators=[('MultinomialNB', MultinomialNB()),
                               ('XGBoost',
                                XGBClassifier(base_score=None, booster=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=None, gamma=0.1,
                                              gpu_id=None,
                                              importance_type='gain',
                                              interaction_constraints=None,
                                              learning_rate=None,
                                              max_delta_step=None, max_depth=10,
                                              min_child_weight=None,
                                              missing=nan,
                                              monotone_constraints=None,
                

In [18]:
from joblib import dump, load

In [19]:
# Persist the fitted stacking ensemble to disk with joblib.
dump(value=stacking_clf, filename="stack_model.joblib")

['stack_model.joblib']

In [26]:
# Smoke test: vectorize one raw sentence and print the predicted label.
# NOTE(review): the training tweets appear to have been preprocessed before
# vectorization, while this input is raw text -- confirm that inference
# inputs go through the same preprocessing.
sample_tweets = ["I am destroyed"]
sample_features = vectorizer.transform(sample_tweets)
print(stacking_clf.predict(sample_features))

[0]


In [27]:
# Persist the fitted vectorizer as well: the saved model can only score text
# that has been transformed with this exact vocabulary.
dump(value=vectorizer, filename="vectorizer.joblib")

['vectorizer.joblib']