In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Our Baseline model is Logistic Regression. The word representation is computed from Count Vectorizer and Tfidf. The better of the two is used as our baseline model. 

In [0]:
# Models Used
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

# Metrics and Accuracy
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Train-test split
from sklearn.model_selection import train_test_split

# Feature-Extraction from text.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Basics
import pandas as pd
import numpy as np




In [0]:
# Read the dataframe ( DataFrame1, DataFrame2, )
# Commons Things to remove (Expand Contraction, char length > 10, Removing urls, hashtags, mentions emojis, image urls, punctutation)

# DataFrame1 ( cleaned_tweets_1.csv ) (Stopping, Stemming)
df = pd.read_csv('drive/My Drive/DatasetNlp/cleaned_tweets.csv')

In [0]:
df.shape     # 14704 -> Depressing, 19221 -> Total

(28992, 6)

In [0]:
sources_to_keep = ['depressingmsgs', 'cuttingquotes', 'sentiment140', 'togethermw']
sources_not_to_keep = ['dataset1_abhiraj', 'dataset2_abhiraj', 'dataset3_abhiraj']

df = df[df.Cleaned != 'None']
df = df[~df.Source.isin(sources_not_to_keep)]
#df = df[df.Source!=sources_not_to_keep]

df = df.sample(frac=1).reset_index(drop=True)

In [0]:
df.shape    # 1615 -> Depressing,   4010 -> Total

(16442, 6)

In [0]:
df.dropna(inplace=True)

In [0]:
df.columns

Index(['Unnamed: 0', 'Tweet', 'Target', 'Source', 'Cleaned',
       'Cleaned_stop_words'],
      dtype='object')

In [0]:
X_train, X_test, y_train, y_test = train_test_split(
     df['Cleaned'], df['Target'], test_size=0.20, random_state=42)

In [0]:
lr = LogisticRegression(random_state=1)

# Initalise the TfIdf vectoriser 
tvec = TfidfVectorizer()
# Fit the training data on the model
tvec.fit(X_train)

# Transform training data into sparse matrix
X_train_tvec = tvec.transform(X_train)
# Transform training data into sparse matrix
X_test_tvec = tvec.transform(X_test)

# Cross val score/ predict
tvec_score = cross_val_score(lr, X_train_tvec, y_train, cv=3)



In [0]:

# Initalise the Count vectoriser 
cvec = CountVectorizer()
# Fit the training data on the model
cvec.fit(X_train)

# Transform training data into sparse matrix
X_train_cvec = cvec.transform(X_train)

# Transform test data into sparse matrix
X_test_cvec = cvec.transform(X_test)

# Cross val score/ predict
cvec_score = cross_val_score(lr, X_train_cvec, y_train, cv=3 )




In [0]:
# Cvec stores the entire vocabulary of words. 
df_cvec = pd.DataFrame(X_train_cvec.todense(),columns=cvec.get_feature_names())
print(df_cvec.shape)
print(df_cvec.head())

(13152, 15066)
   00  000  000ft  001  ...  zoo  zuba  zwinky  zzzzzzzzzzzzzzz
0   0    0      0    0  ...    0     0       0                0
1   0    0      0    0  ...    0     0       0                0
2   0    0      0    0  ...    0     0       0                0
3   0    0      0    0  ...    0     0       0                0
4   0    0      0    0  ...    0     0       0                0

[5 rows x 15066 columns]


In [0]:
baseline = 0.3
print('Baseline:', baseline)
print('Tfidf Vectorizer Score:', tvec_score.mean())
print('Count Vectorizer Score:', cvec_score.mean())
acc_list = []
acc_list.append(cvec_score.mean())
acc_list.append(tvec_score.mean())

# DataFrame Accuracy 
acc_df = pd.DataFrame()
acc_df['params']= ['cvec', 'tvec']
acc_df['scores']= acc_list
acc_df

Baseline: 0.3
Tfidf Vectorizer Score: 0.8506684355593478
Count Vectorizer Score: 0.8534828579446675


Unnamed: 0,params,scores
0,cvec,0.853483
1,tvec,0.850668


In [0]:
lr = LogisticRegression(random_state=1)
model_l1 = LogisticRegressionCV(Cs=np.logspace(-10,10,21),penalty = 'l1',solver='liblinear',cv=3) 
model_l1.fit(X_train_tvec, y_train)

LogisticRegressionCV(Cs=array([1.e-10, 1.e-09, 1.e-08, 1.e-07, 1.e-06, 1.e-05, 1.e-04, 1.e-03,
       1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05,
       1.e+06, 1.e+07, 1.e+08, 1.e+09, 1.e+10]),
                     class_weight=None, cv=3, dual=False, fit_intercept=True,
                     intercept_scaling=1.0, l1_ratios=None, max_iter=100,
                     multi_class='warn', n_jobs=None, penalty='l1',
                     random_state=None, refit=True, scoring=None,
                     solver='liblinear', tol=0.0001, verbose=0)

In [0]:
y_predict_tvec = model_l1.predict(X_test_tvec)

In [0]:
accuracy_score(y_predict_tvec, y_test)

0.8422012769838857

In [0]:
classification_report(y_test, y_predict_tvec,output_dict=True)

{'0.0': {'f1-score': 0.884177638919884,
  'precision': 0.8788819875776398,
  'recall': 0.8895374943870678,
  'support': 2227},
 '1.0': {'f1-score': 0.7525035765379113,
  'precision': 0.7623188405797101,
  'recall': 0.7429378531073446,
  'support': 1062},
 'accuracy': 0.8422012769838857,
 'macro avg': {'f1-score': 0.8183406077288977,
  'precision': 0.8206004140786749,
  'recall': 0.8162376737472061,
  'support': 3289},
 'weighted avg': {'f1-score': 0.8416608088044523,
  'precision': 0.8412443888814399,
  'recall': 0.8422012769838857,
  'support': 3289}}

In [0]:
model_l1 = LogisticRegressionCV(Cs=np.logspace(-10,10,21),penalty = 'l1',solver='liblinear',cv=3) 
model_l1.fit(X_train_cvec, y_train)

LogisticRegressionCV(Cs=array([1.e-10, 1.e-09, 1.e-08, 1.e-07, 1.e-06, 1.e-05, 1.e-04, 1.e-03,
       1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05,
       1.e+06, 1.e+07, 1.e+08, 1.e+09, 1.e+10]),
                     class_weight=None, cv=3, dual=False, fit_intercept=True,
                     intercept_scaling=1.0, l1_ratios=None, max_iter=100,
                     multi_class='warn', n_jobs=None, penalty='l1',
                     random_state=None, refit=True, scoring=None,
                     solver='liblinear', tol=0.0001, verbose=0)

In [0]:
y_predict_cvec = model_l1.predict(X_test_cvec)
accuracy_score(y_predict_cvec, y_test)

0.8443295834600183

In [0]:
classification_report(y_test, y_predict_cvec,output_dict=True)

{'0.0': {'f1-score': 0.8872743284896522,
  'precision': 0.8629550321199143,
  'recall': 0.9130040779338469,
  'support': 2207},
 '1.0': {'f1-score': 0.7485265225933202,
  'precision': 0.7987421383647799,
  'recall': 0.7042513863216266,
  'support': 1082},
 'accuracy': 0.8443295834600183,
 'macro avg': {'f1-score': 0.8179004255414861,
  'precision': 0.8308485852423471,
  'recall': 0.8086277321277368,
  'support': 3289},
 'weighted avg': {'f1-score': 0.841629717367782,
  'precision': 0.8418305714804935,
  'recall': 0.8443295834600183,
  'support': 3289}}

In [0]:
tvec_ngram = TfidfVectorizer(ngram_range=(1,4)) 
tvec_ngram.fit(X_train)
X_train_tvec_ngram = tvec_ngram.transform(X_train)

# fit with l1 
model_l1 = LogisticRegressionCV(Cs=np.logspace(-10,10,21),penalty = 'l1',solver='liblinear',cv=3) 
model_l1.fit(X_train_tvec_ngram, y_train)

LogisticRegressionCV(Cs=array([1.e-10, 1.e-09, 1.e-08, 1.e-07, 1.e-06, 1.e-05, 1.e-04, 1.e-03,
       1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05,
       1.e+06, 1.e+07, 1.e+08, 1.e+09, 1.e+10]),
                     class_weight=None, cv=3, dual=False, fit_intercept=True,
                     intercept_scaling=1.0, l1_ratios=None, max_iter=100,
                     multi_class='warn', n_jobs=None, penalty='l1',
                     random_state=None, refit=True, scoring=None,
                     solver='liblinear', tol=0.0001, verbose=0)

In [0]:
X_test_tvec_ngram = tvec_ngram.transform(X_test)

In [0]:
y_pred_tvec_ngram = model_l1.predict(X_test_tvec_ngram)

In [0]:
accuracy_score(y_pred_tvec_ngram, y_test)

0.8461538461538461

In [0]:
classification_report(y_test, y_pred_tvec_ngram, output_dict=True)

{'0.0': {'f1-score': 0.8782483156881616,
  'precision': 0.9363776295536173,
  'recall': 0.8269143633892161,
  'support': 2207},
 '1.0': {'f1-score': 0.7910817506193228,
  'precision': 0.7149253731343284,
  'recall': 0.8853974121996303,
  'support': 1082},
 'accuracy': 0.8461538461538461,
 'macro avg': {'f1-score': 0.8346650331537422,
  'precision': 0.8256515013439728,
  'recall': 0.8561558877944232,
  'support': 3289},
 'weighted avg': {'f1-score': 0.8495726624791365,
  'precision': 0.8635252910173842,
  'recall': 0.8461538461538461,
  'support': 3289}}