In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

**NB and SVM are best algorithms for Sentiment classification other than Deep Neural Networks.**

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
df_train = pd.read_csv("/content/drive/My Drive/Colab Notebooks/AnalyticsVidhya/SentimentAnalysis/train.csv")
df_test = pd.read_csv("/content/drive/My Drive/Colab Notebooks/AnalyticsVidhya/SentimentAnalysis/test.csv")
df_sub = pd.read_csv('/content/drive/My Drive/Colab Notebooks/AnalyticsVidhya/SentimentAnalysis/sample_submission.csv')

In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


In [5]:
df_test.info()
df_test.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17197 entries, 0 to 17196
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      17197 non-null  int64 
 1   tweet   17197 non-null  object
dtypes: int64(1), object(1)
memory usage: 268.8+ KB


Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [6]:
print(df_train.label.value_counts())

0    29720
1     2242
Name: label, dtype: int64


In [7]:
df_train['idf'] = 'train'
df_test['idf'] = 'test'
df_total= pd.concat([df_train, df_test],axis=0, ignore_index=True)

In [8]:
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def map_func(text):
  text = text.replace("@user"," ")
  text = re.sub("[^a-zA-Z0-9#']+", " ",text)
  re.sub(' +',' ',text)
  text = text.lower().strip()
  text = [w for w in text.split(" ") if w not in stop_words]
  text = " ".join(text)
  return text

df_total.tweet = df_total.tweet.apply(lambda x: map_func(x))
df_total.tweet
# Short abbreviation used in tweets needs to be expanded

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0        father dysfunctional selfish drags kids dysfun...
1        thanks #lyft credit can't use cause offer whee...
2                                           bihday majesty
3                             #model love u take u time ur
4                           factsguide society #motivation
                               ...                        
49154    thought factory left right polarisation #trump...
49155    feeling like mermaid #hairflip #neverready #fo...
49156    #hillary #campaigned today #ohio omg amp used ...
49157    happy work conference right mindset leads cult...
49158    song glad free download #shoegaze #newmusic #n...
Name: tweet, Length: 49159, dtype: object

In [9]:
#Vectorizer, we use TfIDFVectorizer

df_train = df_total[df_total['idf']=='train']
df_test = df_total[df_total['idf']=='test']

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
tfidf_vect = TfidfVectorizer(analyzer='word',stop_words='english')
X_train = tfidf_vect.fit_transform(df_train.tweet)
X_test = tfidf_vect.transform(df_test.tweet)
y_train = df_train['label']
print(X_train.shape)
print(X_test.shape)

(31962, 38747)
(17197, 38747)


In [10]:
# Models 
class_wts = {0: 0.53771870794, 1: 7.12801070473}  #find by formula (total_samples/(total_categories*each_catg_samples))
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
scores = cross_val_score(clf, X_train, y_train, scoring='f1', cv=10, n_jobs=-1)
clf.fit(X_train, y_train)
print("Logistic Regression: ",scores)
print(confusion_matrix(y_train, clf.predict(X_train)))

clf = LogisticRegression(class_weight=class_wts)
scores = cross_val_score(clf, X_train, y_train, scoring='f1', cv=10, n_jobs=-1)
clf.fit(X_train, y_train)
print("Weighted Logistic Regression: ",scores)
print(confusion_matrix(y_train, clf.predict(X_train)))

from sklearn.svm import LinearSVC
clf = LinearSVC()
scores = cross_val_score(clf, X_train, y_train, scoring='f1', cv=10, n_jobs=-1)
clf.fit(X_train, y_train)
print("SVM: ",scores)
print(confusion_matrix(y_train, clf.predict(X_train)))

clf = LinearSVC(class_weight=class_wts)
scores = cross_val_score(clf, X_train, y_train, scoring='f1', cv=10, n_jobs=-1)
clf.fit(X_train, y_train)
print("Weighted SVM: ",scores)
print(confusion_matrix(y_train, clf.predict(X_train)))

from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)
scores = cross_val_score(clf, X_train, y_train, scoring='f1', cv=10, n_jobs=-1)
print("Navie Bayes: ",scores)
print(confusion_matrix(y_train, clf.predict(X_train)))



Logistic Regression:  [0.42758621 0.44368601 0.38754325 0.50980392 0.48184818 0.45117845
 0.38596491 0.42214533 0.40689655 0.40701754]
[[29688    32]
 [ 1394   848]]
Weighted Logistic Regression:  [0.69728601 0.67474747 0.67961165 0.67399267 0.68627451 0.66666667
 0.67175573 0.69822485 0.671875   0.71084337]
[[28852   868]
 [   17  2225]]
SVM:  [0.7032967  0.70810811 0.65738162 0.73766234 0.72823219 0.70810811
 0.70136986 0.72282609 0.66666667 0.70718232]
[[29709    11]
 [  122  2120]]
Weighted SVM:  [0.73611111 0.70533643 0.68480726 0.72173913 0.71840355 0.70720721
 0.72321429 0.73825503 0.70852018 0.71070615]
[[29543   177]
 [    8  2234]]
Navie Bayes:  [0.26923077 0.22834646 0.30943396 0.32089552 0.3030303  0.35294118
 0.25       0.27027027 0.30827068 0.26356589]
[[29720     0]
 [ 1673   569]]


In [11]:
from imblearn.over_sampling import RandomOverSampler, SMOTE

def balanced_data_fun(strategy):
  if strategy=='oversample':
    oversample= RandomOverSampler(sampling_strategy=0.65)
    return oversample.fit_resample(X_train, y_train)
  if strategy=='smote':
    smote = SMOTE()
    return smote.fit_resample(X_train, y_train)
  if strategy=='none':
    return X_train, y_train
X_train_over, y_train_over = balanced_data_fun('none')



In [12]:
# Models 
class_wts = {0: 0.53771870794, 1: 7.12801070473}  #find by formula (total_samples/(total_categories*each_catg_samples))
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
scores = cross_val_score(clf, X_train_over, y_train_over, scoring='f1', cv=10, n_jobs=-1)
clf.fit(X_train_over, y_train_over)
print("Logistic Regression: ",scores)
print(confusion_matrix(y_train_over, clf.predict(X_train_over)))

clf = LogisticRegression(class_weight=class_wts)
scores = cross_val_score(clf, X_train_over, y_train_over, scoring='f1', cv=10, n_jobs=-1)
clf.fit(X_train_over, y_train_over)
print("Weighted Logistic Regression: ",scores)
print(confusion_matrix(y_train_over, clf.predict(X_train_over)))

from sklearn.svm import LinearSVC
clf = LinearSVC()
scores = cross_val_score(clf, X_train_over, y_train_over, scoring='f1', cv=10, n_jobs=-1)
clf.fit(X_train_over, y_train_over)
print("SVM: ",scores)
print(confusion_matrix(y_train_over, clf.predict(X_train_over)))

clf = LinearSVC(class_weight=class_wts)
scores = cross_val_score(clf, X_train_over, y_train_over, scoring='f1', cv=10, n_jobs=-1)
clf.fit(X_train_over, y_train_over)
print("Weighted SVM: ",scores)
print(confusion_matrix(y_train_over, clf.predict(X_train_over)))

from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train_over, y_train_over)
scores = cross_val_score(clf, X_train_over, y_train_over, scoring='f1', cv=10, n_jobs=-1)
print("Navie Bayes: ",scores)
print(confusion_matrix(y_train_over, clf.predict(X_train_over)))

Logistic Regression:  [0.42758621 0.44368601 0.38754325 0.50980392 0.48184818 0.45117845
 0.38596491 0.42214533 0.40689655 0.40701754]
[[29688    32]
 [ 1394   848]]
Weighted Logistic Regression:  [0.69728601 0.67474747 0.67961165 0.67399267 0.68627451 0.66666667
 0.67175573 0.69822485 0.671875   0.71084337]
[[28852   868]
 [   17  2225]]
SVM:  [0.7032967  0.70810811 0.65738162 0.73766234 0.72823219 0.70810811
 0.70136986 0.72282609 0.66666667 0.70718232]
[[29709    11]
 [  122  2120]]
Weighted SVM:  [0.73611111 0.70533643 0.68480726 0.72173913 0.71840355 0.70720721
 0.72321429 0.73825503 0.70852018 0.71070615]
[[29543   177]
 [    8  2234]]
Navie Bayes:  [0.26923077 0.22834646 0.30943396 0.32089552 0.3030303  0.35294118
 0.25       0.27027027 0.30827068 0.26356589]
[[29720     0]
 [ 1673   569]]


In [13]:
from xgboost  import XGBClassifier
clf = XGBClassifier()
scores = cross_val_score(clf, X_train, y_train, scoring='f1', cv=10, n_jobs=-1)
clf.fit(X_train, y_train)
print("XGB Classifier: ",scores)
print(confusion_matrix(y_train, clf.predict(X_train)))

scores = cross_val_score(clf, X_train_over, y_train_over, scoring='f1', cv=10, n_jobs=-1)
clf.fit(X_train_over, y_train_over)
print("XGB Classifier: ",scores)
print(confusion_matrix(y_train_over, clf.predict(X_train_over)))

XGB Classifier:  [0.29739777 0.2661597  0.31272727 0.38297872 0.32608696 0.2962963
 0.27924528 0.30188679 0.32234432 0.28571429]
[[29696    24]
 [ 1796   446]]
XGB Classifier:  [0.29739777 0.2661597  0.31272727 0.38297872 0.32608696 0.2962963
 0.27924528 0.30188679 0.32234432 0.28571429]
[[29696    24]
 [ 1796   446]]


In [17]:
from sklearn.model_selection import GridSearchCV
svc = LinearSVC()
params = {'penalty': ['l1', 'l2'],
          'C': [0.5,1.0,2.0,3.0,5.0],
          'fit_intercept': [True, False],
          'max_iter': [500,1000,2500]
}
clf = GridSearchCV(svc, params,scoring='f1')
clf.fit(X_train, y_train)
clf.best_params_

ValueError: Unsupported set of arguments: The combination of penalty='l1' and loss='squared_hinge' are not supported when dual=True, Parameters: penalty='l1', loss='squared_hinge', dual=True



{'C': 1.0, 'fit_intercept': False, 'max_iter': 500, 'penalty': 'l2'}

In [14]:
clf = LinearSVC()
clf.fit(X_train, y_train)
preds = clf.predict(X_test) #f1-score: 0.756

LinearSVC(C=1.0,fit_intercept=False,max_iter=500,penalty='l2')


df_sub['label'] = preds
df_sub.to_csv('/content/drive/My Drive/Colab Notebooks/AnalyticsVidhya/SentimentAnalysis/submission.csv', index=False)  