## Import libraries

In [1]:
# data manipulation
import pandas as pd
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)

# linear algebra
import numpy as np

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
plt.rcParams['figure.figsize'] = 12,6
sns.set_style('darkgrid')
%matplotlib inline

# machine learning
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold, StratifiedKFold
from sklearn.naive_bayes import ComplementNB
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix

## Import dataset

In [2]:
df = pd.read_csv('train_lemma.csv')
df.head()

Unnamed: 0,compound,tweet,polarity
0,-0.3818,aww that bummer you shoulda get david carr of ...,0
1,-0.7269,be upset that he can update his facebook by te...,0
2,0.4939,dive many time for the ball manage to save the...,0
3,-0.25,my whole body feel itchy and like its on fire,0
4,-0.6597,no it not behave at all mad why be here becaus...,0


## A little bit data cleaning
Lexicon scores that are above 0 are considered as positive, and negative if they are below 0. We should expect to get positive lexicon for positive tweets and negative lexicon for negative tweets. But I noticed that many tweets do not behave like this. Maybe there is error during labelling process, or maybe the tweets contain sarcasm element that our VADER lexicon could not understand it. So we will just ignore all these tweets.

In [3]:
df_pos = df[df['polarity']==1].copy()
df_neg = df[df['polarity']==0].copy()

df_pos = df_pos[df_pos['compound'] > 0]
df_neg = df_neg[df_neg['compound'] < 0]

df = pd.concat([df_pos,df_neg], axis=0)

In [4]:
df = df.reset_index().drop('index',axis=1)

## Data preparation

In [5]:
X = df['tweet']
y = df['polarity']

# return score - for svc
def evaluate_model(model, X, y, metrics='accuracy'):
    
    tfidf = TfidfVectorizer(ngram_range=(1, 3), max_df=0.5, min_df=2)
    pipeline = Pipeline(steps=[('tfidf',tfidf),('model',model)])
    cv = KFold(10,shuffle=True,random_state=123)
    scores = cross_val_score(pipeline, X, y, cv=cv, scoring=metrics)
    
    return scores

# return probability
def evaluate_proba(model, X, y):
    
    tfidf = TfidfVectorizer(ngram_range=(1, 3), max_df=0.5, min_df=2)
    pipeline = Pipeline(steps=[('tfidf',tfidf),('model',model)])
    cv = KFold(10,shuffle=True,random_state=123)
    proba = cross_val_predict(pipeline, X, y, cv=cv, method='predict_proba')
    
    return proba

## First stage - Naive Bayes

In [6]:
model_1 = ComplementNB()
first_proba = evaluate_proba(model_1, X, y)

In [7]:
a = np.where(np.logical_and(first_proba[:,1] > first_proba[:,0], first_proba[:,1] < 0.95))[0].tolist()
b = np.where(np.logical_and(first_proba[:,0] > first_proba[:,1], first_proba[:,0] < 0.95))[0].tolist()

a.extend(b)
a = np.array(a)
a = a.tolist()

first_success = np.delete(first_proba, a, 0)
y_success = np.delete(np.array(y),a)
first_success = np.where(first_success[:,1] > 0.5, 1, 0)

We set 0.95 as our threshold. Any data points with probability less than this threshold will be forwarded to the second stage.

## Second stage - Support Vector Machine

In [8]:
model_2 = LinearSVC(random_state=123,
                    penalty='l1',
                    dual=False,
                    C=0.45)

second_score = evaluate_model(model_2, X[a], y[a])

In [9]:
h, i = np.diag(confusion_matrix(y_success,first_success))
second_score_mean = np.mean(second_score) * len(X[a])

overall_score = (h + i + second_score_mean) / len(df)
print(f'Accuracy score: {overall_score*100:.3f} %')

Accuracy score: 97.473 %
