## Import libraries

In [1]:
# data manipulation
import pandas as pd
# Show at most 50 rows / 50 columns when a DataFrame is displayed.
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)

# linear algebra
import numpy as np

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
# Default figure size (width, height) and seaborn theme for every plot in the notebook.
plt.rcParams['figure.figsize'] = 12,6
sns.set_style('darkgrid')
%matplotlib inline

# machine learning
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, KFold, StratifiedKFold
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# NLP
import re
import nltk
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# NOTE(review): wildcard import; it hides which names come from nltk.stem.porter.
from nltk.stem.porter import *
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
# Load the "en_core_web_md" spaCy pipeline with its 'parser' and 'ner' components disabled.
nlp = spacy.load("en_core_web_md", disable=['parser', 'ner'])

# progress bar
from tqdm import tqdm, tqdm_notebook
# tqdm_notebook is imported a second time here; this later binding is the one used below.
from tqdm.notebook import tqdm_notebook
# Register tqdm's pandas integration (the progress_* methods on pandas objects).
tqdm_notebook.pandas()

## Import dataset

In [2]:
# Read the training tweets (presumably already lemmatised, going by the file name)
# and preview the first five rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like an index that
# was written to the CSV; consider whether it should be read with index_col=0.
df = pd.read_csv(filepath_or_buffer='train_lemma.csv')
df.head(n=5)

Unnamed: 0,compound,tweet,polarity
0,-0.3818,aww that be bummer you shoulda get david carr ...,0
1,-0.75,be upset that he can not update his facebook b...,0
2,0.4939,dive many time for the ball manage to save the...,0
3,-0.25,my whole body feel itchy and like its on fire,0
4,-0.6597,no it be not behave at all be mad why be here ...,0


## A little bit of data cleaning

In [3]:
# Split the tweets by label: polarity 1 -> df_pos, polarity 0 -> df_neg.
# Each subset is an independent copy, so later changes to it never touch `df`.
df_pos, df_neg = (df.loc[df['polarity'].eq(label)].copy() for label in (1, 0))

In [5]:
# Keep only the rows whose `compound` score has the same sign as the label:
# strictly positive for df_pos, strictly negative for df_neg.
# Rows with compound == 0 are dropped from both subsets.
# (`compound` is presumably a VADER sentiment score - confirm against the preprocessing step.)
df_pos = df_pos.loc[df_pos['compound'].gt(0)]
df_neg = df_neg.loc[df_neg['compound'].lt(0)]

In [6]:
# Stack the two filtered subsets row-wise: all df_pos rows first, then all df_neg rows.
# The original row labels are kept at this point.
df = pd.concat(objs=(df_pos, df_neg), axis='index')

In [7]:
# Renumber the rows 0..n-1 and discard the old index, so that row labels
# coincide with row positions in the cells that follow.
df = df.reset_index(drop=True)

## Cross validation

In [10]:
# Model input is the tweet text; the target is the polarity label.
X, y = df['tweet'], df['polarity']

def evaluate_model(model, X, y, metrics='accuracy'):
    """Cross-validate a TF-IDF + ``model`` pipeline and return the per-fold scores.

    The vectorizer lives inside the pipeline, so it is re-fitted on the training
    part of every fold. Used in this notebook for the LinearSVC stage.

    Parameters
    ----------
    model : estimator
        Classifier placed after the TF-IDF step.
    X : sequence of str
        Raw documents.
    y : array-like
        Target labels aligned with ``X``.
    metrics : str, default 'accuracy'
        Passed to ``cross_val_score`` as ``scoring``.

    Returns
    -------
    The array returned by ``cross_val_score``: one score per fold (10 folds).
    """
    # 10 shuffled folds with a fixed seed so repeated runs use the same splits.
    splitter = KFold(n_splits=10, shuffle=True, random_state=123)

    text_clf = Pipeline([
        # Uni- to tri-grams; drop terms found in more than half of the documents
        # or in fewer than two documents.
        ('tfidf', TfidfVectorizer(ngram_range=(1, 3), max_df=0.5, min_df=2)),
        ('model', model),
    ])

    return cross_val_score(text_clf, X, y, scoring=metrics, cv=splitter)

def evaluate_proba(model, X, y):
    """Return cross-validated class probabilities from a TF-IDF + ``model`` pipeline.

    Parameters
    ----------
    model : estimator
        Classifier placed after the TF-IDF step; it must provide ``predict_proba``,
        because that is the method requested from ``cross_val_predict``.
    X : sequence of str
        Raw documents.
    y : array-like
        Target labels aligned with ``X``.

    Returns
    -------
    The array returned by ``cross_val_predict(..., method='predict_proba')``:
    one row of class probabilities per document.
    """
    steps = [
        # Uni- to tri-grams; drop terms found in more than half of the documents
        # or in fewer than two documents.
        ('tfidf', TfidfVectorizer(ngram_range=(1, 3), max_df=0.5, min_df=2)),
        ('model', model),
    ]
    # 10 shuffled folds with a fixed seed so repeated runs use the same splits.
    folds = KFold(n_splits=10, shuffle=True, random_state=123)

    return cross_val_predict(Pipeline(steps=steps), X, y, cv=folds, method='predict_proba')

## First stage - Naive Bayes

In [11]:
# First-stage classifier: Complement Naive Bayes with default settings.
model_1 = ComplementNB()

# Cross-validated class probabilities for every tweet; later cells read
# column 0 and column 1 of this array.
first_proba = evaluate_proba(model=model_1, X=X, y=y)

In [12]:
# A first-stage prediction is trusted only if its top class probability reaches this value;
# everything below it is handed to the second stage.
CONFIDENCE_THRESHOLD = 0.95

# Top class probability of each row. Using the row maximum means an exact tie
# (e.g. 0.5 / 0.5) counts as low-confidence, instead of slipping into the trusted set
# where the `> 0.5` rule below would always score it as class 0.
top_proba = first_proba.max(axis=1)

# `a`: row positions (ascending) of the low-confidence tweets. Kept as a plain list
# because the following cells index X and y with it.
a = np.flatnonzero(top_proba < CONFIDENCE_THRESHOLD).tolist()

# Drop the low-confidence rows to obtain the trusted subset:
#   y_success     - true labels of the trusted tweets
#   first_success - first-stage predicted label (1 if P(class 1) > 0.5, else 0)
confident_proba = np.delete(first_proba, a, axis=0)
y_success = np.delete(np.array(y), a)
first_success = np.where(confident_proba[:, 1] > 0.5, 1, 0)

## Second stage - Support Vector Machine

In [13]:
# Second-stage classifier: linear SVM with an L1 penalty, evaluated only on the
# tweets the first stage was not confident about.
model_2 = LinearSVC(random_state=123,
                    penalty='l1',
                    dual=False,
                    C=0.45)

# `a` holds row positions (it is derived from row numbers of the probability array),
# so select with .iloc. Plain X[a] would treat the values as index labels and only
# happens to work while the index is exactly 0..n-1.
second_score = evaluate_model(model_2, X.iloc[a], y.iloc[a])

In [19]:
# Correct first-stage predictions per class (h: class 0, i: class 1), read off the
# confusion-matrix diagonal. labels=[0, 1] pins the matrix to 2x2 so the unpacking
# still works if one class is absent from the trusted subset.
h, i = np.diag(confusion_matrix(y_success, first_success, labels=[0, 1]))

# Despite its name this is a count, not a mean: mean fold accuracy times the number of
# second-stage tweets, i.e. the estimated number of correct second-stage predictions.
second_score_mean = np.mean(second_score) * len(a)

# Overall accuracy = correct predictions from both stages / all tweets.
overall_score = (h + i + second_score_mean) / len(df)
print(f'Accuracy score: {overall_score*100:.3f} %')

Accuracy score: 97.012 %
