In [1]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

In [22]:
from utils import tokenizer

In [4]:
train_df = pd.read_csv('training.csv', header=0)
val_df = pd.read_csv('valid.csv', header=0)

In [5]:
index_map = {
    'Positive': 2,
    'Negative': 0,
    'Neutral': 1,
    'Irrelevant': 1
}

def get_features_from_df(df):
    X = df.iloc[:, 3].astype('U')
    y = [index_map[value] for value in df.iloc[:, 2]]
    return X, y

In [24]:
train_X, train_y = get_features_from_df(train_df)
eval_X, eval_y = get_features_from_df(val_df)

val_X, test_X, val_y, test_y = train_test_split(eval_X, eval_y, test_size=0.5)

In [83]:
train_X

0        I am coming to the borders and I will kill you...
1        im getting on borderlands and i will kill you ...
2        im coming on borderlands and i will murder you...
3        im getting on borderlands 2 and i will murder ...
4        im getting into borderlands and i can murder y...
                               ...                        
74676    Just realized that the Windows partition of my...
74677    Just realized that my Mac window partition is ...
74678    Just realized the windows partition of my Mac ...
74679    Just realized between the windows partition of...
74680    Just like the windows partition of my Mac is l...
Name: im getting on borderlands and i will murder you all ,, Length: 74681, dtype: object

In [25]:
vectorizer = TfidfVectorizer(tokenizer=tokenizer, strip_accents='unicode')
features = vectorizer.fit_transform(train_X)

In [8]:
train_X = features
val_X = vectorizer.transform(val_X)

In [9]:
lr = LogisticRegression(max_iter=1000)
lr.fit(train_X, train_y)
pred = lr.predict(val_X)
metrics.accuracy_score(pred, val_y)

0.8677354709418837

In [30]:
lr.predict(vectorizer.transform(['you suck, jerk']))

array([0])

In [26]:
import pickle

with open('vectorizer.pk', 'wb') as f:
 pickle.dump(vectorizer, f)
with open('classifier.pk', 'wb') as f:
 pickle.dump(lr, f)