In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

%matplotlib inline


# Fixed seed passed to every stochastic step below so that runs are reproducible
SEED = 1337

# Raw tweets dataset; the relative path is resolved against the working directory
TWEETS_CSV = 'Tweets.csv'
df = pd.read_csv(TWEETS_CSV)

In [2]:
# Peek at the first five rows to check which columns were loaded
df.head(n=5)

Unnamed: 0,tweet_id,airline_sentiment,airline,retweet_count,text
0,570306133677760513,neutral,Virgin America,0,@VirginAmerica What @dhepburn said.
1,570301130888122368,positive,Virgin America,0,@VirginAmerica plus you've added commercials t...
2,570301083672813571,neutral,Virgin America,0,@VirginAmerica I didn't today... Must mean I n...
3,570301031407624196,negative,Virgin America,0,@VirginAmerica it's really aggressive to blast...
4,570300817074462722,negative,Virgin America,0,@VirginAmerica and it's a really big bad thing...


In [3]:
# Sentiment is ordinal (negative < neutral < positive), so encode it with
# integers that preserve that order.
SENTIMENT_CODES = {'negative': 0, 'neutral': 1, 'positive': 2}

# Values that are already encoded map to themselves, so re-running this
# assignment leaves the column unchanged.
# The explicit int cast matters: assigning ints into a string column can leave
# it as dtype=object (this depends on the pandas version), and scikit-learn
# rejects object-dtype integer labels as an unknown label type.
# Any unexpected label makes the cast raise instead of slipping through silently.
df['airline_sentiment'] = (
    df['airline_sentiment']
    .map(lambda label: SENTIMENT_CODES.get(label, label))
    .astype(int)
)


# Encode airline as a categorical variable (integer codes assigned by LabelEncoder).
# NOTE: re-running this cell re-fits the encoder on the integer codes, so
# airline_le.classes_ would then no longer hold the original airline names.
airline_le = LabelEncoder()
df['airline'] = airline_le.fit_transform(df.airline)
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline,retweet_count,text
0,570306133677760513,1,5,0,@VirginAmerica What @dhepburn said.
1,570301130888122368,2,5,0,@VirginAmerica plus you've added commercials t...
2,570301083672813571,1,5,0,@VirginAmerica I didn't today... Must mean I n...
3,570301031407624196,0,5,0,@VirginAmerica it's really aggressive to blast...
4,570300817074462722,0,5,0,@VirginAmerica and it's a really big bad thing...


In [4]:
# Target vector: the sentiment column as a NumPy array
y = df['airline_sentiment'].values

# Hold out 25% of the rows for testing; stratifying on y keeps the class
# proportions roughly equal in both parts
split_options = dict(test_size=0.25, stratify=y, random_state=SEED, shuffle=True)
df_train, df_test, y_train, y_test = train_test_split(df, y, **split_options)

# Report how many rows ended up in each part
for part_name, part in (('train', df_train), ('test', df_test)):
    print(part_name, len(part))

train 10980
test 3660


In [5]:
%%time
# Zalina's baseline:
# character n-gram counts fed into a cross-validated logistic regression
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_extraction.text import CountVectorizer
# TfidfVectorizer and stopwords are not used in this cell -- presumably needed later; verify
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords


# Bag of character 1- to 3-grams
char_counts = CountVectorizer(analyzer='char', ngram_range=(1, 3))

# Multinomial logistic regression; the regularisation strength is chosen from
# 10 candidate values by 5-fold cross-validation on macro-averaged F1
logreg_cv = LogisticRegressionCV(
    Cs=10,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1,
    multi_class='multinomial',
    random_state=SEED,
)

model = Pipeline([('count', char_counts), ('est', logreg_cv)])
model.fit(df_train.text, y_train)

# Macro-F1 on the data the model was fitted on, then on the held-out split
for split_name, texts, labels in (('train', df_train.text, y_train),
                                  ('test', df_test.text, y_test)):
    print(split_name, metrics.f1_score(labels, model.predict(texts), average='macro'))

train 0.889381832369
test 0.747390886768
Wall time: 6min 14s
