In [1]:
import matplotlib
import seaborn
%matplotlib inline

import numpy as np
import pandas as pd

import tensorflow
import nltk
import gensim
from gensim.models import word2vec, doc2vec

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [2]:
def read_data(filename, header=0, sep='\t'):
    return pd.read_csv(filename, header=header, sep=sep)

In [3]:
train = read_data('data/train.data')
test = read_data('data/test.data')
test.head()

Unnamed: 0,Id,Text
0,0,One of our favorite Mexican restaurants. My wi...
1,1,After seeing a picture of the Spanish Mackerel...
2,2,This has been the best restaurant experience (...
3,3,Elements is great! The ambiance of Elements b...
4,4,"ok, so i went back here and had a better exper..."


In [4]:
print(train.shape, test.shape)

(102544, 3) (34194, 2)


In [5]:
train['Sentiment'].value_counts()

4    35432
5    30517
3    17340
2    10798
1     8457
Name: Sentiment, dtype: int64

In [6]:
X_train, X_valid, y_train, y_valid = train_test_split(train.Text, train.Sentiment, test_size=0.3)
X_test = test.Text

# Example 1

In [7]:
clf = GridSearchCV(Pipeline([('vect', CountVectorizer(ngram_range=(1, 4), 
                                                      min_df=50,
                                                      max_df=0.7,
                                                      stop_words='english',
                                                      lowercase=True)),
                             ('tfidf', TfidfTransformer(norm='l2')),
                             ('clf', SGDClassifier(penalty='l2',
                                                   n_iter=7))]), 
                   param_grid={'clf__loss': ['hinge', 
                                             'log', 
                                             'modified_huber', 
                                             'squared_hinge', 
                                             'perceptron']})

In [8]:
clf.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.7, max_features=None, min_df=50,
        ngram_range=(1, 4), preprocessor=None, stop_words='english',
       ...   penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'clf__loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [10]:
clf.best_estimator_

Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.7, max_features=None, min_df=50,
        ngram_range=(1, 4), preprocessor=None, stop_words='english',
       ...   penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False))])

In [9]:
print('Train: ', clf.score(X_train, y_train))
print('Valid: ', clf.score(X_valid, y_valid))

Train:  0.704346614656
Valid:  0.558282408009


In [40]:
pred_test = clf.predict(X_test)
prediction = pd.DataFrame(data={'Id': test.Id, 'Sentiment': pred_test}, index=test.index)
prediction.to_csv('data/prediction.csv', index=False)
print(prediction.head())

   Id  Sentiment
0   0          5
1   1          2
2   2          5
3   3          5
4   4          5
