In [65]:
import matplotlib
import seaborn
%matplotlib inline

import numpy as np
import pandas as pd

import tensorflow
import keras
import nltk
import gensim
from gensim.models import word2vec, doc2vec

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

In [2]:
def read_data(filename, header=0, sep='\t'):
    return pd.read_csv(filename, header=header, sep=sep)

In [3]:
train = read_data('data/train.data')
test = read_data('data/test.data')
test.head()

Unnamed: 0,Id,Text
0,0,One of our favorite Mexican restaurants. My wi...
1,1,After seeing a picture of the Spanish Mackerel...
2,2,This has been the best restaurant experience (...
3,3,Elements is great! The ambiance of Elements b...
4,4,"ok, so i went back here and had a better exper..."


In [4]:
print(train.shape, test.shape)

(102544, 3) (34194, 2)


# Example 1

In [76]:
vectorizer = TfidfVectorizer(analyzer='char',
                             ngram_range=(1, 5),
                             min_df=50,
                             max_df=0.7,
                             stop_words='english',
                             lowercase=True)

In [77]:
vectorizer.fit(pd.concat([train.Text, test.Text]))

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.7, max_features=None, min_df=50,
        ngram_range=(1, 5), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [78]:
train_features = vectorizer.transform(train.Text)
test_features = vectorizer.transform(test.Text)

In [79]:
X_train, y_train = train_features, train.Sentiment.values
X_test = test_features

In [80]:
est = SGDClassifier(loss='modified_huber')

In [81]:
print(np.mean(cross_val_score(est, X_train, y_train, cv=5)))

0.59317956053


In [82]:
X_train, X_valid, y_train, y_valid = train_test_split(train_features, train.Sentiment.values, test_size=0.3)
X_test = test_features

In [83]:
est.fit(X_train, y_train)
print('Train: ', est.score(X_train, y_train))
print('Valid: ', est.score(X_valid, y_valid))

Train:  0.780133741989
Valid:  0.587862436614


In [40]:
pred_test = clf.predict(X_test)
prediction = pd.DataFrame(data={'Id': test.Id, 'Sentiment': pred_test}, index=test.index)
prediction.to_csv('data/prediction.csv', index=False)
print(prediction.head())

   Id  Sentiment
0   0          5
1   1          2
2   2          5
3   3          5
4   4          5
