In [15]:
import matplotlib
import seaborn
%matplotlib inline

import numpy as np
import pandas as pd

import tensorflow
import keras
import nltk
import gensim
from gensim.models import word2vec, doc2vec

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

In [2]:
def read_data(filename, header=0, sep='\t'):
    return pd.read_csv(filename, header=header, sep=sep)

In [4]:
train = read_data('data/train.data')
test = read_data('data/test.data')

In [5]:
print(train.shape, test.shape)

(102544, 3) (34194, 2)


# Example 1

In [7]:
vectorizer = TfidfVectorizer(analyzer='char',
                             ngram_range=(2, 6),
                             min_df=50,
                             max_df=0.7,
                             max_features=40000,
                             stop_words='english',
                             lowercase=True)
vectorizer.fit(pd.concat([train.Text, test.Text]))

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.7, max_features=40000, min_df=50,
        ngram_range=(2, 6), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [8]:
train_features = vectorizer.transform(train.Text)
test_features = vectorizer.transform(test.Text)

In [11]:
X_train, X_valid, y_train, y_valid = train_test_split(train_features, train.Sentiment.values, test_size=0.1)
X_test = test_features

In [16]:
sgd_est = SGDClassifier(loss='modified_huber', penalty='l2', n_iter=7, n_jobs=10)
sgd_est.fit(X_train, y_train)
print('Train: ', sgd_est.score(X_train, y_train))
print('Valid: ', sgd_est.score(X_valid, y_valid))

Train:  0.690201432457
Valid:  0.588980984885


In [None]:
svm_est = SVC()
svm_est.fit(X_train, y_train)
print('Train: ', svm_est.score(X_train, y_train))
print('Valid: ', svm_est.score(X_valid, y_valid))

In [40]:
pred_test = clf.predict(X_test)
prediction = pd.DataFrame(data={'Id': test.Id, 'Sentiment': pred_test}, index=test.index)
prediction.to_csv('data/prediction.csv', index=False)
print(prediction.head())

   Id  Sentiment
0   0          5
1   1          2
2   2          5
3   3          5
4   4          5
