### Peer-graded Assignment: Соревнование по сентимент-анализу

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV 
from sklearn.grid_search import RandomizedSearchCV 

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import StratifiedKFold

import numpy as np
import nltk



In [2]:
data_train = pd.read_table('../data/products_sentiment_train.tsv', header=None, index_col=False)
data_test = pd.read_table('../data/products_sentiment_test.tsv')

In [3]:
# Даем название колонкам

data_train['text'] = data_train[0]
data_train['label'] = data_train[1]
del data_train[0]
del data_train[1]

#### Осмотрим данные

In [4]:
data_test.head()

Unnamed: 0,Id,text
0,0,"so , why the small digital elph , rather than ..."
1,1,3/4 way through the first disk we played on it...
2,2,better for the zen micro is outlook compatibil...
3,3,6 . play gameboy color games on it with goboy .
4,4,"likewise , i 've heard norton 2004 professiona..."


In [5]:
data_test.describe()

Unnamed: 0,Id
count,500.0
mean,249.5
std,144.481833
min,0.0
25%,124.75
50%,249.5
75%,374.25
max,499.0


In [6]:
data_train.head()

Unnamed: 0,text,label
0,"2 . take around 10,000 640x480 pictures .",1
1,i downloaded a trial version of computer assoc...,1
2,the wrt54g plus the hga7t is a perfect solutio...,1
3,i dont especially like how music files are uns...,0
4,i was using the cheapie pail ... and it worked...,1


In [7]:
data_train.describe()

Unnamed: 0,label
count,2000.0
mean,0.637
std,0.480985
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [8]:
data_train.label.value_counts()

1    1274
0     726
Name: label, dtype: int64

#### Подбор лучшей модели
С помощью Pipiline и GridSearchCV переберем модели и выберем лучшую. Чтобы избежать переобучения воспользуемся StratifiedKFold.

##### Попробуем 4 метода:
LogisticRegression

SGDClassifier

LinearSVC

MultinomialNB

##### и преобразование для текста: TfidfVectorizer

In [9]:
X = data_train['text']
y = data_train['label']
cv = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=1)

In [15]:
def lemmatization_data(data):
    wnl = nltk.WordNetLemmatizer()
    tokens_list = [nltk.word_tokenize(raw) for raw in data]
    for tokens in tokens_list:
        tokens = [wnl.lemmatize(t) for t in tokens]
        
    data_processing = [' '.join(x) for x in tokens_list]
    return data_processing

#### Пробуем LogisticRegression

In [16]:
%%time
pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer()), ("classifier", LogisticRegression())
])
pipeline_params = {
        'vectorizer__stop_words': ['english', None],
        'vectorizer__ngram_range': [(1, 2), (1, 3), (2, 3), (3, 5), (4, 5), (2, 5)],
        'vectorizer__analyzer': ['word', 'char_wb'],
        'vectorizer__norm': ['l1', 'l2', None],
        'vectorizer__use_idf': (True, False),
        'vectorizer__max_df': [0.25, 0.5, 0.75, 1.0],
        'vectorizer__max_features': [10, 50, 100, 250, 500, 1000, None],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__C': [0.1, 0.5, 1, 5, 10, 50, 100]
    }

#grid = GridSearchCV(pipeline, pipeline_params, cv=cv, refit=True, verbose=1, n_jobs=4)
grid = RandomizedSearchCV(pipeline, pipeline_params, cv=cv, refit=True, verbose=1, n_jobs=4, n_iter=60)
grid.fit(lemmatization_data(X), y)
best = grid.best_estimator_
print(
    "Accuracy (TfidfVectorizer + LogisticRegression): {}, params {}" . format(grid.best_score_, grid.best_params_)
)

Fitting 10 folds for each of 60 candidates, totalling 600 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    8.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   48.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  1.8min
[Parallel(n_jobs=4)]: Done 600 out of 600 | elapsed:  2.3min finished


Accuracy (TfidfVectorizer + LogisticRegression): 0.781, params {'vectorizer__ngram_range': (4, 5), 'vectorizer__max_features': None, 'vectorizer__use_idf': True, 'vectorizer__norm': 'l2', 'vectorizer__max_df': 1.0, 'vectorizer__analyzer': 'char_wb', 'vectorizer__stop_words': None, 'classifier__C': 10, 'classifier__penalty': 'l2'}
CPU times: user 13.9 s, sys: 1.69 s, total: 15.5 s
Wall time: 2min 20s


#### Пробуем SGDClassifier

In [18]:
%%time
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', SGDClassifier())
])
pipeline_params = {
        'vectorizer__ngram_range': [(1, 2), (1, 3), (2, 3), (3, 5), (4, 5), (2, 5)],
        'vectorizer__analyzer': ['word', 'char_wb'],
        'vectorizer__norm': ['l1', 'l2', None],
        'vectorizer__use_idf': (True, False),
        'vectorizer__max_df': [0.25, 0.5, 0.75, 1.0],
        'vectorizer__max_features': [10, 50, 100, 250, 500, 1000, None],
        "classifier__penalty": ['l1', 'l2', 'elasticnet'],
        "classifier__loss": ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
        'classifier__alpha': (0.001, 0.0001, 0.00001, 0.000001)
    }

#grid = GridSearchCV(pipeline, pipeline_params, cv=cv, refit=True, verbose=1, n_jobs=4)
grid = RandomizedSearchCV(pipeline, pipeline_params, cv=cv, refit=True, verbose=1, n_jobs=4, n_iter=6000)
grid.fit(X, y)
best = grid.best_estimator_
print(
    "Accuracy (TfidfVectorizer + SGDClassifier): {}, params {}".format(grid.best_score_, grid.best_params_)
)

Fitting 10 folds for each of 6000 candidates, totalling 60000 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.6s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   13.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   29.8s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:   54.8s
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:  2.2min
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed:  3.0min
[Parallel(n_jobs=4)]: Done 3192 tasks      | elapsed:  3.9min
[Parallel(n_jobs=4)]: Done 4042 tasks      | elapsed:  5.0min
[Parallel(n_jobs=4)]: Done 4992 tasks      | elapsed:  6.1min
[Parallel(n_jobs=4)]: Done 6042 tasks      | elapsed:  7.4min
[Parallel(n_jobs=4)]: Done 7192 tasks      | elapsed:  8.8min
[Parallel(n_jobs=4)]: Done 8442 tasks      | elapsed: 10.4min
[Parallel(n_jobs=4)]: Done 9792 tasks      | elapsed: 12.0min
[Parallel(n_jobs=4)]: Done 11242 tasks      | elapsed: 14.1min
[Parallel(n_jobs=4)]: Done 12792 tasks      | elapsed: 16.0min
[Parallel(

Accuracy (TfidfVectorizer + SGDClassifier): 0.79 with params {'vectorizer__ngram_range': (2, 5), 'sgd__alpha': 0.001, 'vectorizer__max_features': None, 'vectorizer__use_idf': True, 'sgd__penalty': 'l2', 'vectorizer__analyzer': 'char_wb', 'vectorizer__max_df': 1.0, 'vectorizer__norm': 'l2', 'sgd__loss': 'hinge'}


#### Пробуем LinearSVC

In [11]:
%%time
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LinearSVC())
])
pipeline_params = {
        'vectorizer__ngram_range': [(1, 2), (1, 3), (2, 3), (3, 5), (4, 5), (2, 5)],
        'vectorizer__analyzer': ['word', 'char_wb'],
        'vectorizer__norm': ['l1', 'l2', None],
        'vectorizer__use_idf': (True, False),
        'vectorizer__max_df': [0.25, 0.5, 0.75, 1.0],
        'vectorizer__max_features': [10, 50, 100, 250, 500, 1000, None],
        "classifier__penalty": ['l2'],
        "classifier__loss": ['hinge'],
        'classifier__C': [0.1, 0.5, 1, 5, 10, 50, 100],
        'classifier__multi_class': ['ovr', 'crammer_singer'],
    }

#grid = GridSearchCV(pipeline, pipeline_params, cv=cv, refit=True, verbose=1, n_jobs=4)
grid = RandomizedSearchCV(pipeline, pipeline_params, cv=cv, refit=True, verbose=1, n_jobs=4, n_iter=600)
grid.fit(X, y)
best = grid.best_estimator_
print(
    "Accuracy (TfidfVectorizer + LinearSVC): {}, params {}" . format(grid.best_score_, grid.best_params_)
)

Fitting 10 folds for each of 600 candidates, totalling 6000 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   25.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  1.0min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  6.3min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  8.4min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed: 18.1min
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed: 26.6min
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed: 37.8min
[Parallel(n_jobs=4)]: Done 3192 tasks      | elapsed: 57.4min
[Parallel(n_jobs=4)]: Done 4042 tasks      | elapsed: 69.2min
[Parallel(n_jobs=4)]: Done 4992 tasks      | elapsed: 86.6min
[Parallel(n_jobs=4)]: Done 6000 out of 6000 | elapsed: 105.0min finished


Accuracy (TfidfVectorizer + LinearSVC): 0.789, params {'vectorizer__ngram_range': (3, 5), 'vectorizer__max_features': None, 'vectorizer__use_idf': True, 'classifier__loss': 'hinge', 'vectorizer__analyzer': 'char_wb', 'vectorizer__max_df': 1.0, 'vectorizer__norm': 'l2', 'classifier__C': 0.5, 'classifier__multi_class': 'ovr', 'classifier__penalty': 'l2'}
CPU times: user 1min 16s, sys: 12.8 s, total: 1min 28s
Wall time: 1h 45min


#### Пробуем MultinomialNB

In [12]:
%%time
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB())
])
pipeline_params = {
        'vectorizer__ngram_range': [(1, 2), (1, 3), (2, 3), (3, 5), (4, 5), (2, 5)],
        'vectorizer__analyzer': ['word', 'char_wb'],
        'vectorizer__norm': ['l1', 'l2', None],
        'vectorizer__use_idf': (True, False),
        'vectorizer__max_df': [0.25, 0.5, 0.75, 1.0],
        'vectorizer__max_features': [10, 50, 100, 250, 500, 1000, None],
        "classifier__alpha": [0.1, 0.5, 1.0],
        "classifier__fit_prior": [True, False]
    }

#grid = GridSearchCV(pipeline, pipeline_params, cv=cv, refit=True, verbose=1, n_jobs=4)
grid = RandomizedSearchCV(pipeline, pipeline_params, cv=cv, refit=True, verbose=1, n_jobs=4, n_iter=6000)
grid.fit(X, y)
best = grid.best_estimator_
print(
    "Accuracy (TfidfVectorizer + MultinomialNB): {}, params {}" . format(grid.best_score_, grid.best_params_)
)

Fitting 10 folds for each of 6000 candidates, totalling 60000 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.5s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   16.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   35.0s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  1.9min
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:  2.9min
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed:  3.9min
[Parallel(n_jobs=4)]: Done 3192 tasks      | elapsed:  4.9min
[Parallel(n_jobs=4)]: Done 4042 tasks      | elapsed:  6.0min
[Parallel(n_jobs=4)]: Done 4992 tasks      | elapsed:  7.1min
[Parallel(n_jobs=4)]: Done 6042 tasks      | elapsed:  8.4min
[Parallel(n_jobs=4)]: Done 7192 tasks      | elapsed:  9.7min
[Parallel(n_jobs=4)]: Done 8442 tasks      | elapsed: 11.2min
[Parallel(n_jobs=4)]: Done 9792 tasks      | elapsed: 12.8min
[Parallel(n_jobs=4)]: Done 11242 tasks      | elapsed: 14.5min
[Parallel(n_jobs=4)]: Done 12792 tasks      | elapsed: 16.3min
[Parallel(

Accuracy (TfidfVectorizer + MultinomialNB): 0.799, params {'vectorizer__ngram_range': (3, 5), 'classifier__alpha': 0.1, 'vectorizer__max_features': None, 'vectorizer__use_idf': False, 'vectorizer__analyzer': 'char_wb', 'vectorizer__max_df': 1.0, 'vectorizer__norm': 'l2', 'classifier__fit_prior': False}
CPU times: user 4min 28s, sys: 6.66 s, total: 4min 34s
Wall time: 1h 12min 57s


Наилучший результат показал MultinomialNB. С ним дальше и будем работать.

In [None]:
#nltk.download()

In [17]:
def lemmatization_data(data):
    wnl = nltk.WordNetLemmatizer()
    tokens_list = [nltk.word_tokenize(raw) for raw in data]
    for tokens in tokens_list:
        tokens = [wnl.lemmatize(t) for t in tokens]
        
    data_processing = [' '.join(x) for x in tokens_list]
    return data_processing

In [24]:
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3), max_df=1.0, analyzer='word', use_idf=True,
    norm='l2'
)
X_vect = vectorizer.fit_transform(lemmatization_data(X))

feature_array = np.array(vectorizer.get_feature_names())
tfidf_sorting = np.argsort(X_vect.toarray()).flatten()[::-1]
n = 10
top_n = feature_array[tfidf_sorting][:n]

#Выведем топ 10 слов
print top_n


classifier = MultinomialNB(alpha=0.1, fit_prior=False)

X_test = vectorizer.transform(lemmatization_data(data_test.text))
predicts = classifier.fit(X_vect, y).predict(X_test)

[u'the instructions' u'still hard' u'reading some' u'reading some of'
 u'hard to figure' u'even after reading' u'the instructions it'
 u'instructions it still' u'of the instructions' u'still hard to']


In [25]:
submission = pd.DataFrame()
submission['Id'] = data_test.Id
submission['y'] = predicts
submission.head()

Unnamed: 0,Id,y
0,0,1
1,1,0
2,2,1
3,3,1
4,4,0


In [26]:
submission.to_csv('../data/submission.csv', index = False)

#### На kaggle получили 0.78750. Хотелось бы узнать как значительно улучшить результ