### Peer-graded Assignment: Соревнование по сентимент-анализу

In [13]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.grid_search import RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import StratifiedKFold

In [3]:
# Read the two tab-separated data files. The training file is read without a
# header row (columns get integer labels); the test file uses its first row
# as the header.
data_train = pd.read_csv(
    '../data/products_sentiment_train.tsv',
    sep='\t',
    header=None,
    index_col=False,
)
data_test = pd.read_csv('../data/products_sentiment_test.tsv', sep='\t')

In [4]:
# Give the training columns descriptive names.
#
# rename() returns a new frame and silently skips labels that are not present,
# so this cell can be re-run safely. The previous add-then-delete sequence
# raised KeyError on a second run because columns 0 and 1 were already gone.
data_train = data_train.rename(columns={0: 'text', 1: 'label'})

#### Осмотрим данные

In [5]:
# Show the first rows of the test set to check which columns it has.
data_test.head()

Unnamed: 0,Id,text
0,0,"so , why the small digital elph , rather than ..."
1,1,3/4 way through the first disk we played on it...
2,2,better for the zen micro is outlook compatibil...
3,3,6 . play gameboy color games on it with goboy .
4,4,"likewise , i 've heard norton 2004 professiona..."


In [6]:
# Summary statistics for the numeric columns of the test set.
data_test.describe()

Unnamed: 0,Id
count,500.0
mean,249.5
std,144.481833
min,0.0
25%,124.75
50%,249.5
75%,374.25
max,499.0


In [7]:
# Show the first rows of the training set (review text plus its label).
data_train.head()

Unnamed: 0,text,label
0,"2 . take around 10,000 640x480 pictures .",1
1,i downloaded a trial version of computer assoc...,1
2,the wrt54g plus the hga7t is a perfect solutio...,1
3,i dont especially like how music files are uns...,0
4,i was using the cheapie pail ... and it worked...,1


In [8]:
# Summary statistics for the numeric columns of the training set.
data_train.describe()

Unnamed: 0,label
count,2000.0
mean,0.637
std,0.480985
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [9]:
# Count how many training examples carry each label value (class balance).
data_train.label.value_counts()

1    1274
0     726
Name: label, dtype: int64

#### Подбор лучшей модели
С помощью Pipeline и GridSearchCV / RandomizedSearchCV переберем модели и выберем лучшую. Чтобы избежать переобучения, воспользуемся StratifiedKFold.

##### Попробуем следующие методы:
LogisticRegression

SGDClassifier

LinearSVC

MultinomialNB

RandomForestClassifier

MLPClassifier

SVC

##### и преобразование для текста: TfidfVectorizer

In [10]:
# Model inputs are the raw review texts; targets are the label column.
X = data_train['text']
y = data_train['label']
# Shuffled, stratified 10-fold splitter with a fixed seed. This is the legacy
# sklearn.cross_validation signature, which receives the labels at
# construction time.
cv = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=1)

In [16]:
%%time
pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer()), ("classifier", LogisticRegression())
])
pipeline_params = {
        'vectorizer__stop_words': ['english', None],
        'vectorizer__ngram_range': [(1, 2), (1, 3), (2, 3), (3, 5), (4, 5), (2, 5)],
        'vectorizer__analyzer': ['word', 'char_wb'],
        'vectorizer__norm': ['l1', 'l2', None],
        'vectorizer__use_idf': (True, False),
        'vectorizer__max_df': [0.25, 0.5, 0.75, 1.0],
        'vectorizer__max_features': [10, 50, 100, 250, 500, 1000, None],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__C': [0.1, 0.5, 1, 5, 10, 50, 100]
    }

#grid = GridSearchCV(pipeline, pipeline_params, cv=cv, refit=True, verbose=1, n_jobs=4)
grid = RandomizedSearchCV(pipeline, pipeline_params, cv=cv, refit=True, verbose=1, n_jobs=4, n_iter=6000)
grid.fit(X, y)
best = grid.best_estimator_
print(
    "Accuracy (TfidfVectorizer + LogisticRegression): {}, params {}" . format(grid.best_score_, grid.best_params_)
)

Fitting 10 folds for each of 6000 candidates, totalling 60000 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.7s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   16.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   34.2s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  1.0min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  1.6min
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:  2.3min
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed:  3.1min
[Parallel(n_jobs=4)]: Done 3192 tasks      | elapsed:  4.2min
[Parallel(n_jobs=4)]: Done 4042 tasks      | elapsed:  5.5min
[Parallel(n_jobs=4)]: Done 4992 tasks      | elapsed:  7.0min
[Parallel(n_jobs=4)]: Done 6042 tasks      | elapsed:  8.3min
[Parallel(n_jobs=4)]: Done 7192 tasks      | elapsed: 10.9min
[Parallel(n_jobs=4)]: Done 8442 tasks      | elapsed: 12.8min
[Parallel(n_jobs=4)]: Done 9792 tasks      | elapsed: 14.8min
[Parallel(n_jobs=4)]: Done 11242 tasks      | elapsed: 16.7min
[Parallel(n_jobs=4)]: Done 12792 tasks      | elapsed: 18.5min
[Parallel(

Accuracy (TfidfVectorizer + LogisticRegression): 0.7875, params {'vectorizer__ngram_range': (1, 3), 'vectorizer__max_features': None, 'vectorizer__use_idf': True, 'vectorizer__norm': 'l2', 'vectorizer__max_df': 1.0, 'vectorizer__analyzer': 'word', 'vectorizer__stop_words': None, 'classifier__C': 100, 'classifier__penalty': 'l2'}
CPU times: user 4min 49s, sys: 8.71 s, total: 4min 57s
Wall time: 1h 40min 6s


In [18]:
# Randomized hyper-parameter search for a TF-IDF + SGDClassifier pipeline,
# scored with shuffled, stratified 15-fold cross-validation.
cv = StratifiedKFold(y, n_folds=15, shuffle=True, random_state=1)

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    # Seeded because SGD training involves random shuffling; without a seed
    # the same candidate can score differently from run to run.
    ('sgd', SGDClassifier(random_state=1)),
])
pipeline_params = {
    'tfidf__ngram_range': [(1, 3), (1, 4)],
    'tfidf__use_idf': (True, False),
    'tfidf__max_df': [0.25, 0.5, 0.75, 1.0],
    'tfidf__max_features': [10, 50, 100, 250, 500, 1000, None],
    'tfidf__smooth_idf': (True, False),
    'tfidf__norm': ('l1', 'l2', None),
    'sgd__penalty': ['l1', 'l2', 'elasticnet'],
    'sgd__loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
    'sgd__alpha': (0.00001, 0.000001),
}

# 20,160 combinations in the grid; sample n_iter of them. random_state fixes
# the sample so the reported best score and parameters are reproducible.
grid = RandomizedSearchCV(
    pipeline, pipeline_params, cv=cv, refit=True, verbose=1, n_jobs=4,
    n_iter=600, random_state=1,
)
grid.fit(X, y)
# refit=True: best_estimator_ is the winning pipeline retrained on all of X, y.
best = grid.best_estimator_
print("Accuracy (TfidfVectorizer + SGDClassifier): {} with params {}"
      .format(grid.best_score_, grid.best_params_))

Fitting 15 folds for each of 600 candidates, totalling 9000 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   11.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   30.9s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:   52.2s
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:  1.9min
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed:  2.6min
[Parallel(n_jobs=4)]: Done 3192 tasks      | elapsed:  3.5min
[Parallel(n_jobs=4)]: Done 4042 tasks      | elapsed:  4.4min
[Parallel(n_jobs=4)]: Done 4992 tasks      | elapsed:  5.5min
[Parallel(n_jobs=4)]: Done 6042 tasks      | elapsed:  6.6min
[Parallel(n_jobs=4)]: Done 7192 tasks      | elapsed:  7.9min
[Parallel(n_jobs=4)]: Done 8442 tasks      | elapsed:  9.3min
[Parallel(n_jobs=4)]: Done 9000 out of 9000 | elapsed:  9.9min finished


Accuracy (TfidfVectorizer + SGDClassifier): 0.7845 with params {'tfidf__smooth_idf': True, 'sgd__alpha': 1e-06, 'tfidf__ngram_range': (1, 4), 'sgd__penalty': 'elasticnet', 'tfidf__max_features': None, 'tfidf__stop_words': None, 'tfidf__max_df': 1.0, 'tfidf__use_idf': True, 'tfidf__norm': 'l1', 'sgd__loss': 'hinge'}


In [23]:
# Randomized hyper-parameter search for a TF-IDF + RandomForestClassifier
# pipeline, scored with shuffled, stratified 10-fold cross-validation.
cv = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=1)

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier(random_state=1)),
])
pipeline_params = {
    'tfidf__ngram_range': [(1, 3)],
    'tfidf__use_idf': (True, False),
    'tfidf__max_df': [0.25, 0.5, 0.75, 1.0],
    'tfidf__max_features': [10, 50, 100, 250, 500, 1000, None],
    'tfidf__smooth_idf': (True, False),
    'tfidf__norm': ('l1', 'l2', None),
    'rf__n_estimators': [1000, 1500],
    'rf__min_samples_split': [6, 8, 10],
    'rf__min_samples_leaf': [1, 2, 4],
}

# Only 30 of the 6,048 combinations are tried because each forest is slow to
# fit. random_state fixes which 30, so a re-run evaluates the same candidates.
grid = RandomizedSearchCV(
    pipeline, pipeline_params, cv=cv, refit=True, verbose=1, n_jobs=4,
    n_iter=30, random_state=1,
)
grid.fit(X, y)
# refit=True: best_estimator_ is the winning pipeline retrained on all of X, y.
best = grid.best_estimator_
# The label previously said "SGDClassifier" (copy-paste from the cell above).
print("Accuracy (TfidfVectorizer + RandomForestClassifier): {} with params {}"
      .format(grid.best_score_, grid.best_params_))

Fitting 10 folds for each of 30 candidates, totalling 300 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  6.9min
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed: 10.6min finished


Accuracy (TfidfVectorizer + SGDClassifier): 0.7675 with params {'tfidf__smooth_idf': True, 'rf__n_estimators': 1000, 'tfidf__ngram_range': (1, 3), 'tfidf__max_features': 1000, 'tfidf__max_df': 0.25, 'rf__min_samples_split': 6, 'tfidf__use_idf': False, 'rf__min_samples_leaf': 1, 'tfidf__norm': None}


In [16]:
%%time
cv = StratifiedKFold(y, n_folds=5, shuffle=True, random_state=1)

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('mlp', MLPClassifier(hidden_layer_sizes=(10,)))
])
pipeline_params = {
        'tfidf__ngram_range': [(1, 3)],
        #'tfidf__use_idf': (True, False),
        #'tfidf__max_df': [0.25, 0.5, 0.75, 1.0],
        #'tfidf__max_features': [10, 50, 100, 250, 500, 1000, None],
        #'tfidf__smooth_idf': (True, False),
        #'tfidf__norm': ('l1', 'l2', None),
    }

grid = GridSearchCV(pipeline, pipeline_params, cv=cv, refit=True, verbose=1, n_jobs=4)
#grid = RandomizedSearchCV(pipeline, pipeline_params, cv=cv, refit=True, verbose=1, n_jobs=4, n_iter=1)
grid.fit(X, y)
best = grid.best_estimator_
print(
    "Accuracy (TfidfVectorizer + MLPClassifier): {} with params {}".format(grid.best_score_, grid.best_params_)
)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   48.5s finished


Accuracy (TfidfVectorizer + MLPClassifier): 0.768 with params {'tfidf__ngram_range': (1, 3)}
CPU times: user 13.6 s, sys: 800 ms, total: 14.4 s
Wall time: 1min 2s


In [23]:
%%time
cv = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=1)

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svc', SVC())
])
pipeline_params = {
        'tfidf__ngram_range': [(1, 3)],
        'tfidf__use_idf': (True, False),
        'tfidf__max_df': [0.25, 0.5, 0.75, 1.0],
        'tfidf__max_features': [10, 50, 100, 250, 500, 1000, None],
        'tfidf__smooth_idf': (True, False),
        'tfidf__norm': ('l1', 'l2', None),
    }

grid = GridSearchCV(pipeline, pipeline_params, cv=cv, refit=True, verbose=1, n_jobs=4)
#grid = RandomizedSearchCV(pipeline, pipeline_params, cv=cv, refit=True, verbose=1, n_jobs=4, n_iter=1)
grid.fit(X, y)
best = grid.best_estimator_
print(
    "Accuracy (TfidfVectorizer + MLPClassifier): {} with params {}".format(grid.best_score_, grid.best_params_)
)

Fitting 5 folds for each of 336 candidates, totalling 1680 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.7s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   17.9s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   50.6s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  2.4min
[Parallel(n_jobs=4)]: Done 1680 out of 1680 | elapsed:  3.4min finished


Accuracy (TfidfVectorizer + MLPClassifier): 0.743 with params {'tfidf__smooth_idf': False, 'tfidf__ngram_range': (1, 3), 'tfidf__max_features': 500, 'tfidf__max_df': 0.25, 'tfidf__use_idf': True, 'tfidf__norm': None}
CPU times: user 8.58 s, sys: 288 ms, total: 8.86 s
Wall time: 3min 24s


In [24]:
%%time
from sklearn.naive_bayes import MultinomialNB
cv = StratifiedKFold(y, n_folds=5, shuffle=True, random_state=1)

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svc', MultinomialNB())
])
pipeline_params = {
        'tfidf__ngram_range': [(1, 3)],
        'tfidf__use_idf': (True, False),
        'tfidf__max_df': [0.25, 0.5, 0.75, 1.0],
        'tfidf__max_features': [10, 50, 100, 250, 500, 1000, None],
        'tfidf__smooth_idf': (True, False),
        'tfidf__norm': ('l1', 'l2', None),
    }

grid = GridSearchCV(pipeline, pipeline_params, cv=cv, refit=True, verbose=1, n_jobs=4)
#grid = RandomizedSearchCV(pipeline, pipeline_params, cv=cv, refit=True, verbose=1, n_jobs=4, n_iter=1)
grid.fit(X, y)
best = grid.best_estimator_
print(
    "Accuracy (TfidfVectorizer + MLPClassifier): {} with params {}".format(grid.best_score_, grid.best_params_)
)

Fitting 5 folds for each of 336 candidates, totalling 1680 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.8s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   12.3s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   28.6s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:   51.2s
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done 1680 out of 1680 | elapsed:  1.8min finished


Accuracy (TfidfVectorizer + MLPClassifier): 0.776 with params {'tfidf__smooth_idf': True, 'tfidf__ngram_range': (1, 3), 'tfidf__max_features': None, 'tfidf__max_df': 0.25, 'tfidf__use_idf': False, 'tfidf__norm': None}
CPU times: user 8.04 s, sys: 240 ms, total: 8.28 s
Wall time: 1min 47s
