In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV 
from sklearn.grid_search import RandomizedSearchCV
import xgboost as xgb
import numpy as np

In [3]:
data_train = pd.read_table('../data/products_sentiment_train.tsv', header=None, index_col=False)

In [4]:
data_train['text'] = data_train[0]
data_train['label'] = data_train[1]
del data_train[0]
del data_train[1]

In [6]:
data_train.head()

Unnamed: 0,text,label
0,"2 . take around 10,000 640x480 pictures .",1
1,i downloaded a trial version of computer assoc...,1
2,the wrt54g plus the hga7t is a perfect solutio...,1
3,i dont especially like how music files are uns...,0
4,i was using the cheapie pail ... and it worked...,1


In [8]:
data_train.describe()

Unnamed: 0,label
count,2000.0
mean,0.637
std,0.480985
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [10]:
data_train.label.value_counts()

1    1274
0     726
Name: label, dtype: int64

In [11]:
X = data_train['text']
y = data_train['label']

In [22]:
count_vectorizer = CountVectorizer(ngram_range=(1, 3))
X_vect = count_vectorizer.fit_transform(X).toarray()
estimator = LogisticRegression()

scores = cross_val_score(estimator=estimator, X=X_vect, y=y, cv=20)
print round(scores.mean(), 4)

0.771


In [19]:
cv = StratifiedKFold(y, n_folds=15, shuffle=True, random_state=1)

pipeline = Pipeline([
    ("vectorizer", CountVectorizer()), ("classifier", LogisticRegression())
])
pipeline_params = [
    {
        "vectorizer__stop_words": ['english', None],
        "vectorizer__ngram_range": [(1, 2), (1, 3), (2, 3), (3, 5), (4, 5), (2, 5)],
        "vectorizer__analyzer": ['word', 'char_wb'],
        "classifier__penalty": ['l1', 'l2'],
        "classifier__C": [0.5, 1, 5, 10]
    },
]

grid = GridSearchCV(pipeline, pipeline_params, cv=cv, refit=True, verbose=1, n_jobs=4)
grid.fit(X, y)
best = grid.best_estimator_
print("Accuracy (CountVectorizer + LogisticRegression): {} with params {}"
      .format(grid.best_score_, grid.best_params_))

Fitting 15 folds for each of 192 candidates, totalling 2880 fits


[Parallel(n_jobs=4)]: Done  76 tasks      | elapsed:    3.5s
[Parallel(n_jobs=4)]: Done 286 tasks      | elapsed:   28.0s
[Parallel(n_jobs=4)]: Done 536 tasks      | elapsed:   47.7s
[Parallel(n_jobs=4)]: Done 886 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done 1336 tasks      | elapsed:  2.2min
[Parallel(n_jobs=4)]: Done 1886 tasks      | elapsed:  3.7min
[Parallel(n_jobs=4)]: Done 2536 tasks      | elapsed:  5.2min
[Parallel(n_jobs=4)]: Done 2880 out of 2880 | elapsed:  5.9min finished


Accuracy (CountVectorizer + LogisticRegression): 0.7785 with params {'vectorizer__stop_words': None, 'vectorizer__ngram_range': (1, 2), 'classifier__C': 0.5, 'vectorizer__analyzer': 'word', 'classifier__penalty': 'l2'}


In [18]:
cv = StratifiedKFold(y, n_folds=15, shuffle=True, random_state=1)

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('sgd', SGDClassifier())
])
pipeline_params = {
        'tfidf__ngram_range': [(1, 3), (1, 4)],
        'tfidf__use_idf': (True, False),
        'tfidf__max_df': [0.25, 0.5, 0.75, 1.0],
        'tfidf__max_features': [10, 50, 100, 250, 500, 1000, None],
        'tfidf__stop_words': ('english', None),
        'tfidf__smooth_idf': (True, False),
        'tfidf__norm': ('l1', 'l2', None),
        "sgd__penalty": ['l1', 'l2', 'elasticnet'],
        "sgd__loss": ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
        'sgd__alpha': (0.00001, 0.000001)
    }

#grid = GridSearchCV(pipeline, pipeline_params, cv=cv, refit=True, verbose=1, n_jobs=4)
grid = RandomizedSearchCV(pipeline, pipeline_params, cv=cv, refit=True, verbose=1, n_jobs=4, n_iter=600)
grid.fit(X, y)
best = grid.best_estimator_
print("Accuracy (TfidfVectorizer + SGDClassifier): {} with params {}"
      .format(grid.best_score_, grid.best_params_))

Fitting 15 folds for each of 600 candidates, totalling 9000 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   11.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   30.9s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:   52.2s
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:  1.9min
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed:  2.6min
[Parallel(n_jobs=4)]: Done 3192 tasks      | elapsed:  3.5min
[Parallel(n_jobs=4)]: Done 4042 tasks      | elapsed:  4.4min
[Parallel(n_jobs=4)]: Done 4992 tasks      | elapsed:  5.5min
[Parallel(n_jobs=4)]: Done 6042 tasks      | elapsed:  6.6min
[Parallel(n_jobs=4)]: Done 7192 tasks      | elapsed:  7.9min
[Parallel(n_jobs=4)]: Done 8442 tasks      | elapsed:  9.3min
[Parallel(n_jobs=4)]: Done 9000 out of 9000 | elapsed:  9.9min finished


Accuracy (TfidfVectorizer + SGDClassifier): 0.7845 with params {'tfidf__smooth_idf': True, 'sgd__alpha': 1e-06, 'tfidf__ngram_range': (1, 4), 'sgd__penalty': 'elasticnet', 'tfidf__max_features': None, 'tfidf__stop_words': None, 'tfidf__max_df': 1.0, 'tfidf__use_idf': True, 'tfidf__norm': 'l1', 'sgd__loss': 'hinge'}


In [23]:
cv = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=1)

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier(random_state=1))
])
pipeline_params = {
        'tfidf__ngram_range': [(1, 3)],
        'tfidf__use_idf': (True, False),
        'tfidf__max_df': [0.25, 0.5, 0.75, 1.0],
        'tfidf__max_features': [10, 50, 100, 250, 500, 1000, None],
        'tfidf__smooth_idf': (True, False),
        'tfidf__norm': ('l1', 'l2', None),
        "rf__n_estimators": [1000, 1500],
        "rf__min_samples_split": [6, 8, 10],
        "rf__min_samples_leaf": [1, 2, 4]
    }

#grid = GridSearchCV(pipeline, pipeline_params, cv=cv, refit=True, verbose=1, n_jobs=4)
grid = RandomizedSearchCV(pipeline, pipeline_params, cv=cv, refit=True, verbose=1, n_jobs=4, n_iter=30)
grid.fit(X, y)
best = grid.best_estimator_
print("Accuracy (TfidfVectorizer + SGDClassifier): {} with params {}"
      .format(grid.best_score_, grid.best_params_))

Fitting 10 folds for each of 30 candidates, totalling 300 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  6.9min
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed: 10.6min finished


Accuracy (TfidfVectorizer + SGDClassifier): 0.7675 with params {'tfidf__smooth_idf': True, 'rf__n_estimators': 1000, 'tfidf__ngram_range': (1, 3), 'tfidf__max_features': 1000, 'tfidf__max_df': 0.25, 'rf__min_samples_split': 6, 'tfidf__use_idf': False, 'rf__min_samples_leaf': 1, 'tfidf__norm': None}
