In [1]:
import pandas as pd
import numpy as np

# Both files are tab-separated. quoting=3 is csv.QUOTE_NONE, so the double
# quotes wrapped around each id/review stay in the values as literal characters.
data_train = pd.read_csv('labeledTrainData.tsv', delimiter='\t', quoting=3)
data_test = pd.read_csv('testData.tsv', delimiter='\t', quoting=3)

# Show (rows, columns) of each frame, then pull out the training target column.
print(data_train.shape)
print(data_test.shape)
y_train = data_train.loc[:, "sentiment"]

(25000, 3)
(25000, 2)


In [2]:
# Display the first three training rows.
data_train.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [9]:
import os 
import pickle

from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
# NLTK's English stop-word list, held in a set and passed to review_to_words as its stop_words argument.
stops = set(stopwords.words("english"))

def review_to_words(review_raw, stop_words):
    """Turn one raw review into a cleaned, space-separated string of words.

    Steps: strip HTML markup, replace every character that is not an ASCII
    letter with a space, lower-case, split on whitespace, drop stop words.

    Args:
        review_raw: Raw review text, possibly containing HTML markup.
        stop_words: Container of lower-case words to discard; a set keeps
            the membership test cheap.

    Returns:
        The remaining words joined by single spaces ("" if nothing is left).
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns about it), so the extracted
    # text could differ from one machine to the next.
    review_text = BeautifulSoup(review_raw, "html.parser").get_text()

    # Digits, punctuation and non-ASCII characters all become separators.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    words = letters_only.lower().split()
    return " ".join(w for w in words if w not in stop_words)

def _load_or_build_features(cache_path, reviews, label):
    """Return the cleaned reviews, reading them from cache_path when it exists.

    Args:
        cache_path: Pickle file holding a previously cleaned list of reviews.
        reviews: Sized iterable of raw review strings (e.g. a DataFrame column).
        label: Word used in the progress messages ("train" or "test").

    Returns:
        A list with one cleaned string per review, in input order.
    """
    if os.path.exists(cache_path):
        # NOTE: unpickling can execute arbitrary code; only point this at
        # cache files produced by this notebook.
        with open(cache_path, 'rb') as cache_file:
            return pickle.load(cache_file)

    print("Cleaning and parsing movie reviews...\n")
    total = len(reviews)
    features = []
    for count, review_raw in enumerate(reviews, 1):
        if count % 5000 == 0:
            print("Review %s %d of %d\n" % (label, count, total))
        features.append(review_to_words(review_raw, stops))

    # Persist the result: nothing else in this notebook writes the cache
    # file, so without this the load branch above would never be taken.
    with open(cache_path, 'wb') as cache_file:
        pickle.dump(features, cache_file)
    return features


feature_train = _load_or_build_features('feature_train.pkl', data_train.review, 'train')
# The cached test features must land in feature_test; storing them in
# feature_train would replace the training reviews with test reviews.
feature_test = _load_or_build_features('feature_test.pkl', data_test.review, 'test')

In [10]:
# Sanity check: number of cleaned training reviews (data_train has 25000 rows).
len(feature_train)

25000

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

In [12]:
# TF-IDF over single words, vocabulary capped at 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000, analyzer='word')

# The vocabulary and IDF weights are learned from the train and test reviews
# together; only the training reviews are vectorised here.
feature_all = feature_train + feature_test
vectorizer_fit = vectorizer.fit(feature_all)
X_train = vectorizer_fit.transform(feature_train)

In [9]:
#lr = LogisticRegression()
#lr_parameters = {'C' : [0.7, 1, 2]}

# Grid: depth-1 and depth-2 trees as the weak learner, two learning rates.
adab_parameters = {
    'base_estimator': [
        DecisionTreeClassifier(max_depth=1),
        DecisionTreeClassifier(max_depth=2),
    ],
    'learning_rate': [0.99, 1.],
}
adab = AdaBoostClassifier(n_estimators=1000)

# 5-fold cross-validated search over the grid, scored by ROC AUC.
grid_search = GridSearchCV(adab, adab_parameters, scoring='roc_auc', cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# "%s %s" reproduces the single space the print statement put between its items.
print("%s %s" % ("best score: ", grid_search.best_score_))
print("%s %s" % ("best params", grid_search.best_params_))

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 27.5min finished


best score:  0.927653312
best params {'base_estimator': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'), 'learning_rate': 0.99}


In [23]:
# X_train is reassigned by the max_features search further down; rebuilding it
# here keeps this cell's input fixed at the 5000-term vocabulary.
vectorizer = TfidfVectorizer(analyzer='word', max_features=5000)
vectorizer_fit = vectorizer.fit(feature_all)
X_train = vectorizer_fit.transform(feature_train)

print('start to train the model...')
from sklearn.model_selection import train_test_split

# Fixed seed so the hold-out split and the boosted model -- and therefore the
# hold-out AUC -- come out the same on every run.
SEED = 42

# Default 75/25 split; stratify keeps the class ratio equal in both parts.
X_tr, X_te, y_tr, y_te = train_test_split(X_train, y_train, random_state=SEED, stratify=y_train)

# Hyper-parameters are the best ones reported by the grid search
# (depth-1 trees, learning_rate 0.99).
model = AdaBoostClassifier(
    n_estimators=1000,
    base_estimator=DecisionTreeClassifier(max_depth=1),
    learning_rate=0.99,
    random_state=SEED,
)
model.fit(X_tr, y_tr)

start to train the model...


ValueError: bad input shape (6250L, 2L)

In [24]:
from sklearn.metrics import roc_auc_score

# ROC AUC on the hold-out part, scored with column 1 of predict_proba.
holdout_proba = model.predict_proba(X_te)
print(roc_auc_score(y_te, holdout_proba[:, 1]))

0.509065050878


In [17]:
# Try each vocabulary size, score `model` with 5-fold cross-validated ROC AUC,
# and keep the fitted vectorizer whose score is the highest seen so far.
best_score = 0
best_vectorizer_fit = None
for max_features in [10000,20000]:
    vectorizer = TfidfVectorizer(max_features=max_features, analyzer='word')
    vectorizer_fit = vectorizer.fit(feature_all)
    X_train = vectorizer_fit.transform(feature_train)
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc').mean()
    # Skip anything that is not a strict improvement.
    if not score > best_score:
        continue
    best_score, best_vectorizer_fit = score, vectorizer_fit
    print("untill now the best 'max_features' is %s" % max_features )
    print("best score: %.4f" % best_score)

untill now the best 'max_features' is 10000
best score: 0.4917
untill now the best 'max_features' is 20000
best score: 0.4984


In [35]:
# Vectorise both corpora with the vectorizer selected by the max_features search.
X_train = best_vectorizer_fit.transform(feature_train)
X_test = best_vectorizer_fit.transform(feature_test)

# Refit on every training review; clf is whatever model.fit returns.
clf = model.fit(X_train, y_train)

In [40]:
print("Retrain on all training data, predicting test labels...\n")

# Column 1 of predict_proba is used as the predicted sentiment score.
result = clf.predict_proba(X_test)[:, 1]
output = pd.DataFrame(data={"id": data_test["id"], "sentiment": result})

# Write the comma-separated submission file without the index column.
# NOTE(review): the file is called logistic.csv although clf is the AdaBoost model.
output.to_csv('logistic.csv', quoting=3, index=False)
print("Wrote results to logistic.csv")

Retrain on all training data, predicting test labels...

Wrote results to logistic.csv
