# Kaggle: Bag of Words Meets Bags of Popcorn - I
The following code covers two models: logistic regression and an ensemble of logistic regression, Naive Bayes (NB), and SVM.  
Part 1 (logistic regression model): leaderboard score is 0.95304 (probability), 0.88360 (binary).  
Part 2 (ensemble model): leaderboard score is 0.95235 (probability), 0.88680 (binary).

In [1]:
% matplotlib inline
import numpy as np
import pandas as pd
import nltk
import re

In [2]:
from bs4 import BeautifulSoup
#from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import BernoulliNB
#from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.utils import check_random_state
from sklearn.model_selection import train_test_split
from nltk.stem.snowball import EnglishStemmer

In [3]:
# Both files are tab-separated with a header row. quoting=3 is csv.QUOTE_NONE,
# so quote characters inside the review text are kept as ordinary characters.
# NOTE(review): the paths below are absolute and machine-specific.
tsv_read_options = {"header": 0, "delimiter": "\t", "quoting": 3}
train = pd.read_csv('E:/542/Pro4/labeledTrainData.tsv', **tsv_read_options)
test = pd.read_csv("E:/542/Pro4/testData.tsv", **tsv_read_options)

In [4]:
def preprocess(corpus):
    """Strip HTML markup and punctuation from every review in ``corpus``.

    Parameters
    ----------
    corpus : pandas.DataFrame
        Frame with a ``'review'`` column of raw review strings. Rows are read
        in positional order, so the frame's index labels do not matter.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (``'text'``) with a default 0..n-1 index holding
        one cleaned review per input row, in the same order.
    """
    # Every character that is neither a word character (letter, digit,
    # underscore) nor whitespace is replaced by a single space.
    # Compiled once here instead of once per review.
    non_word = re.compile(r'[^\w\s]')
    clean = []
    # Iterate over the values instead of indexing by label: corpus["review"][i]
    # raises KeyError (or reads rows out of order) when the index is not 0..n-1.
    for raw_review in corpus["review"]:
        review_text = BeautifulSoup(raw_review, 'html.parser').get_text()
        clean.append(non_word.sub(" ", review_text))
    return pd.DataFrame({'text': clean})

In [5]:
def tokenize(text):
    """Split ``text`` into word tokens and reduce each one to its English stem.

    Parameters
    ----------
    text : str
        Document to tokenize.

    Returns
    -------
    list of str
        The stemmed tokens, in document order.
    """
    tokens = nltk.word_tokenize(text)
    stemmer = EnglishStemmer()
    # Return a real list rather than a lazy ``map`` object: a one-shot iterator
    # has no len() and is exhausted after a single pass, which breaks consumers
    # that measure or re-read the token sequence.
    return [stemmer.stem(token) for token in tokens]

In [6]:
# Clean the labelled training reviews and the test reviews with the same routine.
clean_train_df, clean_test_df = map(preprocess, (train, test))

### Tuning and Validation
Because the test dataset is not labelled, 25% of the training data is split off as a separate validation dataset.  
Among the single models (Naive Bayes, logistic regression, and SGDClassifier (SVM)), SGDClassifier gives better accuracy on the validation dataset for binary classification (either 0 or 1), while logistic regression gives better accuracy for probabilistic prediction (between 0 and 1).
The logistic regression model is used here.  
Empirically, tuning the `max_df` parameter of the TF-IDF vectorizer can change accuracy on the validation dataset by approximately 2-3%.

In [7]:
# Hold out 25% of the labelled reviews for validation. X_test / y_test are
# that held-out validation split, not the competition test set.
X_train, X_test, y_train, y_test = train_test_split(
    clean_train_df['text'],
    train['sentiment'],
    test_size=0.25,
    random_state=check_random_state(2017),
)

In [8]:
def cv_lr(X_train, y_train, X_test, random_state):
    """Fit a TF-IDF + logistic regression model and predict on ``X_test``.

    The vectorizer is fitted on ``X_train`` only and then applied to
    ``X_test``, so no vocabulary or IDF statistics leak from the evaluation
    texts into the model.

    Parameters
    ----------
    X_train : sequence of str
        Training documents.
    y_train : array-like
        Labels aligned with ``X_train``.
    X_test : sequence of str
        Documents to predict.
    random_state : int, RandomState instance or None
        Seed forwarded to ``LogisticRegression``.

    Returns
    -------
    y_pred
        Output of ``model.predict`` for ``X_test`` (hard class labels).
    y_score
        Output of ``model.predict_proba`` for ``X_test`` (one row per
        document, one column per class).
    """
    tfidf = TfidfVectorizer(
        stop_words='english',
        min_df=5,
        max_df=0.1,
        ngram_range=(1, 2),
        tokenizer=tokenize,
    )
    train_X = tfidf.fit_transform(X_train)
    test_X = tfidf.transform(X_test)

    model = LogisticRegression(
        penalty='l2',
        dual=True,
        # The dual formulation is only implemented by the liblinear solver;
        # name it explicitly so the call does not depend on the library's
        # default solver.
        solver='liblinear',
        # Honour the caller's seed (it was previously ignored in favour of a
        # hard-coded 2017).
        random_state=random_state,
    )
    model.fit(train_X, y_train)

    y_pred = model.predict(test_X)
    y_score = model.predict_proba(test_X)
    return y_pred, y_score

In [9]:
# Fit on the training split and predict the held-out validation split
# (named X_test / y_test in this notebook).
y_pred1, y_score1 = cv_lr(X_train, y_train, X_test, random_state=check_random_state(2017))
# accuracy_score's signature is (y_true, y_pred): pass the true labels first.
score1 = accuracy_score(y_test, y_pred1)
print("LR prediction accuracy = {0:7.5f}%".format(100.0 * score1))

LR prediction accuracy = 88.65600%


In [10]:
# Refit on the full labelled training set and predict the unlabelled test reviews.
y_pred2, y_score2 = cv_lr(
    clean_train_df['text'],
    train['sentiment'],
    clean_test_df['text'],
    random_state=check_random_state(520),
)

In [11]:
# Submission file with the hard class predictions from model.predict.
output = pd.DataFrame(dict(id=test['id'], sentiment=y_pred2))
output.to_csv("PredLRBin.csv", index=False, quoting=3)

In [12]:
# Submission file with probabilities: take the second column (index 1) of the
# predict_proba output for every row. Iterating over y_score2 itself avoids
# hard-coding the test-set size (previously a fixed 25000).
y_proba = [row[1] for row in y_score2]
output = pd.DataFrame({'id': test['id'], 'sentiment': y_proba})
output.to_csv("PredLRProb.csv", index=False, quoting=3)