In [93]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

from nltk.corpus import stopwords
import re

In [94]:
# Both files are tab-separated with a header row. quoting=3 is the numeric
# value of csv.QUOTE_NONE, so quote characters inside reviews are kept as
# ordinary text instead of being treated as field delimiters.
train = pd.read_csv(
    "labeledTrainData.tsv", header=0, delimiter="\t", quoting=3
)

test = pd.read_csv(
    "testData.tsv", header=0, delimiter="\t", quoting=3
)

In [95]:
def review_word_list(review):
    """Convert one raw review (or sentence) into a list of cleaned words.

    Steps, in order:
      1. strip HTML markup with BeautifulSoup (lxml parser),
      2. replace every character that is not an ASCII letter with a space,
      3. lower-case and split on whitespace,
      4. drop NLTK English stop words.

    Parameters
    ----------
    review : str
        Raw text, possibly containing HTML.

    Returns
    -------
    list of str
        Lower-cased, stop-word-free tokens in their original order.
    """
    review_text = BeautifulSoup(review, 'lxml').get_text()
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    words = review_text.lower().split()

    # Build the stop-word set once per call. The previous version rebuilt it
    # inside the comprehension, i.e. once for every token of the review.
    stops = set(stopwords.words('english'))
    return [w for w in words if w not in stops]


In [96]:
import nltk.data
# Load the NLTK resource 'tokenizers/punkt/english.pickle'; the resulting
# object is used below via its .tokenize() method to split reviews into
# sentences. Presumably this is the Punkt English sentence tokenizer and
# requires the NLTK "punkt" data to be installed -- verify on a fresh machine.
# NOTE(review): the resource is a pickle file; only load it from a trusted
# nltk_data directory.
tokenizer= nltk.data.load('tokenizers/punkt/english.pickle')

In [97]:
def review_sentence(review, tokenizer):
    """Split a review into sentences and clean each one into a word list.

    Parameters
    ----------
    review : str or bytes
        Raw review text. Byte strings are decoded as UTF-8; text strings
        are used as-is.
    tokenizer : object
        Anything exposing ``tokenize(text)`` that returns an iterable of
        sentence strings.

    Returns
    -------
    list of list of str
        One cleaned word list (see ``review_word_list``) per non-empty
        sentence, in sentence order.
    """
    text = review.strip()
    # Decode only real byte strings. Calling .decode() unconditionally fails
    # on Python 3 text (str has no .decode) and on non-ASCII Python 2
    # unicode objects (implicit ASCII encode before decoding).
    if isinstance(text, bytes):
        text = text.decode('utf-8')

    return [
        review_word_list(raw_sentence)
        for raw_sentence in tokenizer.tokenize(text)
        if raw_sentence  # skip empty sentences
    ]

In [None]:
# Build the Word2Vec training corpus: every training review is split into
# sentences, and each sentence becomes one list of cleaned words.
sentences = []
n_reviews = len(train['review'])
for i, review in enumerate(train['review']):
    if i % 100 == 0:
        # Single pre-formatted string so the call prints the same text on
        # both Python 2 and Python 3 (the old print statement was a
        # SyntaxError on Python 3).
        print("%d out of  %d" % (i, n_reviews))

    sentences.extend(review_sentence(review, tokenizer))

In [99]:
from gensim.models import word2vec

# Word2Vec hyper-parameters.
num_features = 300    # Word vector dimensionality
min_word_count = 40   # Minimum word count
num_workers = 4       # Number of parallel threads
context = 10          # Context window size
downsampling = 1e-3   # (0.001) Downsample setting for frequent words

print("Training model....")
# Keyword names follow the gensim API this notebook was written against
# (`size=` for the vector dimensionality).
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

Training model....


In [117]:
# Persist the trained model to disk. The file name spells out the
# hyper-parameters used for training: 300 features, min word count 40,
# context window 10.
model_name = "300features_40minwords_10context"
model.save(model_name)

In [100]:
# Sanity check: ask the word vectors which of these words fits least with the others.
model.wv.doesnt_match("man woman dog child planet".split())

'planet'

In [101]:
# Same check with three countries and one city.
model.wv.doesnt_match("france england germany berlin".split())

'berlin'

In [102]:
# List the words the model ranks as most similar to "woman", with their scores.
model.wv.most_similar("woman")

[(u'girl', 0.701718807220459),
 (u'lady', 0.6968203186988831),
 (u'naive', 0.6901925802230835),
 (u'man', 0.6895425319671631),
 (u'husband', 0.6844757795333862),
 (u'lover', 0.6837038397789001),
 (u'innocent', 0.6774181723594666),
 (u'sexually', 0.6763538122177124),
 (u'daughter', 0.6626691818237305),
 (u'widow', 0.6623377799987793)]

In [103]:
# List the words the model ranks as most similar to "awful", with their scores.
model.wv.most_similar("awful")

[(u'terrible', 0.883781909942627),
 (u'horrible', 0.871444582939148),
 (u'dreadful', 0.8066338300704956),
 (u'sucks', 0.7981616258621216),
 (u'atrocious', 0.776829183101654),
 (u'horrendous', 0.7615725994110107),
 (u'abysmal', 0.7599791288375854),
 (u'crappy', 0.7576720714569092),
 (u'horrid', 0.7543565034866333),
 (u'pathetic', 0.7488211393356323)]

In [None]:
# Show the raw embedding for one word. Look it up through `model.wv`, as the
# other cells do; indexing the model object directly is deprecated in gensim.
model.wv['awful']

In [104]:
def features(words,num_features,model):
    """Average the word vectors of the in-vocabulary words in ``words``.

    Parameters
    ----------
    words : iterable of str
        Tokens of one review.
    num_features : int
        Length of the word vectors (and of the returned vector).
    model : object
        Trained model exposing ``model.wv.index2word`` (the vocabulary)
        and ``model.wv[word]`` (the vector for a word).

    Returns
    -------
    numpy.ndarray
        1-D array of length ``num_features`` holding the mean vector of the
        words found in the vocabulary. If no word is in the vocabulary
        (including empty ``words``) the zero vector is returned.
    """
    feature = np.zeros(num_features, dtype='float32')
    nwords = 0

    # A set gives constant-time membership tests; index2word itself is a list.
    index2word_set = set(model.wv.index2word)

    for word in words:
        if word in index2word_set:
            nwords += 1
            # Look vectors up through model.wv, like the rest of the notebook.
            feature = np.add(feature, model.wv[word])

    # Guard against 0/0: without this, a review with no known words yields a
    # vector of NaNs, which then poisons the feature matrix.
    if nwords == 0:
        return feature

    return np.divide(feature, nwords)

In [105]:
def avg_features(reviews,num_features,model):
    """Stack the averaged word vector of every review into one matrix.

    Parameters
    ----------
    reviews : sequence of list of str
        Tokenised reviews; must support ``len()``.
    num_features : int
        Length of each word vector.
    model : object
        Trained model, passed through unchanged to ``features``.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(len(reviews), num_features)``; row ``k`` is
        ``features(reviews[k], num_features, model)``.
    """
    matrix = np.zeros((len(reviews), num_features), dtype='float32')
    for row, tokens in enumerate(reviews):
        matrix[row] = features(tokens, num_features, model)
    return matrix

In [None]:
# Clean every training review into a word list, then turn each list into one
# averaged word vector.
clean_train_reviews = []
n_train = len(train['review'])
for i, review in enumerate(train['review']):
    if i % 100 == 0:
        # Single pre-formatted string: prints the same text on Python 2 and
        # Python 3 (the old print statement was a SyntaxError on Python 3).
        print("%d out of  %d" % (i, n_train))

    clean_train_reviews.append(review_word_list(review))

train_data_vector = avg_features(clean_train_reviews, num_features, model)

In [None]:
# Clean every test review into a word list, then turn each list into one
# averaged word vector.
clean_test_reviews = []
n_test = len(test['review'])
for i, review in enumerate(test['review']):
    if i % 100 == 0:
        # Single pre-formatted string: prints the same text on Python 2 and
        # Python 3 (the old print statement was a SyntaxError on Python 3).
        print("%d out of  %d" % (i, n_test))

    clean_test_reviews.append(review_word_list(review))

test_data_vector = avg_features(clean_test_reviews, num_features, model)

In [116]:
# LogisticRegression lives in sklearn.linear_model; importing it from
# sklearn.svm raises ImportError.
from sklearn.linear_model import LogisticRegression

lg = LogisticRegression()

# The cell used to call an undefined `forest`, so it only ran when a stale
# variable of that name was left in the kernel. Use the estimator created
# above so the cell works after a kernel restart.
print("Fitting logistic regression to training data....")
lg.fit(train_data_vector, train["sentiment"])

# NOTE(review): predictions and the score below are computed on the same data
# the model was fitted on, so they measure training accuracy only.
pred = lg.predict(train_data_vector)
lg.score(train_data_vector, train["sentiment"])

Fitting random forest to training data....


0.85243999999999998

In [114]:
from sklearn.metrics import confusion_matrix
# Compare the true training labels with `pred`, the predictions made on the
# same training data.
print(confusion_matrix(train["sentiment"],pred))

[[10537  1963]
 [ 1726 10774]]


In [115]:
from sklearn.metrics import classification_report
# Per-class precision / recall / f1 of `pred` against the true training labels.
print(classification_report(train["sentiment"],pred))

             precision    recall  f1-score   support

          0       0.86      0.84      0.85     12500
          1       0.85      0.86      0.85     12500

avg / total       0.85      0.85      0.85     25000

