In [1]:
import numpy as np
import pandas as pd

In [2]:
train = pd.read_csv('datasets/labeledTrainData.tsv',sep='\t')
train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [3]:
test = pd.read_csv('datasets/testData.tsv',sep='\t')
test.head()

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [4]:
from bs4 import BeautifulSoup
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))
wnl = WordNetLemmatizer()

def bagOfWords(review):
    #remove html content
    cleaned_review = BeautifulSoup(review,'html.parser').get_text()
    #clean non-alphabetical text
    cleaned_review = re.sub("[^a-zA-Z]"," ",cleaned_review)
    #form bag of words
    bag_of_words = nltk.word_tokenize(cleaned_review)
    word_vector = []
    for word,tag in nltk.pos_tag(bag_of_words):
        #remove stop words
        if word in stop_words:
            continue
        #lemmatize word
        if tag.startswith('V'):
            word_vector.append(wnl.lemmatize(word, pos='v').lower())
        elif tag.startswith('J') or tag.startswith('R'):
            word_vector.append(wnl.lemmatize(word, pos='a').lower())
        elif tag.startswith('NN'):
            word_vector.append(wnl.lemmatize(word, pos='n').lower())
    return ' '.join(word_vector)
vectorizer = np.vectorize(bagOfWords)

In [5]:
import pickle
#train['review'] = vectorizer(train['review'].values)
#pickle.dump(train['review'].values,open("train_review_final.pickle","wb"))
train['review'] = pickle.load(open("train_review_final.pickle","rb"))
train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,stuff go moment mj start listen music watch od...
1,2381_9,1,classic war worlds timothy hines entertaining ...
2,7759_3,0,film start manager nicholas bell give welcome ...
3,3630_4,0,assume praise film great filmed opera ever rea...
4,9495_8,1,superbly trashy wondrously unpretentious explo...


In [6]:
#test['review'] = vectorizer(test['review'].values)
#pickle.dump(test['review'].values,open("test_review_final.pickle","wb"))
test['review'] = pickle.load(open("test_review_final.pickle","rb"))
test.head()

Unnamed: 0,id,review
0,12311_10,naturally film main theme mortality nostalgia ...
1,8348_2,movie disaster disaster film full great action...
2,5828_4,movie kid saw tonight child love point kid exc...
3,7186_2,afraid dark leave impression several different...
4,12128_7,accurate depiction small time mob life film ne...


In [7]:
X_test_id = test['id']
Y = train['sentiment']
train.drop(["sentiment","id"],axis=1,inplace=True)
test.drop("id",axis=1,inplace=True)

In [18]:
df = train.append(test,ignore_index=False)
print(train.shape,test.shape,df.shape)

(25000, 1) (25000, 1) (50000, 1)


In [24]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
count_vect.fit(train.review.values)
df_cv = count_vect.transform(df.review.values)
df_cv.shape

(50000, 65988)

In [21]:
#from sklearn.feature_extraction.text import TfidfTransformer
#tfidf_transform = TfidfTransformer()
#tfidf_transform.fit(df_cv)
#df_tfidf = tfidf_transform.transform(df_cv)
#df_tfidf.shape

(50000, 65988)

In [25]:
X = df_cv[:len(train)]
X_test = df_cv[len(train):]
from sklearn.model_selection import train_test_split
X_train,X_validation,Y_train,Y_validation = train_test_split(X,Y,test_size=0.1)
print(X_train.shape,X_validation.shape)

(22500, 65988) (2500, 65988)


### Lets try following model
1. Multinomial Naive Bayes
2. SVM
3. XgBoost
4. Keras Neural Net

In [26]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(X_train,Y_train)
mnb.score(X_validation,Y_validation)

0.86560000000000004

In [None]:
from sklearn.svm import SVR
svr = SVR()
svr.fit(X_train,Y_train)
svr.score(X_validation,Y_validation)