In [1]:
import numpy as np
import pandas as pd

pos_train_data = pd.read_csv('train_pos.csv',sep = ',')
neg_train_data = pd.read_csv('train_neg.csv',sep = ',')
pos_test_data = pd.read_csv('test_pos.csv',sep = ',')
neg_test_data = pd.read_csv('test_neg.csv',sep = ',')

In [2]:
pos_train_data=pos_train_data[['Text','Label']]
neg_train_data=neg_train_data[['Text','Label']]
pos_test_data=pos_test_data[['Text','Label']]
neg_test_data=neg_test_data[['Text','Label']]

In [3]:
data_train=pd.concat([pos_train_data,neg_train_data],ignore_index = True)
data_train = data_train.sample(frac=1).reset_index(drop=True)
data_train.head()

Unnamed: 0,Text,Label
0,I rented this movie when it came out on video ...,1
1,This movie has a very simple yet clever premis...,1
2,"Sure, for it's super imagery and awesome sound...",1
3,"Erich Rohmer's ""L'Anglaise et le duc"" makes a ...",1
4,I really enjoyed this movie. I am a single dad...,1


In [4]:
data_test = pd.concat([pos_test_data,neg_test_data],ignore_index = True)
data_test = data_test.sample(frac=1).reset_index(drop=True)
data_test.head()

Unnamed: 0,Text,Label
0,"Uninspired, pretty much all around. The only e...",0
1,This film is so 1980's and that is what I like...,1
2,Previous Tarantino movies were from a guy in l...,0
3,I suspect there's some revisionist history goi...,1
4,"Alfred Hitchcock's remake of ""The Man Who Who ...",1


In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

stop_words=set(stopwords.words('english'))
table = str.maketrans('','',punctuation)

def textclean(text):
    tokens=word_tokenize(text)
    tokens=[word for word in tokens if word.isalpha()]
    tokens=[w.translate(table) for w in tokens]
    tokens=[word for word in tokens if not word in stop_words]
    token = [word for word in tokens if len(word)>1 ]
    return tokens

In [6]:
reviews=[]
for index,row in data_train.iterrows():
    text=row['Text'].lower()
    reviews.append(textclean(text))

In [7]:
reviews[0]

['rented',
 'movie',
 'came',
 'video',
 'tape',
 'really',
 'enjoyed',
 'opportunity',
 'purchase',
 'dvd',
 'weeks',
 'ago',
 'watched',
 'several',
 'times',
 'since',
 'would',
 'agree',
 'others',
 'said',
 'indian',
 'summer',
 'nostalgic',
 'film',
 'watch',
 'wish',
 'could',
 'think',
 'like',
 'movie',
 'extent',
 'times',
 'wish',
 'could',
 'relive',
 'lives',
 'children',
 'adults',
 'would',
 'nice',
 'young',
 'worry',
 'job',
 'parent',
 'etc',
 'know',
 'would',
 'like',
 'jump',
 'de',
 'lorean',
 'go',
 'back',
 'time',
 'enjoyed',
 'film',
 'much',
 'time',
 'favorite',
 'camp',
 'film',
 'though',
 'meatballs',
 'bill',
 'murray',
 'wish',
 'could',
 'make',
 'indian',
 'summer',
 'version']

In [8]:
import gensim
from gensim.models import Word2Vec
n_dim=200

w2v_model =Word2Vec(reviews,min_count=5,size=n_dim)



In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer


vectorizer  = TfidfVectorizer(min_df=10,analyzer=lambda x: x)

matrix=vectorizer.fit_transform(reviews)



In [10]:
tfidf = dict(zip(vectorizer.get_feature_names(),vectorizer.idf_))


In [11]:
def create_word_vector(l,size):
    vector = np.zeros(size).reshape((1,size))
    count = 0.
    for word in l:
        try:
            vector += w2v_model[word].reshape((1, size)) * tfidf[word]
            count+=1
        except KeyError:
            continue
            
    if count!=0:
        vector /= count
    return vector

In [12]:
X_train = []
y_train = []

for i in range(len(data_train)):
    converted_review = create_word_vector(reviews[i],n_dim)
    #print(converted_review)
    X_train.append(np.array(converted_review))
    y_train.append(data_train['Label'][i])

  


In [13]:
from sklearn.preprocessing import scale

X_train = np.concatenate(X_train)
X_train = scale(X_train)
y_train = np.array(y_train)


In [14]:
X_train.shape

(25000, 200)

In [15]:
data_test = pd.concat([pos_test_data,neg_test_data],ignore_index = True)
data_test = data_test.sample(frac=1).reset_index(drop=True)

validation_reviews = []

for index,row in data_test.iterrows():
    text = (row['Text'].lower())
    validation_reviews.append(textclean(text))
    
X_val = []
y_val = []

for i in range(len(data_test)):
    converted_review = create_word_vector(validation_reviews[i],n_dim)
    X_val.append(converted_review)
    y_val.append(data_test['Label'][i])
        
X_val = np.concatenate(X_val)
X_val = scale(X_val)
y_val = np.array(y_val)

  


In [16]:
X_val.shape

(25000, 200)

In [17]:
from sklearn.svm import SVC

clf = SVC()
clf = clf.fit(X_train,y_train)

In [18]:
print(clf.predict(X_val[4].reshape(1,-1)))

[0]


In [19]:
data_test['Text'][4]

"i am an avid ff7 fan, for instance i have the game then sell it(bad mistake) but then buy it again (good mistake...erm)<br /><br />anyways, yes this film is very good, the fights are very cool, music very good, and the cgi you cant falter.<br /><br />only thing disappointing with the film i felt was the lack of other character involvement, it was almost all cloud which although is a great character, u cant beat a of cid and barret.<br /><br />but despite that the film was great in my opinion, and a must watch.<br /><br />overall a great film give and will give it 9/10 <br /><br />squaresoft, make more films like this and you'll be worshiped more so than you already are!!!!"

In [20]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_val,clf.predict(X_val)))

0.85124
