In [36]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [37]:
import warnings
warnings.filterwarnings("ignore")

In [38]:
import re

In [39]:
df = pd.read_csv("IMDB Dataset.csv")

### Part 1: Removing numbers/digits from the review

In [40]:
def review_digit(text):
    return re.sub('\d+', '', text)
for index, row in df.iterrows():
    df.at[index,"review"] = review_digit(row["review"]);

### Part 2: Converting all the text to lowercase (it seems this is a requirement for ML algos)

In [41]:
for index, row in df.iterrows():
    df.at[index,"review"] = row["review"].lower();

### Part 3: Removing the HTML tags from the review

In [42]:
def remove_html_tags(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, ' ', text);
for index, row in df.iterrows():
    df.at[index,"review"] = remove_html_tags(row["review"]);


In [43]:
### Part 4: Removing any contractions

In [44]:
import contractions
for index, row in df.iterrows():
    df.at[index,"review"] = contractions.fix(row["review"]);


### Part 4: Removing punctuation from the review

In [45]:
def remove_pun(text):
    return re.sub(r'[^\w\s]', ' ', text)
for index, row in df.iterrows():
    df.at[index,"review"] = remove_pun(row["review"]);

In [46]:
### Part 5: Removing any extra spaces created by above

In [47]:
def remove_spaces(text):
    return re.sub(r'\s+', ' ', text)
for index, row in df.iterrows():
    df.at[index,"review"] = remove_spaces(row["review"]);

### Part 6: Removing stopwords from the review

In [48]:
import nltk
from nltk.corpus import stopwords

In [49]:
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\meyer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\meyer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [50]:
for index, row in df.iterrows():
    df.at[index,"review"] = ' '.join(word for word in row["review"].split() if word not in stop_words);

### Part 7: Applying lemmitization on the review

In [51]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

In [52]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\meyer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [53]:
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\meyer\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\meyer\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

##### Note: in order to run the code below it was giving some error saying nltk.download('punkt') and nltk.download('omw-1.4') is required.

In [54]:
from nltk.corpus import wordnet
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        # As default pos in lemmatization is Noun
        return wordnet.NOUN
for index, row in df.iterrows():
    review_word_tag_list = nltk.pos_tag(nltk.word_tokenize(row["review"]))
    df.at[index,"review"] =' '.join([lemmatizer.lemmatize(word,get_wordnet_pos(pos)) for (word, pos) in review_word_tag_list])

In [55]:
### Part 8: Encode the results

In [56]:
from sklearn.preprocessing import LabelEncoder
Encoder = LabelEncoder()
df.sentiment = Encoder.fit_transform(df.sentiment)

In [57]:
df.to_csv("cleanseDAta.csv");

In [58]:
## Run svm against the data.

In [59]:
### grab the first 70% of data to use as training

In [60]:
percentToTrain = 70
trainingData = df.head(int(len(df)*(percentToTrain/100)))

In [61]:
percentToTest = 100 - percentToTrain
testData = df.tail(int(len(df) * (percentToTest/100)))

0        one reviewer mention watch oz episode hook rig...
1        wonderful little production film technique una...
2        think wonderful way spend time hot summer week...
3        basically family little boy jake think zombie ...
4        petter mattei love time money visually stunnin...
                               ...                        
49995    think movie right good job creative original f...
49996    bad plot bad dialogue bad act idiotic direct a...
49997    catholic teach parochial elementary school nun...
49998    go disagree previous comment side maltin one s...
49999    one expect star trek movie high art fan expect...
Name: review, Length: 50000, dtype: object

In [62]:
### Vectorize the data ( here we can do a lot of different ways we will do a default way for now)
from sklearn.feature_extraction.text import TfidfVectorizer
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(df.review)
Train_X_Tfidf  = Tfidf_vect.transform(trainingData.review)
Test_X_Tfidf = Tfidf_vect.transform(testData.review)


In [63]:
### run svm against the data
from sklearn import svm
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,trainingData.sentiment);


In [64]:

predictions_SVM = SVM.predict(Test_X_Tfidf)

NameError: name 'accuracy_score' is not defined

In [65]:
from sklearn.metrics import accuracy_score
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, testData.sentiment)*100)

SVM Accuracy Score ->  88.36666666666667
