In [13]:
import pandas as pd
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer


In [14]:
data = pd.read_csv('IMDB Dataset.csv')

In [15]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
data.shape

(50000, 2)

In [5]:
data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,negative
freq,5,25000


In [6]:
data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [7]:
data['review'][1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

# Text CLeaning

### we need to remove special characters
### We need to filter out the stop words
### we need to split the text into tokens

In [8]:
#using re.compile pattenstring is converted to patten objects
no_space = re.compile("[.;:!\'?,\"()\[\]]")
space = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

##array of stopwords
stop_words = (stopwords.words('english'))


#to tokenize using regular expression method
tokenizer=RegexpTokenizer(r'\w+')

stemmer = SnowballStemmer(language='english')

#words are reduced to its root form using stemming method


def data_cleaning(review):
    clean_review = review.lower() #converted each letter to lower case
    clean_review = no_space.sub("",review) # all special character beloning to no_space objects replaced by ""
    clean_review = space.sub("",review) # all special character beloning to space objects replaced by " "
    tokens = tokenizer.tokenize(clean_review)
    clean_tokens =[]
    
    #stemming is done to reduce word to its root form
    for token in tokens:
        if token not in stop_words:
            clean_tokens.append(token)
    
    stem_tokens=[]
    
    for token in clean_tokens:
        stem_tokens.append(stemmer.stem(token))
        
    clean_review=' '.join(stem_tokens)

        
    return clean_review

In [10]:
data['review'].apply(data_cleaning)

0        one review mention watch 1 oz episod hook they...
1        a wonder littl product the film techniqu unass...
2        i thought wonder way spend time hot summer wee...
3        basic famili littl boy jake think zombi closet...
4        petter mattei love time money visual stun film...
                               ...                        
49995    i thought movi right good job it creativ origi...
49996    bad plot bad dialogu bad act idiot direct anno...
49997    i cathol taught parochi elementari school nun ...
49998    i go disagre previous comment side maltin one ...
49999    no one expect star trek movi high art fan expe...
Name: review, Length: 50000, dtype: object

In [16]:
data.shape

(50000, 2)

In [17]:
train_reviews=data.review[:40000]
train_sentiments=data.sentiment[:40000]

In [18]:
test_reviews=data.review[40000:]
test_sentiments=data.sentiment[40000:]

In [24]:
train_sentiments.head()

0    positive
1    positive
2    positive
3    negative
4    positive
Name: sentiment, dtype: object

## n-gram vectorization

###### document term matrix is generated and each cell represents the count