In [1]:
import pandas as pd
# Load the review dataset; per the author it contains 25k positive and 25k negative reviews.
# NOTE(review): the rendered output shows an 'Unnamed: 0' column, which suggests the CSV's
# saved index is being read back as a regular column - confirm, and consider index_col=0.
df = pd.read_csv('movie_data.csv')
# Last expression of the cell: displays the frame (df.head() would give a lighter preview).
df

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0
...,...,...
49995,"OK, lets start with the best. the building. al...",0
49996,The British 'heritage film' industry is out of...,0
49997,I don't even know where to begin on this one. ...,0
49998,Richard Tyler is a little boy who is scared of...,0


In [4]:
# We will use the bag-of-words model to turn the reviews into numerical feature vectors
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
# Vectorizer with default settings; its vocabulary is learned when it is fitted below.
count = CountVectorizer()
# Three toy documents used to try out the bag-of-words model.
docs = np.array([
        'The sun is shining',
        'The weather is sweet',
        'The sun is shining, the weather is sweet, and one and one is two'])
# Learn the vocabulary from docs and build the per-document term-count matrix.
bag = count.fit_transform(docs)

In [6]:
print(count.vocabulary_)  # mapping from each word to the column index assigned to it
print(bag.toarray())  # one row per document: occurrences of each word, ordered by that index

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}
[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


## To down-weight frequently occurring words that do not carry much meaning, we will use tf-idf

$$\text{idf}(t,d) = \text{log}\frac{n_d}{1+\text{df}(d, t)},$$
$$\text{tf-idf}(t,d)=\text{tf (t,d)}\times \text{idf}(t,d)$$
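### where $n_d$ is the total number of documents, and $\text{df}(d, t)$ is the number of documents $d$ that contain the term $t$. Note that scikit-learn's `TfidfTransformer` with `smooth_idf=True` (used below) computes the smoothed variant $\text{idf}(t) = \ln\frac{1+n_d}{1+\text{df}(d, t)} + 1$, so its output differs slightly from this textbook formula.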

In [8]:
from sklearn.feature_extraction.text import TfidfTransformer
np.set_printoptions(precision=2)  # print arrays with two decimal places
# tf-idf transformer with L2-normalised rows; per the author, smooth_idf prevents
# errors such as division by zero.
tfidf = TfidfTransformer(use_idf = True, norm = 'l2' , smooth_idf = True)
# Re-weight the raw term counts obtained above (bag) into tf-idf values and show them.
print(tfidf.fit_transform(bag).toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


## Data preparation

In [19]:
import re
def preprocessor(text):
    """Normalize one raw review into lower-case words followed by its emoticons.

    Steps, in order:
      1. Strip HTML-like tags (anything between ``<`` and ``>``).
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P`` before
         the punctuation they are made of is removed.
      3. Lower-case the text and collapse every run of non-word characters
         into a single space.
      4. Append the collected emoticons (with any ``-`` nose removed),
         separated by single spaces.

    Args:
        text: The raw review string.

    Returns:
        The cleaned string; for example ``"</a>This :) is :( a test :-)!"``
        becomes ``"this is a test :) :( :)"``.
    """
    # Raw strings keep the regex escapes from being parsed as (invalid)
    # Python string escapes, which newer Python versions warn about.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    emoticon_suffix = ' '.join(emoticons).replace('-', '')
    # Without a separator the first emoticon would be glued onto the last
    # word whenever the cleaned text ends in a word character
    # (e.g. "good :) movie" would become "good movie:)").
    if emoticon_suffix and cleaned and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + emoticon_suffix

In [11]:
# Clean every review with preprocessor.
# NOTE(review): this overwrites df['review'], so the raw text is no longer available afterwards.
df['review'] = df['review'].apply(preprocessor)

## Tokenization of documents

### Now we want to reduce derived words to their base form (for example, "running" and "runs" to "run"). For this we will use stemming, which simply strips a chunk of letters from the end of the word.

In [12]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()  # shared stemmer instance used by tokenizer_porter

def tokenizer(text):
    """Split ``text`` into a list of tokens on runs of whitespace.

    Uses ``str.split()`` with no arguments, so leading/trailing whitespace
    produces no empty tokens and an empty string yields an empty list.
    """
    tokens = text.split()
    return tokens
def tokenizer_porter(text):
    """Tokenize ``text`` with ``tokenizer`` and stem every token.

    Each whitespace-separated token is passed through ``porter.stem``;
    the stems are returned as a list in the original token order.
    """
    stems = []
    for token in tokenizer(text):
        stems.append(porter.stem(token))
    return stems

In [16]:
# Only the last expression of a cell is displayed, so the plain tokenizer result on the
# next line is computed but not shown; the output is the stemmed token list.
tokenizer('runners like running and thus they run')
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [17]:
import nltk
# Download NLTK's stop-word lists so that common words such as prepositions and
# articles can be removed from sentences.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Biohazard\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [18]:
from nltk.corpus import stopwords

stop = stopwords.words('english')  # English stop-word list from NLTK
# Stem the sample sentence, then keep only the stems that are not in the stop-word list.
# The [-10:] slice keeps at most the last ten tokens; this sentence has fewer, so nothing is dropped.
# NOTE(review): filtering happens after stemming, so a stop word whose stem differs from
# its listed form would not be matched - confirm this order is intended.
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:]
if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

## Transform data into TF-IDF 

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the tf-idf feature matrix for all reviews in one step.
# Note: this rebinds `tfidf`, which earlier referred to the TfidfTransformer demo object.
# NOTE(review): the `stop` list built earlier is not passed here, so stop words
# are presumably kept as features - confirm this is intended.
tfidf = TfidfVectorizer(

        lowercase=False,  # presumably because preprocessor already lower-cases the reviews - confirm
        preprocessor = None,
        tokenizer=tokenizer_porter,  # whitespace split followed by Porter stemming
        use_idf=True,
        norm = 'l2',
        smooth_idf=True

)
# Target labels taken from the 'sentiment' column.
y = df.sentiment.values
# NOTE(review): the vectorizer is fitted on every review before the train/test split,
# so the vocabulary and idf weights also see the rows later used for testing.
# Consider splitting first, fitting on the training rows and only transforming the test rows.
x = tfidf.fit_transform(df.review)

## classifying reviews using Logistic Regression

In [23]:
#splitting data into train and test
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
# Hold out half of the rows for testing; shuffle=False splits the rows in their existing order.
# NOTE(review): random_state presumably has no effect when shuffle=False - confirm against the
# scikit-learn documentation; the split also relies on the file order already being mixed.
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state = 1,test_size = 0.5,shuffle=False)

In [32]:
# Fit a logistic-regression classifier on the training split, using 5-fold
# cross-validation scored by accuracy. This cell is slow: the recorded run
# took several minutes.
clf = LogisticRegressionCV(
                            cv = 5,
                            scoring = 'accuracy',
                            random_state = 0,
                            verbose = 3,
                            max_iter = 300).fit(x_train,y_train)

# Persist the fitted model. The context manager closes the file even if
# pickle.dump raises, which the manual open()/close() pair did not guarantee.
with open('saved_model.sav','wb') as saved_model:
    pickle.dump(clf,saved_model)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   40.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.3min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.6min finished


## Model Evaluation

In [29]:
# Reload the classifier saved by the training cell. The context manager closes
# the file handle, which the bare open() call left open.
# NOTE: unpickling can execute arbitrary code - only load files you created yourself.
with open('saved_model.sav','rb') as model_file:
    model = pickle.load(model_file)

In [31]:
# Evaluate the reloaded model on the held-out split; the recorded result is 0.89604.
model.score(x_test,y_test)



0.89604