# CH08: Applying Machine Learning to Sentiment Analysis

In [45]:
import os, sys
# Put the parent of the current working directory on the module search path
# so that modules located one level above this notebook can be imported.
parent_dir = os.path.join(os.getcwd(), "..")
sys.path.append(parent_dir)

## Preparing the IMDb movie review data

In [47]:
import pandas as pd

# Read the movie-review CSV (decoded as UTF-8) into a DataFrame.
movie_data_path = "../datasets/aclImdb/movie_data.csv"
df = pd.read_csv(movie_data_path, encoding="utf-8")

In [48]:
# Preview the first three rows of the loaded frame.
df.head(3)

Unnamed: 0,review,sentiment
0,This movie is the Latino Godfather. An unlikel...,1
1,Three young movie theater employees are given ...,0
2,Remember when Harrison Ford was the biggest st...,0


### Bag-of-words model

In [7]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [29]:
# Print floating-point array values with two digits of precision.
np.set_printoptions(precision=2)

In [24]:
# Three toy sentences used to illustrate the bag-of-words representation.
docs = np.array(
    [
        "The sun is shining",
        "The weather is sweet",
        "The sun is shining and the weather is sweet, and one and one is two",
    ]
)

# ngram_range=(1, 1) restricts the vocabulary to single tokens (unigrams).
count = CountVectorizer(ngram_range=(1, 1))

# Learn the vocabulary from `docs` and build the document-term count matrix.
bag = count.fit_transform(docs)

In [25]:
# Show the learned mapping from each token to its column index in `bag`.
print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [26]:
# Convert the document-term matrix to a dense array so the per-token counts
# of each document (one row per document) can be printed.
print(bag.toarray())

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [3 3 2 1 1 1 2 1 1]]


In [27]:
from sklearn.feature_extraction.text import TfidfTransformer

In [39]:
# Re-weight the raw term counts as tf-idf values using smoothed idf and
# L2-normalised rows.
tfidf = TfidfTransformer(use_idf=True, norm="l2", smooth_idf=True)
tfidf_matrix = tfidf.fit_transform(bag)
print(tfidf_matrix.toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.66 0.39 0.44 0.17 0.17 0.17 0.26 0.22 0.17]]


## Cleaning text data

In [49]:
# Look at the last 50 characters of the first review.
df.loc[0, "review"][-50:]

'to be tied. Son comes back and avenges dads death?'

In [50]:
import re
def preprocessor(text):
    """Normalise a raw review string before tokenization.

    Processing steps:
      1. Remove HTML-like tags (anything between ``<`` and ``>``).
      2. Collect emoticons such as ``:)``, ``;-(`` or ``=D`` before the
         punctuation they are made of is stripped.
      3. Lowercase the text and collapse every run of non-word characters
         into a single space.
      4. Append the collected emoticons, with the ``-`` "nose" removed,
         separated by single spaces.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The cleaned, lowercased string with any emoticons appended at the end.
    """
    # Raw strings keep ``\)``, ``\(`` and ``\W`` as regex escapes instead of
    # invalid Python string escapes (which trigger a warning on newer Pythons).
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    if not emoticons:
        return cleaned
    # When the text ends in a word character there is no trailing space, so
    # add one; otherwise the first emoticon would be glued to the last word
    # (e.g. "movie:)") and a whitespace tokenizer would see a single token.
    if not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + ' '.join(emoticons).replace('-', '')

In [51]:
# Clean every review with `preprocessor` and store the result back in the
# same column.
cleaned_reviews = df["review"].apply(preprocessor)
df["review"] = cleaned_reviews

In [52]:
def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

In [53]:
from nltk.stem.porter import PorterStemmer

In [54]:
# A single stemmer instance shared by every call to `tokenizer_porter`.
porter = PorterStemmer()


def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each token."""
    stems = []
    for word in text.split():
        stems.append(porter.stem(word))
    return stems


# Quick demonstration on a sample sentence.
tokenizer_porter("runners like running and thus they run")

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [56]:
import nltk
# Fetch the NLTK "stopwords" package; the recorded output shows that NLTK
# reports it as already up-to-date when it is present locally.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/kuba/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [59]:
from nltk.corpus import stopwords

# Load NLTK's English stop-word list.
stop = stopwords.words("english")
# Display a small sample of it (entries at indices 1 through 4).
stop[1:5]

['me', 'my', 'myself', 'we']