# Chapter 8: Applying Machine Learning to Sentiment Analysis

In [2]:
import pyprind
import pandas as pd
import os

In [2]:
# Progress bar sized for 50000 update() calls -- presumably one per review
# file in the dataset; confirm against the data on disk.
pbar = pyprind.ProgBar(50000)
# Maps the sentiment sub-directory name to its integer class label.
labels = {'pos': 1, 'neg' : 0}
# Placeholder frame; it is populated by the loading cell that follows.
df = pd.DataFrame()

'C:\\Users\\cbcastleberry\\Documents\\GitHub\\Python_Machine_Learning'

In [5]:
# Read every review file once and build the DataFrame in a single step.
# Growing a frame row-by-row with DataFrame.append copies the whole frame on
# every call (quadratic time), and the method was removed in pandas 2.0.
records = []
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = './aclImdb/{}/{}'.format(s, l)
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # row order (and therefore any seeded shuffle of it) reproducible.
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path, file),
                      'r', encoding='utf8') as infile:
                txt = infile.read()
            records.append([txt, labels[l]])
            pbar.update()
# Columns get the default integer names 0 (review text) and 1 (label).
df = pd.DataFrame(records)

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:13:02


In [3]:
import numpy as np

In [7]:
np.random.seed(0)

In [8]:
df = df.reindex(np.random.permutation(df.index))

In [11]:
df.to_csv('movie_data.csv', index=False, encoding='utf8')

In [4]:
df = pd.read_csv('movie_data.csv')
df.head(3)

Unnamed: 0,0,1
0,"At least if you're a Disney fanatic (well, of ...",0
1,"great mystery, but the film goes down hill fro...",0
2,I concur with what mallicka.b has said. The mo...,1


## Bag of Words Model

In [14]:
from sklearn.feature_extraction.text import  CountVectorizer

In [15]:
count = CountVectorizer()

In [19]:
docs = np.array(['The sun is shining',
                'The weather is sweet',
                'The sun is shining and the weather is sweet'])

In [20]:
bag = count.fit_transform(docs)

In [21]:
print(count.vocabulary_)

{'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'weather': 6, 'sweet': 4, 'and': 0}


In [22]:
print(bag.toarray())

[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


In [23]:
from sklearn.feature_extraction.text import TfidfTransformer

In [24]:
tfidf = TfidfTransformer()

In [25]:
np.set_printoptions(precision=2)

In [26]:
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[ 0.    0.43  0.56  0.56  0.    0.43  0.  ]
 [ 0.    0.43  0.    0.    0.56  0.43  0.56]
 [ 0.4   0.48  0.31  0.31  0.31  0.48  0.31]]


### Cleaning Text Data

In [5]:
df.columns = ['review', 'sentiment']

In [7]:
df.loc[0, 'review'][-50:]

"o blatantly tacky that you can't help but love it."

In [8]:
import re

In [9]:
def preprocessor(text):
    """Clean a raw review string while preserving its emoticons.

    Steps:
      1. Remove HTML-style tags (anything between ``<`` and ``>``).
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` from the text.
      3. Lowercase the text and collapse every run of non-word characters
         into a single space.
      4. Append the collected emoticons (with the ``-`` "nose" removed),
         separated by spaces, to the end of the cleaned text.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text followed by the emoticons found in it.
    """
    # Raw strings: '\)' and '\W' are invalid escapes in normal literals and
    # trigger warnings on current Python versions.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    suffix = ' '.join(emoticons).replace('-', '')
    # Without a separator the first emoticon fuses with the last word when
    # the cleaned text does not already end in a space ('movie' + ':)').
    if suffix and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + suffix

In [10]:
preprocessor('<a> This :) is a :( test </a>)')

' this is a test :) :('

In [11]:
df['review'] = df['review'].apply(preprocessor)

## Processing documents into tokens

In [12]:
def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the resulting tokens."""
    tokens = text.split()
    return tokens

In [13]:
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [14]:
from nltk.stem.porter import PorterStemmer

In [15]:
porter = PorterStemmer()

In [16]:
def tokenizer_porter(text):
    """Split ``text`` on whitespace and pass each token through ``porter.stem``."""
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

In [17]:
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

### Remove Stop Words

In [18]:
import nltk

In [19]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cbcastleberry\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
from nltk.corpus import stopwords

In [21]:
stop = stopwords.words('english')

In [22]:
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

## Training a logistic regression model for document classification

In [23]:
# Positional split: first 25,000 rows for training, the rest for testing.
# Label-based ``df.loc[:25000]`` includes its end point, which placed row
# 25000 in BOTH the training and the test set (one leaked sample).
n_train = 25000
X_train = df['review'].values[:n_train]
X_test = df['review'].values[n_train:]
y_train = df['sentiment'].values[:n_train]
y_test = df['sentiment'].values[n_train:]

In [25]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
# Vectorizer used as the 'vect' step of the pipeline. No accent stripping or
# lowercasing is requested here -- presumably because the 'review' column was
# already cleaned and lowercased by preprocessor() earlier in the notebook.
# NOTE: this rebinds `tfidf`, which earlier referred to a TfidfTransformer.
tfidf = TfidfVectorizer(strip_accents=None,
                       lowercase=False,
                       preprocessor=None)

In [27]:
# Settings shared by both grids: unigrams only, with and without the stop-word
# list, the plain and the Porter-stemming tokenizer, and the classifier's
# penalty type and C value.
_common_grid = {'vect__ngram_range': [(1, 1)],
                'vect__stop_words': [stop, None],
                'vect__tokenizer': [tokenizer, tokenizer_porter],
                'clf__penalty': ['l1', 'l2'],
                'clf__C': [1.0, 10.0, 100.0]}

# The first grid leaves the remaining vectorizer settings untouched; the
# second additionally sets use_idf=False and norm=None.
param_grid = [dict(_common_grid),
              dict(_common_grid,
                   vect__use_idf=[False],
                   vect__norm=[None])]

In [28]:
# Two-step pipeline: vectorize the text, then classify it.
# The parameter grid searches both the 'l1' and 'l2' penalties. The default
# solver in scikit-learn >= 0.22 ('lbfgs') does not support 'l1', so every
# 'l1' candidate would fail to fit. 'liblinear' supports both penalties and
# was the default solver in older releases, so results there are unchanged.
lr_tfidf = Pipeline([
    ('vect', tfidf),
    ('clf', LogisticRegression(random_state=0, solver='liblinear'))
])

In [30]:
# Grid search over every combination in param_grid, scored by accuracy with
# 3-fold cross-validation; verbose=1 reports the number of fits being run.
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid=param_grid,
                          scoring='accuracy', cv=3,
                          verbose=1)

In [None]:
gs_lr_tfidf.fit(X_train, y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


In [None]:
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)

## Working with bigger data

In [1]:
import numpy as np

In [2]:
import re

In [3]:
from nltk.corpus import stopwords

In [4]:
stop = stopwords.words('english')

In [5]:
def tokenizer(text):
    """Clean a raw review and return its tokens with stop words removed.

    HTML-style tags are stripped, emoticons such as ``:)`` or ``;-(`` are
    collected, the text is lowercased and every run of non-word characters
    is collapsed to a space. The emoticons (minus the ``-`` "nose") are then
    appended, the result is split on whitespace, and tokens present in the
    module-level ``stop`` list are dropped.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The remaining tokens, emoticons last.
    """
    # Raw strings: '\)' and '\W' are invalid escapes in normal literals and
    # trigger warnings on current Python versions.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    # The explicit space keeps the first emoticon from fusing with the last
    # word (previously 'good :) movie' produced the single token 'movie:)').
    text = cleaned + ' ' + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

In [6]:
def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a CSV file, one per line.

    The first line is treated as a header and skipped. Every following
    non-empty line is split at its LAST comma: everything before it is
    yielded as the text (unchanged, including any surrounding CSV quotes)
    and everything after it is converted to ``int`` as the label.

    Parameters
    ----------
    path : str
        Path of the UTF-8 encoded CSV file.

    Yields
    ------
    tuple of (str, int)
        The raw text field and its integer label.

    Raises
    ------
    ValueError
        If the part after the last comma of a line is not an integer.
        NOTE: lines are read physically, so a quoted field containing an
        embedded newline is not supported.
    """
    with open(path, 'r', encoding='utf8') as csv_file:
        # Skip the header; the default avoids a RuntimeError on an empty file.
        next(csv_file, None)
        for line in csv_file:
            # Strip the terminator instead of assuming fixed offsets, so a
            # final line without a trailing newline is parsed correctly.
            line = line.rstrip('\n')
            if not line:
                continue
            text, _, label = line.rpartition(',')
            yield text, int(label)

In [7]:
next(stream_docs(path='./movie_data.csv'))

('"At least if you\'re a Disney fanatic (well, of the variety who loves their live-action films as well as the animated stuff), if you\'re a kid, if you\'re a kid at heart almost to the extent that you hardly realize you\'re an adult, if you love absolutely any film that features animals, especially when they\'re doing tricks, or if you\'re just not too demanding, Air Bud: World Pup is somewhat enjoyable to watch. I\'m a Disney fanatic. I enjoyed this film enough, and I\'ll gladly watch it again.<br /><br />But boy does it have a lot of problems. The main flaw arises from a combination of too many characters, too many plot threads and not enough time to take care of them all. In the space of 82 minutes, we\'ve got adults getting married, teens falling in love and trying not to be awkward at it, teen competition for love and jealousy, preteens playing spy games, dogs falling in love, dogs playing soccer, dogs having puppies, manipulative parents who\'ll do anything to make their kids wi

In [None]:
def get_minibatch(doc_stream, size):
    """Pull the next ``size`` documents from an iterator of (text, label) pairs.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding ``(text, label)`` tuples, advanced with ``next``.
    size : int
        Number of documents to read for this batch.

    Returns
    -------
    tuple
        ``(docs, y)`` -- two lists of length ``size`` -- when a full batch
        could be read, or ``(None, None)`` if the stream ran out first (any
        partially read batch is discarded).
    """
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        # Was `return None None`, a syntax error; callers unpack two values.
        return None, None
    return docs, y