**PREPROCESSING the movie dataset**

In [1]:
from google.colab import files
uploaded = files.upload()

Saving movie_data.csv to movie_data.csv


In [54]:
import pandas as pd
import io

df = pd.read_csv(io.BytesIO(uploaded['movie_data.csv']))

df.head(3)

Unnamed: 0,review,sentiment
0,"After five years in prison, Tony le Stéphanois...",1
1,I am a fan of Ed Harris' work and I really had...,0
2,I can appreciate what Barney is trying to achi...,0


Transforming into feature vectors

In [3]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer()

docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, and one and one is two'
])
bag = count.fit_transform(docs)

print(count.vocabulary_)
print(bag.toarray())

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}
[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


Assessing word relevancy via Term frequency-inverse document frequency(tfidf)

In [4]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
np.set_printoptions(precision=2)

print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


Cleaning the text data

In [5]:
import re #regular expression (regex)

def preprocessor(text):
  text = re.sub('<[^>]*>', '', text)
  emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
  text = (re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', ''))

  return text

In [6]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [7]:
df['review'] = df['review'].apply(preprocessor)

Processing documents into tokens

In [8]:
def tokenizer(text):
  return text.split()

tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [10]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

def tokenizer_porter(text):
  return [porter.stem(word) for word in text.split()]

tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

stop-word removal

In [11]:
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [14]:
from nltk.corpus import stopwords

stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

**Training a LOGISTIC REGRESSION model for document classification**

In [15]:
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenzier': [tokenizer, tokenizer_porter],
               'vect__use_idf': [False],
               'vect__norm': [None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]}
              ]
lr_tfidf = Pipeline([('vect', tfidf), ('clf', LogisticRegression(random_state=0))])
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=-1)

gs_lr_tfidf.fit(X_train, y_train)

In [22]:
# After running the grid search we obtain:
# Best parameter_set: {'clf__C': 10.0, 'vect__stop_words': None, 'clf__penalty': 'l2', 'vect__tokenizer': {regular tokenizer}, 'vect__ngram_range': (1, 1)}
# CV accuracy for the best model = 0.892

In [24]:
from sklearn.pipeline import make_pipeline

clf_best = make_pipeline(TfidfVectorizer(ngram_range=(1, 1), stop_words=None, tokenizer=tokenizer), LogisticRegression(penalty='l2', C=10.0))

clf_best.fit(X_train ,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(tokenizer=<function tokenizer at 0x7f5bd999bca0>)),
                ('logisticregression', LogisticRegression(C=10.0))])

In [25]:
print('Test accuracy: %.3f' %clf_best.score(X_test, y_test))

Test accuracy: 0.895


**Online algorithms and Out-of-Core learning**



In [26]:
import numpy as np
import re
from nltk.corpus import stopwords
stop = stopwords.words('english')

def tokenizer(text):
  text = re.sub('<[^>]*>', '', text)
  emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
  tokenized = [w for w in text.split() if w not in stop]
  return tokenized


to read in and return one document at a time

In [27]:
def stream_docs(path):
  with open(path, 'r', encoding='utf-8') as csv:
    next(csv) #skip header
    for line in csv:
      text, label = line[:-3], int(line[-2])
      yield text, label

next(stream_docs(path='movie_data.csv'))

('"After five years in prison, Tony le Stéphanois (Jean Servais) meets his dearest friends Jo (Carl Möhner) and the Italian Mario Ferrati (Robert Manuel) and they invite Tony to steal a couple of jewels from the show-window of the famous jewelry Mappin & Webb Ltd, but he declines. Tony finds his former girlfriend Mado (Marie Sabouret), who became the lover of the gangster owner of the night-club L\' Âge d\' Or Louis Grutter (Pierre Grasset), and he humiliates her, beating on her back and taking her jewels. Then he calls Jo and Mario and proposes a burglary of the safe of the jewelry. They invite the Italian specialist in safes and elegant wolf Cesar (Perlo Vita) to join their team and they plot a perfect heist. They are successful in their plan, but the D. Juan Cesar makes things go wrong when he gives a valuable ring to his mistress.<br /><br />""Du Rififi Chez les Hommes"" is a magnificent film-noir, certainly among the best I have seen. The screenplay has credibility, supported by a

function to return a particular number of documents specified by the *size* parameter

In [28]:
def get_minibatch(doc_stream, size):
  docs, y = [], []
  try:
    for _ in range(size):
      text, label = next(doc_stream)
      docs.append(text)
      y.append(label)
  except StopIteration:
    return None, None
  return docs, y

since CountVectorizer and TfidfVectorizer cannot be used we instead use the HsshingVectorizer which uses the hashing trick

In [32]:
%pip install pyprind

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyprind
  Downloading PyPrind-2.11.3-py2.py3-none-any.whl (8.4 kB)
Installing collected packages: pyprind
Successfully installed pyprind-2.11.3


In [30]:
from sklearn.feature_extraction.text import HashingVectorizer
# we need a Stochastic gradient descent algorithm for online learning
from sklearn.linear_model import SGDClassifier

vect = HashingVectorizer(decode_error='ignore', n_features=2**21, preprocessor=None, tokenizer=tokenizer)
clf = SGDClassifier(loss='log', random_state=1, max_iter=1)
doc_stream = stream_docs(path='movie_data.csv')

choosing a large number of features for the Hashingvectorizer, we also increase the coefficients in the log Reg.

In [41]:
import pyprind
pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])
for _ in range(45):
  X_train, y_train = get_minibatch(doc_stream, size=1000)
  if not X_train:
    break
  X_train = vect.transform(X_train)
  clf.partial_fit(X_train, y_train, classes=classes)
  pbar.update()

Now using the last 5K documents to evaluate performance

In [None]:
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

**LATENT DIRICHLET ALLOCATION** (Topic Modelling)

In [43]:
import pandas as pd
df = pd.read_csv('movie_data.csv', encoding='utf-8')

In [46]:
# to create a bag-of-words matrix we use the CountVectorizer
# we exclude all the words with document frequency > 10%

In [47]:
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(stop_words='english', max_df=.1, max_features=5000)
X = count.fit_transform(df['review'].values)

In [48]:
# we chose the batch learning method instead of the online faster method

In [51]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=10, random_state=123, learning_method='batch')
X_topics = lda.fit_transform(X)

In [52]:
n_top_words = 5
feature_names = count.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
  print('Topic %d:' %(topic_idx + 1))
  print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1: -1]]))

Topic 1:
worst minutes script awful stupid
Topic 2:
family mother father children girl
Topic 3:
american war dvd music tv
Topic 4:
human audience cinema art sense
Topic 5:
police guy car dead murder
Topic 6:
horror house sex blood girl
Topic 7:
role performance comedy actor performances
Topic 8:
series episode episodes war season
Topic 9:
book version original effects special
Topic 10:
action fight guy fun kids


