*Python Machine Learning 3rd Edition* by [Sebastian Raschka](https://sebastianraschka.com), Packt Publishing Ltd. 2019

Code Repository: https://github.com/rasbt/python-machine-learning-book-3rd-edition

Code License: [MIT License](https://github.com/rasbt/python-machine-learning-book-3rd-edition/blob/master/LICENSE.txt)

# Python Machine Learning - Code Examples

# Notebook Setup

When you are done with this notebook, run the following code cell to unmount Google Drive

In [None]:
# Run this cell only at the end of a session: it flushes pending writes to
# Google Drive and then unmounts it from the Colab runtime.
from google.colab import drive
drive.flush_and_unmount()
print('All changes made in this colab session should now be visible in Drive.')

The following code cell installs additional dependencies required to run the Jupyter Notebooks used in this class.

In [None]:
# Add additional Python packages that we will be using in class
# (lines starting with "!" are executed by the notebook as shell commands)
!apt install graphviz build-essential checkinstall imagemagick
# Base Python packages to run example Jupyter Notebooks
!pip install watermark pyprind mlxtend
# Python packages to visualize Decision Tree Classifiers
!pip install pydotplus graphviz pyparsing
# Python packages for Natural Language Processing
!pip install nltk
# Python packages for Flask-based web applications
!pip install flask wtforms
# Python packages for TensorFlow
!pip install tensorflow tensorflow-datasets

The following code cell defines a variable equal to the location inside your Google Drive where you copied the project folder (`Project02`).

In [None]:
##### TODO: CHANGE THIS TO THE PATH IN GOOGLE DRIVE WHERE YOU COPIED THE Project02 FOLDER #####
# The value is appended after the "My Drive" folder of the mounted Drive by
# the next cell, so it is a path relative to the top of your Drive.
google_drive_root='/Colab Notebooks/Project02'

In [None]:
google_drive_mount_location = '/content/drive'
# Build "<mount>/My Drive/<root>/" with exactly one separator between parts.
# google_drive_root may be typed with or without surrounding slashes, so they
# are stripped before joining; plain concatenation would otherwise produce
# "My Drive//..." (or a missing separator when the slashes are omitted).
# The trailing '/' is kept on purpose: later cells append names with plain
# string concatenation, e.g. google_file_prefix + 'amazonclassifier'.
# NOTE(review): other cells in this notebook spell the folder "MyDrive" --
# confirm which spelling exists on the Colab runtime you are using.
google_file_prefix = (
    google_drive_mount_location + '/My Drive/' + google_drive_root.strip('/')
).rstrip('/') + '/'

The following code cell mounts your Google Drive into the runtime of the workbook, so that you can access files.

In [None]:
# Read more here: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=D78AM1fFt2ty
from google.colab import drive
drive.mount(google_drive_mount_location)

In [None]:
import os
os.chdir(google_file_prefix)

# Project 2: Sentiment Analysis - Amazon Product Reviews

# **1. Experiment Objective**
The objective of this experiment is to train a machine learning model on Amazon product reviews.

# **TOS information:**

Amazon Product Reviews from Kaggle

https://www.kaggle.com/datafiniti/consumer-reviews-of-amazon-products

Okay to use for academic purposes.



# **Data Collection**

In [None]:
import numpy as np
import pandas as pd

# Read the raw Kaggle export from the project folder configured in the setup
# cells (google_file_prefix) instead of a hard-coded Drive path, so editing
# google_drive_root is enough to point the whole notebook at another folder.
df = pd.read_csv(google_file_prefix + 'Amazon_Reviews.csv')

In [None]:
df.head()

In [None]:
df.info()

# **Data Preprocessing**

In [None]:
df.isnull().sum()

In [None]:
df = df[['reviews.rating','reviews.text']]

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
df = df.dropna(axis=0)

In [None]:
df.isnull().sum()

**Label encoding the sentiment column 'reviews.rating'**

In [None]:
# Binary label encoding of the star rating: ratings 1-3 map to 0 and
# ratings 4-5 map to 1.
sentiment = {rating: int(rating >= 4) for rating in range(1, 6)}

df['sentiment'] = df['reviews.rating'].map(sentiment)

In [None]:
df['sentiment'].unique()

In [None]:
df['sentiment'].value_counts()

In [None]:
df = df.drop('reviews.rating',axis=1)

In [None]:
df.rename(columns={"reviews.text":"review"} ,inplace=True)

In [None]:
df.info()

**Save a cleaned copy to a csv.**

In [None]:
# Save inside the configured project folder (google_file_prefix) rather than a
# hard-coded Drive path, so the notebook keeps working when
# google_drive_root is changed in the setup cell.
df.to_csv(google_file_prefix + 'ama_data.csv', index=False, encoding='utf-8')

## **Cleaning text data**

In [None]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

In [None]:
df['review'] = df['review'].apply(preprocessor)

In [None]:
# Show the first remaining review by position. Rows were removed with dropna()
# without resetting the index, so the *label* 0 is not guaranteed to exist and
# df['review'][0] could raise KeyError.
df['review'].iloc[0]

## **Processing words into tokens**

In [None]:
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

def tokenizer(text):
    """Return the whitespace-separated tokens of *text* as a list of strings."""
    tokens = text.split()
    return tokens


def tokenizer_porter(text):
    """Split *text* on whitespace and return ``porter.stem`` of each token, in order."""
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('stopwords')
# word_tokenize relies on the Punkt tokenizer models, which are downloaded
# separately from the nltk package; without them the apply() below fails with
# LookupError. Recent NLTK releases look for 'punkt_tab' rather than 'punkt',
# so both are requested (a name unknown to the installed release is only
# reported as a failed download).
nltk.download('punkt')
nltk.download('punkt_tab')

df['review'] = df['review'].apply(word_tokenize)
stop = stopwords.words('english')
# Filter against a set for O(1) membership tests. `stop` itself stays a list
# because later cells pass it to the vectorizer grid and pickle it.
_stop_set = set(stop)
df['review'] = df['review'].apply(lambda words: [word for word in words if word not in _stop_set])

In [None]:
# Inspect the token list of the first remaining review. Positional access is
# used because dropna() was applied without resetting the index, so the
# label 0 may no longer be present.
df['review'].iloc[0]

In [None]:
# Turn each token list back into one space-separated string for the
# vectorizers. astype(str) would store the Python list repr
# ("['great', 'product']"), so a whitespace tokenizer would later see tokens
# polluted with brackets, quotes and commas (e.g. "['great',").
# The isinstance guard keeps the cell safe to re-run once the column already
# holds strings.
df['review'] = df['review'].apply(
    lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens))

In [None]:
# Inspect the first remaining review after converting it back to a string.
# .iloc[0] is positional; the label 0 may have been removed by dropna().
df['review'].iloc[0]

## **Constructing a TF-IDF Vectorizer**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn a vocabulary limited to 5000 features from the reviews (fit() returns
# the vectorizer itself), then encode every review as a TF-IDF row.
vectorizer = TfidfVectorizer(max_features=5000).fit(df['review'])
features = vectorizer.transform(df['review'])

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on old releases.
_feature_names = (vectorizer.get_feature_names_out()
                  if hasattr(vectorizer, 'get_feature_names_out')
                  else vectorizer.get_feature_names())
# One column per vocabulary term, one row per review (dense copy of the
# sparse matrix, so memory grows with rows x features).
tf_idf = pd.DataFrame(features.toarray(), columns=_feature_names)

In [None]:
tf_idf.head()

# **Model Training**

In [None]:
X = df['review']
y = df['sentiment']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; random_state=1 fixes the shuffle so
# the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import GridSearchCV

# Vectorizer used inside the pipeline. Lowercasing, accent stripping and the
# built-in preprocessor are switched off here; tokenization and stop-word
# handling are supplied per grid entry below.
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

# loss = ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
# Regularization settings explored for the SGD classifier.
penalty = ['l2','l1']
alpha = [0.001, 0.01, 0.1]


# Two sub-grids are searched:
#   1. the vectorizer's default use_idf/norm settings;
#   2. use_idf toggled on/off with normalization disabled (norm=None).
# Both vary the n-gram range and the classifier's penalty/alpha, and both use
# the whitespace `tokenizer` function and the `stop` list defined earlier.
param_grid = [{'vect__ngram_range': [(1, 1),(1,2)],
               'vect__stop_words': [stop],
               'vect__tokenizer': [tokenizer],
               'clf__penalty': penalty,
               'clf__alpha': alpha},
              {'vect__ngram_range': [(1, 1), (1,2)],
               'vect__stop_words': [stop],
               'vect__tokenizer': [tokenizer],
               'vect__use_idf':[True, False],
               'vect__norm':[None],
               'clf__penalty': penalty,
               'clf__alpha': alpha},
              ]

# Pipeline step names ('vect', 'clf') are the prefixes used in param_grid.
sgd_tfidf = Pipeline([('vect', tfidf),
                     ('clf', SGDClassifier(random_state=0, loss='hinge'))])

# 5-fold cross-validated search scored by accuracy; n_jobs=-1 asks
# scikit-learn to use all available cores, verbose=2 prints progress per fit.
gs_sgd_tfidf = GridSearchCV(sgd_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=2,
                           n_jobs=-1)

In [None]:
gs_sgd_tfidf.fit(X_train, y_train)

In [None]:
print('Best parameter set: %s ' % gs_sgd_tfidf.best_params_)
print('CV Accuracy: %.3f' % gs_sgd_tfidf.best_score_)

In [None]:
clf = gs_sgd_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

**Optimal hyperparameters found**

{'clf__alpha': 0.001, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__norm': None, 'vect__use_idf': False}

**Now to fit the SGD classifier on these hyperparameters and pickle the file for out-of-core learning**

In [None]:
# Rebuild the vectorizer for the final model and encode the whole dataset.
# NOTE(review): the markdown cell above reports ngram_range=(1, 1) and
# use_idf=False as the best grid-search result, but this vectorizer is built
# with ngram_range=(1, 2) and use_idf=True -- confirm which is intended.
vectorizer = TfidfVectorizer(ngram_range= (1,2), norm=None, use_idf=True, strip_accents=None, lowercase=False, preprocessor=None, max_features = 5000)
vectorizer.fit(df['review'])
features = vectorizer.transform(df['review'])

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on old releases.
_feature_names = (vectorizer.get_feature_names_out()
                  if hasattr(vectorizer, 'get_feature_names_out')
                  else vectorizer.get_feature_names())
# NOTE: this rebinds `tfidf`, which previously named the TfidfVectorizer used
# in the grid-search pipeline, to a dense DataFrame of features.
tfidf = pd.DataFrame(features.toarray(), columns=_feature_names)

In [None]:
tfidf.head()

In [None]:
from sklearn.linear_model import SGDClassifier
# Hinge-loss linear classifier with L2 penalty and alpha=0.001; random_state
# fixes the shuffling so the fit is reproducible.
sgd = SGDClassifier(alpha = 0.001,penalty = 'l2', random_state=0, loss='hinge')
# partial_fit needs the complete set of labels via `classes`; it returns the
# estimator, so `clf` and `sgd` refer to the same fitted object.
# NOTE(review): a later cell calls clf.predict_proba, which scikit-learn does
# not provide for loss='hinge' -- verify the intended loss.
clf = sgd.partial_fit(tfidf, df['sentiment'],classes=df['sentiment'].unique())

Model fitted against entire dataset. Now to pickle the resulting file.

In [None]:
import pickle
import os

dest = os.path.join(google_file_prefix + 'amazonclassifier', 'pkl_objects')
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the check-then-create gap of os.path.exists() + os.makedirs().
os.makedirs(dest, exist_ok=True)

# Write through context managers so each file is flushed and closed as soon
# as it has been written instead of whenever the handle is garbage-collected.
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as _pkl_file:
    pickle.dump(stop, _pkl_file, protocol=4)
with open(os.path.join(dest, 'amazonclassifier.pkl'), 'wb') as _pkl_file:
    pickle.dump(clf, _pkl_file, protocol=4)

In [None]:
import pickle
import os

dest = os.path.join(google_file_prefix + 'amazonclassifier_with_update', 'pkl_objects')
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the check-then-create gap of os.path.exists() + os.makedirs().
os.makedirs(dest, exist_ok=True)

# Write through context managers so each file is flushed and closed as soon
# as it has been written instead of whenever the handle is garbage-collected.
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as _pkl_file:
    pickle.dump(stop, _pkl_file, protocol=4)
with open(os.path.join(dest, 'amazonclassifier.pkl'), 'wb') as _pkl_file:
    pickle.dump(clf, _pkl_file, protocol=4)

In [None]:
import os
os.chdir(google_file_prefix)

In [None]:
%%writefile amazonclassifier/vectorizer.py
from sklearn.feature_extraction.text import HashingVectorizer
import re
import os
import pickle

# Locate pkl_objects/ relative to this module so the import works regardless
# of the caller's working directory.
cur_dir = os.path.dirname(__file__)
# Load the stop-word list saved by the training notebook. The context manager
# closes the file right after loading instead of leaking the handle.
# NOTE: pickle can execute arbitrary code -- only load files you created.
with open(os.path.join(cur_dir, 'pkl_objects', 'stopwords.pkl'), 'rb') as _stop_file:
    stop = pickle.load(_stop_file)

def tokenizer(text):
    """Clean a raw review and split it into tokens with stop words removed.

    The text is stripped of HTML-like tags, lowercased, and every run of
    non-word characters is replaced by a space. Emoticons such as ``:)``,
    ``;-(``, ``=D`` or ``:P`` are collected first and appended (without the
    ``-`` "nose") so they survive the punctuation removal.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        Tokens of the cleaned text that are not in the module-level ``stop``
        collection.
    """
    # Raw string literals avoid invalid-escape warnings for '\)', '\(' and '\W'.
    text = re.sub(r'<[^>]*>', '', text)
    # Search the original-case text: the pattern contains uppercase 'D' and
    # 'P', which can never match once the text has been lowercased.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    text = re.sub(r'[\W]+', ' ', text.lower()) \
                   + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

# HashingVectorizer is stateless: it hashes tokens straight to column indices
# and has no IDF step, so it does not accept a `use_idf` argument (passing
# one raises TypeError as soon as this module is imported).
# NOTE(review): these features have 2**21 columns; a classifier consuming them
# must have been trained on features produced by this same vectorizer --
# verify against the training code.
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         ngram_range = (1,2),
                         norm = None,
                         tokenizer=tokenizer)

In [None]:
import os
os.chdir(google_file_prefix + 'amazonclassifier')

In [None]:
import pickle
import re
import os
from vectorizer import vect

# Load the pickled classifier; the context manager closes the file right
# after loading instead of leaking the handle.
# NOTE: pickle can execute arbitrary code -- only load files you created.
with open(os.path.join('pkl_objects', 'amazonclassifier.pkl'), 'rb') as _clf_file:
    clf = pickle.load(_clf_file)

In [None]:
import numpy as np
label = {0:'negative', 1:'positive'}

example = ["I love this movie. It's amazing."]
X = vect.transform(example)
prediction = label[clf.predict(X)[0]]
# SGDClassifier only offers predict_proba for probabilistic losses; for a
# hinge-loss model the attribute is unavailable, so report the signed
# decision score instead of failing with AttributeError.
if hasattr(clf, 'predict_proba'):
    print('Prediction: %s\nProbability: %.2f%%' %
          (prediction,
           np.max(clf.predict_proba(X))*100))
else:
    print('Prediction: %s\nDecision score: %.2f' %
          (prediction,
           clf.decision_function(X)[0]))

# **Setting up SQLite**

In [None]:
os.getcwd()

In [None]:
import sqlite3
import os

# (Re)create the review database in the current working directory and seed it
# with one positive (1) and one negative (0) example row.
conn = sqlite3.connect('reviews.sqlite')
try:
    c = conn.cursor()

    c.execute('DROP TABLE IF EXISTS review_db')
    c.execute('CREATE TABLE review_db (review TEXT, sentiment INTEGER, date TEXT)')

    # Values are bound with "?" placeholders; the timestamp is generated by
    # SQLite itself via DATETIME('now').
    example1 = 'I love this movie'
    c.execute("INSERT INTO review_db (review, sentiment, date) VALUES (?, ?, DATETIME('now'))", (example1, 1))

    example2 = 'I disliked this movie'
    c.execute("INSERT INTO review_db (review, sentiment, date) VALUES (?, ?, DATETIME('now'))", (example2, 0))

    conn.commit()
finally:
    # Always release the connection, even if one of the statements fails.
    conn.close()

In [None]:
# Read back every row dated between 2017-01-01 10:10:10 and the current time.
conn = sqlite3.connect('reviews.sqlite')
try:
    c = conn.cursor()

    c.execute("SELECT * FROM review_db WHERE date BETWEEN '2017-01-01 10:10:10' AND DATETIME('now')")
    results = c.fetchall()
finally:
    # Always release the connection, even if the query fails.
    conn.close()

In [None]:
print(results)

## **Updating the Amazon review classifier**

Let us make and operate on a copy of the `amazonclassifier` subdirectory. The following cell creates the copy, `amazonclassifier_with_update`, if it does not already exist.

In [None]:
import shutil

# Step up from the amazonclassifier directory to the project folder.
os.chdir('..')

# Create the working copy and its pkl_objects child in one call; exist_ok
# makes this a no-op when they already exist.
os.makedirs(os.path.join('amazonclassifier_with_update', 'pkl_objects'),
            exist_ok=True)
os.chdir('amazonclassifier_with_update')

shutil.copyfile('../amazonclassifier/pkl_objects/amazonclassifier.pkl',
                './pkl_objects/amazonclassifier.pkl')

shutil.copyfile('../amazonclassifier/reviews.sqlite',
                './reviews.sqlite')

# The following cells import `vect` from a local vectorizer module, so the
# module has to exist in this directory as well; without the copy the import
# only succeeds while the module is still cached from the earlier import.
shutil.copyfile('../amazonclassifier/vectorizer.py',
                './vectorizer.py')

Define a function to update the classifier with the data stored in the local SQLite database:

In [None]:
import pickle
import sqlite3
import numpy as np

# import HashingVectorizer from local dir
from vectorizer import vect

def update_model(db_path, model, batch_size=10000):
    """Incrementally train *model* on the reviews stored in a SQLite database.

    Rows are read from the ``review_db`` table in batches of at most
    *batch_size*; for each batch the first column is vectorized with the
    module-level ``vect`` and the second column is used as integer labels in
    a call to ``model.partial_fit``.

    Parameters
    ----------
    db_path : str
        Path to the SQLite database file containing ``review_db``.
    model : object
        Estimator exposing ``partial_fit(X, y, classes=...)``. It is updated
        in place.
    batch_size : int, default 10000
        Maximum number of rows fetched and fitted per step.

    Returns
    -------
    None
        The model is modified in place; nothing is returned.
    """
    # The label set is the same for every batch, so build it once.
    classes = np.array([0, 1])

    conn = sqlite3.connect(db_path)
    try:
        c = conn.cursor()
        c.execute('SELECT * from review_db')

        results = c.fetchmany(batch_size)
        while results:
            # Rows become a 2-D string array: column 0 is the review text,
            # column 1 the sentiment label.
            data = np.array(results)
            X = data[:, 0]
            y = data[:, 1].astype(int)

            X_train = vect.transform(X)
            # Train the estimator that was passed in, not whatever global
            # happens to be named `clf` in the calling scope.
            model.partial_fit(X_train, y, classes=classes)
            results = c.fetchmany(batch_size)
    finally:
        # Release the connection even if vectorizing or fitting fails.
        conn.close()
    return None

Update the model:

In [None]:
cur_dir = '.'

# Use the following path instead if you embed this code into
# the app.py file

# import os
# cur_dir = os.path.dirname(__file__)

# Load the classifier through a context manager so the file handle is closed
# right after unpickling.
# NOTE: pickle can execute arbitrary code -- only load files you created.
with open(os.path.join(cur_dir,
                       'pkl_objects',
                       'amazonclassifier.pkl'), 'rb') as _clf_file:
    clf = pickle.load(_clf_file)
db = os.path.join(cur_dir, 'reviews.sqlite')

# Trains clf in place on the rows stored in the SQLite database.
update_model(db_path=db, model=clf, batch_size=10000)

# Uncomment the following lines to update your amazonclassifier.pkl file

# with open(os.path.join(cur_dir, 'pkl_objects',
#                        'amazonclassifier.pkl'), 'wb') as _clf_file:
#     pickle.dump(clf, _clf_file, protocol=4)