<a href="https://colab.research.google.com/github/benedictlai/thinkful-challenges/blob/master/Final_Challenge_Bulid_your_Own_NLP_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
import nltk
nltk.download('gutenberg')
import nltk
nltk.download('stopwords')
from nltk.corpus import gutenberg, stopwords
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import LinearSVC

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [0]:
macbeth = gutenberg.raw('shakespeare-macbeth.txt')
leaves = gutenberg.raw('whitman-leaves.txt')
print('\nRaw:\n', macbeth[0:100])


Raw:
 [The Tragedie of Macbeth by William Shakespeare 1603]


Actus Primus. Scoena Prima.

Thunder and Lig


# 1. Data cleaning / processing / language parsing

In [0]:
def text_cleaner(text):
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text
  
macbeth = text_cleaner(macbeth)
leaves = text_cleaner(leaves)

In [0]:
nlp = spacy.load('en')
macbeth_doc = nlp(macbeth)
leaves_doc = nlp(leaves)

In [0]:
macbethsent = [[sent, "Shake"] for sent in macbeth_doc.sents]
leavessent = [[sent, "Whitman"] for sent in leaves_doc.sents]

sentences = pd.DataFrame(macbethsent + leavessent)
sentences.head()

Unnamed: 0,0,1
0,"(Actus, Primus, .)",Shake
1,"(Scoena, Prima, .)",Shake
2,"(Thunder, and, Lightning, .)",Shake
3,"(Enter, three, Witches, .)",Shake
4,"(1, .)",Shake


In [0]:
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    tokens = wpt.tokenize(doc)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

In [0]:
macbethcorpus = []
macbethcorpus = sentences[0]
macbethcorpus = np.array(macbethcorpus)

macbethcorpus2 = []
for item in macbethcorpus:
    macbethcorpus2.append(str(item))
    
macbethcorpus3 = []

for doc in macbethcorpus2:
    macbethcorpus3.append(normalize_corpus(doc))
    
macbethcorpus4 = []
for item in macbethcorpus3:
    macbethcorpus4.append(item.tolist())

In [0]:
cv =  CountVectorizer(min_df=0, max_df=1)
cvmatrix = cv.fit_transform(macbethcorpus4)
cvmatrix = cvmatrix.toarray()
vocab = cv.get_feature_names()

macbethcvmatrix = pd.DataFrame(cvmatrix, columns=vocab)

In [0]:
macbethcvmatrix
macbethcvmatrix['text_sentence'] = sentences[0]
macbethcvmatrix['text_source'] = sentences[1]

In [0]:
Y1 = macbethcvmatrix['text_source']
X1 = np.array(macbethcvmatrix.drop(['text_sentence','text_source'], 1))

X1_train, X1_test, y1_train, y1_test = train_test_split(X1,Y1,test_size=0.4,random_state=0)

# BoW Random Forest Classifier

In [0]:
rfc = ensemble.RandomForestClassifier()
train = rfc.fit(X1_train, y1_train)

print('\nTraining set scores:', rfc.score(X1_train, y1_train))
print('\nTest set scores:', rfc.score(X1_test, y1_test))




Training set scores: 0.8347372643401524

Test set scores: 0.749172932330827


In [0]:
cross_val_score(rfc, X1_train, y1_train, cv=5)

array([0.749499, 0.749499, 0.749499, 0.75    , 0.75    ])

# BoW Logistic Regression

In [0]:
lr = LogisticRegression()
train = lr.fit(X1_train, y1_train)
print(X1_train.shape, y1_train.shape)
print('\nTraining set scores:', lr.score(X1_train, y1_train))
print('\nTest set scores:', lr.score(X1_test, y1_test))



(4986, 6887) (4986,)

Training set scores: 0.7681508223024468

Test set scores: 0.749172932330827


In [0]:
cross_val_score(lr, X1_train, y1_train, cv=5)



array([0.749499, 0.749499, 0.749499, 0.75    , 0.75    ])

# BoW Gradient Boosting Classifier

In [0]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X1_train, y1_train)

print('\nTraining set scores:', clf.score(X1_train, y1_train))
print('\nTest set scores:', clf.score(X1_test, y1_test))


Training set scores: 0.7496991576413959

Test set scores: 0.749172932330827


In [0]:
cross_val_score(clf, X1_train, y1_train, cv=5)

array([0.749499, 0.749499, 0.749499, 0.75    , 0.75    ])

# BoW SVC

In [0]:
model = LinearSVC()
train = model.fit(X1_train, y1_train)

print('\nTraining set scores:', model.score(X1_train, y1_train))
print('\nTest set scores:', model.score(X1_test, y1_test))


Training set scores: 0.8363417569193743

Test set scores: 0.749172932330827


In [0]:
cross_val_score(model, X1_train, y1_train, cv=5)

array([0.749499, 0.749499, 0.749499, 0.75    , 0.75    ])

# Tf-IDF

In [0]:
vectorizer = TfidfVectorizer(max_df=0.5,
                             min_df=2,
                             stop_words='english', 
                             lowercase=True,
                             use_idf=True,
                             norm=u'l2',
                             smooth_idf=True
                            )

macbethcorpus4tfidf=vectorizer.fit_transform(macbethcorpus4)
print("%d is the number of features." % macbethcorpus4tfidf.get_shape()[1])

7073 is the number of features.


In [0]:
tfidmatrix = macbethcorpus4tfidf.toarray()
tfidvocab = vectorizer.get_feature_names()

macbethtfidmatrix = pd.DataFrame(tfidmatrix, columns=tfidvocab)


In [0]:

macbethtfidmatrix['text_sentence'] = sentences[0]
macbethtfidmatrix['text_source'] = sentences[1]


In [0]:
Y1 = macbethtfidmatrix['text_source']
X1 = np.array(macbethtfidmatrix.drop(['text_sentence','text_source'], 1))

X1_train_tfidf, X1_test_tfidf, y1_train_tfidf, y1_test_tfidf = train_test_split(X1, Y1,test_size=0.4, random_state=0)

# Tf-IDF Random Forest Classifier

In [0]:
rfc = ensemble.RandomForestClassifier()
train = rfc.fit(X1_train_tfidf, y1_train_tfidf)

print('\nTraining set scores:', rfc.score(X1_train_tfidf, y1_train_tfidf))
print('\nTest set scores:', rfc.score(X1_test_tfidf, y1_test_tfidf))




Training set scores: 0.9809466506217409

Test set scores: 0.9121804511278195


In [0]:

cross_val_score(rfc, X1_train_tfidf, y1_train_tfidf, cv=5)

array([0.90480962, 0.89779559, 0.91583166, 0.93273092, 0.91566265])

# Tf-IDF Logistic Regression

In [0]:
lr = LogisticRegression()
train = lr.fit(X1_train_tfidf, y1_train_tfidf)
print(X1_train_tfidf.shape, y1_train_tfidf.shape)
print('\nTraining set scores:', lr.score(X1_train_tfidf, y1_train_tfidf))
print('\nTest set score:', lr.score(X1_test_tfidf, y1_test_tfidf))



(4986, 7073) (4986,)

Training set scores: 0.9107501002807862

Test set score: 0.8803007518796993


In [0]:
cross_val_score(lr, X1_train_tfidf, y1_train_tfidf, cv=5)



array([0.86472946, 0.85971944, 0.87374749, 0.8875502 , 0.86546185])

# Tf-IDF Gradient Boosting Classifier

In [0]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X1_train_tfidf, y1_train_tfidf)

print('\nTraining set scores:', clf.score(X1_train_tfidf, y1_train_tfidf))
print('\nTest set scores:', clf.score(X1_test_tfidf, y1_test_tfidf))


Training set scores: 0.8590052146008824

Test set scores: 0.8523308270676692


In [0]:
cross_val_score(clf, X1_train_tfidf, y1_train_tfidf, cv=5)

array([0.84068136, 0.84068136, 0.8507014 , 0.8564257 , 0.84939759])

# Tf-IDF SVC

In [0]:
model = LinearSVC()
train = model.fit(X1_train_tfidf, y1_train_tfidf)

print('\nTraining set scores:', model.score(X1_train_tfidf, y1_train_tfidf))
print('\nTest set scores:', model.score(X1_test_tfidf, y1_test_tfidf))


Training set scores: 0.9779382270357

Test set scores: 0.9266165413533834


In [0]:
cross_val_score(model, X1_train_tfidf, y1_train_tfidf, cv=5)

array([0.90681363, 0.91082164, 0.92785571, 0.94277108, 0.92068273])

# Model Tuning

In [0]:
from sklearn.metrics import accuracy_score

params = {'C':[1,5], 'penalty':['l1','l2']}
lr = LogisticRegression()

grid = GridSearchCV(lr, params, cv=3)
grid.fit(X1_train_tfidf, y1_train_tfidf)
best_lr = grid.best_estimator_
score = accuracy_score(y_true=y1_test_tfidf, y_pred=best_lr.predict(X1_test_tfidf))
print('\nThe accuracy score is:\t{:.4f}\n'.format(score))




The accuracy score is:	0.9185



In [0]:
cross_val_score(best_lr, X1_train_tfidf, y1_train_tfidf, cv=5)



array([0.90280561, 0.90180361, 0.91683367, 0.92971888, 0.92168675])

# SVD (Singular Value Decomposition)

In [0]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

In [0]:
svd= TruncatedSVD(130)
lsa = make_pipeline(svd, Normalizer(copy=False))

Data_lsa= lsa.fit_transform(X1)

X1_train_lsa, X1_test_lsa, y1_train_lsa, y1_test_lsa = train_test_split(Data_lsa, Y1, random_state=3)

lr = LogisticRegression()
train = lr.fit(X1_train_lsa, y1_train_lsa)
print(X1_train_lsa.shape, y1_train_lsa.shape)
print('\nTraining set scores:', lr.score(X1_train_lsa, y1_train_lsa))
print('\nTest set scores:', lr.score(X1_test_lsa, y1_test_lsa))

cross_val_score(lr, X1_train_lsa, y1_train_lsa, cv=5)



(6233, 130) (6233,)

Training set scores: 0.9140060965827049

Test set scores: 0.9100096246390761




array([0.89815557, 0.90537289, 0.92141139, 0.90850722, 0.90609952])