In [1]:
from pprint import pprint

import pandas as pd
import numpy as np

# visualize
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns 
import graphviz
from graphviz import Graph

# turn off pink warning boxes
import warnings
warnings.filterwarnings("ignore")

import acquire
import prepare
from env import user, password, host

#tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

#normalize text
import nltk
import unicodedata
import re

#train, validate, test
from sklearn.model_selection import train_test_split

#creating / evaluating models
# Decision Tree  
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

# Random Forest
from sklearn.ensemble import RandomForestClassifier

# K-Nearest Neighbor(KNN)  
from sklearn.neighbors import KNeighborsClassifier

# Logistic Regression
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report, accuracy_score

From the codeup NLP modeling module:

In [2]:
#example 1 document
document = 'Mary had a little lamb, a little lamb, a little lamb.'

# clean up the text
document = document.lower().replace(',', '').replace('.', '')
# transform into a series
words = pd.Series(document.split())

# From the Series we can extract the value_counts, which is our raw count
# for term frequency. Once we have the raw counts, we can calculate the
# other measures.
(pd.DataFrame({'raw_count': words.value_counts()})
 .assign(frequency=lambda df: df.raw_count / df.raw_count.sum())
 .assign(augmented_frequency=lambda df: df.frequency / df.frequency.max()))

Unnamed: 0,raw_count,frequency,augmented_frequency
lamb,3,0.272727,1.0
a,3,0.272727,1.0
little,3,0.272727,1.0
had,1,0.090909,0.333333
mary,1,0.090909,0.333333


In [3]:
#3 example documents
documents = {
    'news': 'Codeup announced last thursday that they just launched a new data science program. It is 18 weeks long.',
    'description': 'Codeup\'s data science program teaches hands on skills using Python and pandas.',
    'context': 'Codeup\'s data science program was created in response to a percieved lack of data science talent, and growing demand.'
}
pprint(documents)

print('\nCleaning and lemmatizing...\n')

documents = {topic: prepare.lemmatize(prepare.basic_clean(documents[topic])) for topic in documents}
pprint(documents)

{'context': "Codeup's data science program was created in response to a "
            'percieved lack of data science talent, and growing demand.',
 'description': "Codeup's data science program teaches hands on skills using "
                'Python and pandas.',
 'news': 'Codeup announced last thursday that they just launched a new data '
         'science program. It is 18 weeks long.'}

Cleaning and lemmatizing...

{'context': "codeup's data science program wa created in response to a "
            'percieved lack of data science talent and growing demand',
 'description': "codeup's data science program teach hand on skill using "
                'python and panda',
 'news': 'codeup announced last thursday that they just launched a new data '
         'science program it is 18 week long'}


In [4]:
# A simple way to calculate idf for demonstration. Note that this
# function relies on the globally defined documents variable.
def idf(word):
    n_occurences = sum([1 for doc in documents.values() if word in doc])
    return len(documents) / n_occurences

# Get a list of the unique words
unique_words = pd.Series(' '.join(documents.values()).split()).unique()

# put the unique words into a data frame
(pd.DataFrame(dict(word=unique_words))
 # calculate the idf for each word
 .assign(idf=lambda df: df.word.apply(idf))
 # sort the data for presentation purposes
 .set_index('word')
 .sort_values(by='idf', ascending=False)
 .head(5))

Unnamed: 0_level_0,idf
word,Unnamed: 1_level_1
teach,3.0
created,3.0
hand,3.0
skill,3.0
using,3.0


In [5]:
tfs = []

# We'll calculate the tf-idf value for every word across every document

# Start by iterating over all the documents
for doc, text in documents.items():
    # We'll make a data frame that contains the tf for every word in every document
    df = (pd.Series(text.split())
          .value_counts()
          .reset_index()
          .set_axis(['word', 'raw_count'], axis=1, inplace=False)
          .assign(tf=lambda df: df.raw_count / df.shape[0])
          .drop(columns='raw_count')
          .assign(doc=doc))
    # Then add that data frame to our list
    tfs.append(df)

# We'll then concatenate all the tf values together.
(pd.concat(tfs)
 # calculate the idf value for each word
 .assign(idf=lambda df: df.word.apply(idf))
 # then use the if and idf values to calculate tf-idf 
 .assign(tf_idf=lambda df: df.idf * df.tf)
 .drop(columns=['tf', 'idf'])
 .sort_values(by='tf_idf', ascending=False))

Unnamed: 0,word,doc,tf_idf
10,using,description,0.25
11,hand,description,0.25
7,skill,description,0.25
4,python,description,0.25
2,teach,description,0.25
0,panda,description,0.25
2,growing,context,0.176471
3,of,context,0.176471
4,demand,context,0.176471
6,wa,context,0.176471


In [6]:
# We'll then concatenate all the tf values together.
(pd.concat(tfs)
 # calculate the idf value for each word
 .assign(idf=lambda df: df.word.apply(idf))
 # then use the if and idf values to calculate tf-idf 
 .assign(tf_idf=lambda df: df.idf * df.tf)
 .drop(columns=['tf', 'idf'])
 .sort_values(by='tf_idf', ascending=False)
 .pipe(lambda df: pd.crosstab(df.doc, df.word, values=df.tf_idf, aggfunc=lambda x: x))
 .fillna(0))

word,18,a,and,announced,codeup,codeup's,created,data,demand,growing,...,skill,talent,teach,that,they,thursday,to,using,wa,week
doc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
context,0.0,0.058824,0.088235,0.0,0.0,0.088235,0.176471,0.117647,0.176471,0.176471,...,0.0,0.176471,0.0,0.0,0.0,0.0,0.176471,0.0,0.176471,0.0
description,0.0,0.0,0.125,0.0,0.0,0.125,0.0,0.083333,0.0,0.0,...,0.25,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0
news,0.166667,0.055556,0.0,0.166667,0.055556,0.0,0.0,0.055556,0.0,0.0,...,0.0,0.0,0.0,0.166667,0.166667,0.166667,0.0,0.0,0.0,0.166667


In [7]:
tfidf = TfidfVectorizer()
tfidfs = tfidf.fit_transform(documents.values())
tfidfs

<3x36 sparse matrix of type '<class 'numpy.float64'>'
	with 45 stored elements in Compressed Sparse Row format>

In [8]:
pd.DataFrame(tfidfs.todense(), columns=tfidf.get_feature_names())

Unnamed: 0,18,and,announced,codeup,created,data,demand,growing,hand,in,...,skill,talent,teach,that,they,thursday,to,using,wa,week
0,0.263566,0.0,0.263566,0.155666,0.0,0.155666,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.263566,0.263566,0.263566,0.0,0.0,0.0,0.263566
1,0.0,0.25388,0.0,0.19716,0.0,0.19716,0.0,0.0,0.333821,0.0,...,0.333821,0.0,0.333821,0.0,0.0,0.0,0.0,0.333821,0.0,0.0
2,0.0,0.195932,0.0,0.152159,0.257627,0.304317,0.257627,0.257627,0.0,0.257627,...,0.0,0.257627,0.0,0.0,0.0,0.0,0.257627,0.0,0.257627,0.0


In [9]:
def get_db_url(database, host=host, user=user, password=password):
    return f'mysql+pymysql://{user}:{password}@{host}/{database}'

url = get_db_url("spam_db")
sql = "SELECT * FROM spam"

df = pd.read_sql(sql, url, index_col="id")
df.head()

Unnamed: 0_level_0,label,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.text)
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

lm = LogisticRegression().fit(X_train, y_train)

train['predicted'] = lm.predict(X_train)
test['predicted'] = lm.predict(X_test)

In [11]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))

Accuracy: 97.44%
---
Confusion Matrix
actual      ham  spam
predicted            
ham        3857   112
spam          2   486
---
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99      3859
        spam       1.00      0.81      0.90       598

    accuracy                           0.97      4457
   macro avg       0.98      0.91      0.94      4457
weighted avg       0.98      0.97      0.97      4457



In [12]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.predicted, test.actual))
print('---')
print(classification_report(test.actual, test.predicted))

Accuracy: 96.32%
---
Confusion Matrix
actual     ham  spam
predicted           
ham        966    41
spam         0   108
---
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       966
        spam       1.00      0.72      0.84       149

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115



Take the work we did in the lessons further:

- What other types of models (i.e. different classifcation algorithms) could you use?

In [13]:
ADDITIONAL_STOPWORDS = ['r', 'u', '2', 'ltgt']

def clean(text):
    '''
    This function cleans up text data.
    '''
    
    #lemmatize
    wnl = nltk.stem.WordNetLemmatizer()
    stopwords = nltk.corpus.stopwords.words('english') + ADDITIONAL_STOPWORDS
    
    #normalize
    text = (unicodedata.normalize('NFKD', text)
             .encode('ascii', 'ignore')
             .decode('utf-8', 'ignore')
             .lower())
    words = re.sub(r'[^\w\s]', '', text).split()
    
    return ' '.join([wnl.lemmatize(word) for word in words if word not in stopwords])

In [14]:
df.text = df.text.apply(clean)
df.head()

Unnamed: 0_level_0,label,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif oni
2,spam,free entry wkly comp win fa cup final tkts 21s...
3,ham,dun say early hor c already say
4,ham,nah dont think go usf life around though


In [15]:
df.shape

(5572, 2)

In [16]:
# We'll use this split function later to create in-sample and out-of-sample datasets for modeling
def split(df, stratify_by=None):
    """
    3 way split for train, validate, and test datasets
    To stratify, send in a column name
    """
    
    
    train, test = train_test_split(df, test_size=.2, random_state=123, stratify=df[stratify_by])
    
    train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train[stratify_by])
    
    return train, validate, test

In [17]:
train, validate, test = split(df, 'label')
train.head()

Unnamed: 0_level_0,label,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
236,ham,ill little closer like bus stop street
3851,ham,truekdo knw dis
3906,ham,mean fat head
3365,ham,waiting call sir
2853,ham,haha hope i_ hear receipt sound gd luck


In [18]:
# Setup our X variables
X_train = train.text
X_validate = validate.text
X_test = test.text

In [19]:
# Setup our y variables
y_train = train.label
y_validate = validate.label
y_test = test.label

In [20]:
#baseline prediction: the most prevalent class in training dataset(the mode)
train.label.value_counts()

ham     2701
spam     418
Name: label, dtype: int64

In [21]:
#baseline model would be to predict ham since it is most prevalant
#baseline accuracy:
baseline_accuracy = (train.label == 'ham').mean()

print(f'baseline accuracy: {baseline_accuracy: .2}')

baseline accuracy:  0.87


In [22]:
# Create the tfidf vectorizer object 

# using the computed TF-IDF values as features in a model

#stp 1, this creates a tf-idf values for each word, for each document
#stp 2, encodes these values so that we can use models that only work on numbers, like classification models
tfidf = TfidfVectorizer() #encoder

# Fit on the training data
tfidf.fit(X_train)

# Use the object
X_train_vectorized = tfidf.transform(X_train)
X_validate_vectorized = tfidf.transform(X_validate)
X_test_vectorized = tfidf.transform(X_test)

X_train_vectorized

<3119x6269 sparse matrix of type '<class 'numpy.float64'>'
	with 26336 stored elements in Compressed Sparse Row format>

In [23]:
X_train_vectorized.todense().shape

(3119, 6269)

In [24]:
#Decision Tree model
dt = DecisionTreeClassifier(max_depth=3, random_state=123)

#fit model
dt.fit(X_train_vectorized, y_train)

#get predictions from model
y_pred = dt.predict(X_train_vectorized)

#compare to baseline
print("Baseline is", round(baseline_accuracy, 2))
print("----------------")
print('Accuracy: {:.2f}'
     .format(dt.score(X_train_vectorized, y_train)))

Baseline is 0.87
----------------
Accuracy: 0.94


In [25]:
#Random forest model 
rf = RandomForestClassifier(max_depth=5, min_samples_leaf=3, random_state=123)

#fit model
rf = rf.fit(X_train_vectorized, y_train)

#get predictions from model
y_pred = rf.predict(X_train_vectorized)

#compare to baseline
print("Baseline is", round(baseline_accuracy, 2))
print("----------------")
print('Accuracy: {:.2f}'
     .format(rf.score(X_train_vectorized, y_train)))

Baseline is 0.87
----------------
Accuracy: 0.87


In [26]:
#K-Nearest Neighbor model
knn = KNeighborsClassifier(n_neighbors=20)

#fit model
knn = knn.fit(X_train_vectorized, y_train)

#get predictions from model
y_pred = knn.predict(X_train_vectorized)

#compare to baseline
print("Baseline is", round(baseline_accuracy, 2))
print("----------------")
print('Accuracy: {:.2f}'
     .format(knn.score(X_train_vectorized, y_train)))

Baseline is 0.87
----------------
Accuracy: 0.93


In [27]:
#Logistic regression model 
logit = LogisticRegression(random_state=123)

#fit model
logit.fit(X_train_vectorized, y_train)

#get predictions from model
y_pred = logit.predict(X_train_vectorized)

#compare to baseline
print("Baseline is", round(baseline_accuracy, 2))
print("----------------")
print('Accuracy: {:.2f}'
     .format(logit.score(X_train_vectorized, y_train)))

Baseline is 0.87
----------------
Accuracy: 0.96


In [28]:
#dt's metrics on validate
#evaluate on validate sample
y_pred = dt.predict(X_validate_vectorized)

#compare train to validate
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(dt.score(X_train_vectorized, y_train)))
print("--------------------------------------------------------------")
print("Decision Tree classifier on validate set:")
print(classification_report(y_validate, y_pred))

Accuracy of Decision Tree classifier on training set: 0.94
--------------------------------------------------------------
Decision Tree classifier on validate set:
              precision    recall  f1-score   support

         ham       0.94      0.97      0.96      1158
        spam       0.77      0.62      0.69       180

    accuracy                           0.92      1338
   macro avg       0.86      0.80      0.82      1338
weighted avg       0.92      0.92      0.92      1338



In [29]:
#rf's metrics on validate
#evaluate on validate sample
y_pred = rf.predict(X_validate_vectorized)

#compare train to validate
print('Accuracy of Random Forest classifier on training set: {:.2f}'
     .format(rf.score(X_train_vectorized, y_train)))
print("--------------------------------------------------------------")
print("Random Forest classifier on validate set:")
print(classification_report(y_validate, y_pred))

Accuracy of Random Forest classifier on training set: 0.87
--------------------------------------------------------------
Random Forest classifier on validate set:
              precision    recall  f1-score   support

         ham       0.87      1.00      0.93      1158
        spam       0.00      0.00      0.00       180

    accuracy                           0.87      1338
   macro avg       0.43      0.50      0.46      1338
weighted avg       0.75      0.87      0.80      1338



In [30]:
#knn's metrics on validate
#evaluate on validate sample
y_pred = knn.predict(X_validate_vectorized)

#compare train to validate
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train_vectorized, y_train)))
print("--------------------------------------------------------------")
print("KNN classifier on validate set:")
print(classification_report(y_validate, y_pred))

Accuracy of KNN classifier on training set: 0.93
--------------------------------------------------------------
KNN classifier on validate set:
              precision    recall  f1-score   support

         ham       0.92      1.00      0.96      1158
        spam       0.99      0.46      0.62       180

    accuracy                           0.93      1338
   macro avg       0.95      0.73      0.79      1338
weighted avg       0.93      0.93      0.91      1338



In [31]:
#logit's metrics on validate
#evaluate on validate sample
y_pred = logit.predict(X_validate_vectorized)

#compare train to validate
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train_vectorized, y_train)))
print("--------------------------------------------------------------")
print("Logistic Regression classifier on validate set:")
print(classification_report(y_validate, y_pred))

Accuracy of Logistic Regression classifier on training set: 0.96
--------------------------------------------------------------
Logistic Regression classifier on validate set:
              precision    recall  f1-score   support

         ham       0.95      1.00      0.97      1158
        spam       0.99      0.64      0.78       180

    accuracy                           0.95      1338
   macro avg       0.97      0.82      0.87      1338
weighted avg       0.95      0.95      0.95      1338



In [32]:
#test final model: logit

#evaluate on test sample 
y_pred_test = logit.predict(X_test_vectorized)

print("Logistic Regression Model:")
print(classification_report(y_test, y_pred_test))

Logistic Regression Model:
              precision    recall  f1-score   support

         ham       0.95      1.00      0.97       966
        spam       0.99      0.66      0.79       149

    accuracy                           0.95      1115
   macro avg       0.97      0.83      0.88      1115
weighted avg       0.96      0.95      0.95      1115



- How do the models compare when trained on term frequency data alone, instead of TF-IDF values?

In [33]:
from sklearn.feature_extraction.text import CountVectorizer

In [34]:
# Create the tf vectorizer object 
tf = CountVectorizer() #encoder

# Fit on the training data
tf.fit(X_train)

# Use the object
X_train_vectorized = tf.transform(X_train)
X_validate_vectorized = tf.transform(X_validate)
X_test_vectorized = tf.transform(X_test)

X_train_vectorized

<3119x6269 sparse matrix of type '<class 'numpy.int64'>'
	with 26336 stored elements in Compressed Sparse Row format>

In [35]:
X_train_vectorized.todense()


matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [36]:
#Decision Tree model
dt = DecisionTreeClassifier(random_state=123)

#fit model
dt.fit(X_train_vectorized, y_train)

#get predictions from model
y_pred = dt.predict(X_train_vectorized)

#compare to baseline
print("Baseline is", round(baseline_accuracy, 2))
print("----------------")
print('Accuracy: {:.2f}'
     .format(dt.score(X_train_vectorized, y_train)))

Baseline is 0.87
----------------
Accuracy: 1.00


In [37]:
#Random forest model 
rf = RandomForestClassifier(max_depth=5, min_samples_leaf=3, random_state=123)

#fit model
rf = rf.fit(X_train_vectorized, y_train)

#get predictions from model
y_pred = rf.predict(X_train_vectorized)

#compare to baseline
print("Baseline is", round(baseline_accuracy, 2))
print("----------------")
print('Accuracy: {:.2f}'
     .format(rf.score(X_train_vectorized, y_train)))

Baseline is 0.87
----------------
Accuracy: 0.87


In [38]:
#K-Nearest Neighbor model
knn = KNeighborsClassifier(n_neighbors=20)

#fit model
knn = knn.fit(X_train_vectorized, y_train)

#get predictions from model
y_pred = knn.predict(X_train_vectorized)

#compare to baseline
print("Baseline is", round(baseline_accuracy, 2))
print("----------------")
print('Accuracy: {:.2f}'
     .format(knn.score(X_train_vectorized, y_train)))

Baseline is 0.87
----------------
Accuracy: 0.87


In [39]:
#Logistic regression model 
logit = LogisticRegression(random_state=123)

#fit model
logit.fit(X_train_vectorized, y_train)

#get predictions from model
y_pred = logit.predict(X_train_vectorized)

#compare to baseline
print("Baseline is", round(baseline_accuracy, 2))
print("----------------")
print('Accuracy: {:.2f}'
     .format(logit.score(X_train_vectorized, y_train)))

Baseline is 0.87
----------------
Accuracy: 0.99
