In [32]:
from pprint import pprint

import pandas as pd
import numpy as np
import nltk
import re
%matplotlib inline
import matplotlib.pyplot as plt

from prepare import basic_clean, lemmatize


In [5]:
def clean(text: str) -> list:
    'A simple function to cleanup text data'
    wnl = nltk.stem.WordNetLemmatizer()
    stopwords = set(nltk.corpus.stopwords.words('english'))
    text = (text.encode('ascii', 'ignore')
             .decode('utf-8', 'ignore')
             .lower())
    words = re.sub(r'[^\w\s]', '', text).split() # tokenization
    return [wnl.lemmatize(word) for word in words if word not in stopwords]

In [6]:
data = [
    'Python is pretty cool',
    'Python is a nice programming language with nice syntax',
    'I think SQL is cool too',
]

In [8]:
pprint(data)

['Python is pretty cool',
 'Python is a nice programming language with nice syntax',
 'I think SQL is cool too']


# Bag of Words 

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
# purpose of this is to make text documents into a matrix of token counts 
# same basic process as any sklearn transformation:
# make the thing
cv = CountVectorizer()
# use the thing
bag_of_words = cv.fit_transform(data)

In [10]:
bag_of_words

<3x12 sparse matrix of type '<class 'numpy.int64'>'
	with 16 stored elements in Compressed Sparse Row format>

In [11]:
# creates sparse matrix 
bag_of_words.todense()

matrix([[1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0],
        [0, 1, 1, 2, 0, 1, 1, 0, 1, 0, 0, 1],
        [1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0]])

In [13]:
# viewing within bag of words 
cv.get_feature_names()



['cool',
 'is',
 'language',
 'nice',
 'pretty',
 'programming',
 'python',
 'sql',
 'syntax',
 'think',
 'too',
 'with']

In [16]:
# Taking a look at the bag of words transformation for education and diagnostics.
# In practice this is not necesssary and the resulting data might be to big to be reasonably helpful.
bow = pd.DataFrame(bag_of_words.todense())
bow.columns = cv.get_feature_names()

In [17]:
data

['Python is pretty cool',
 'Python is a nice programming language with nice syntax',
 'I think SQL is cool too']

In [18]:
bow

Unnamed: 0,cool,is,language,nice,pretty,programming,python,sql,syntax,think,too,with
0,1,1,0,0,1,0,1,0,0,0,0,0
1,0,1,1,2,0,1,1,0,1,0,0,1
2,1,1,0,0,0,0,0,1,0,1,1,0


In [21]:
# another way to view list of words in bag of words
cv.get_feature_names_out()

array(['cool', 'is', 'language', 'nice', 'pretty', 'programming',
       'python', 'sql', 'syntax', 'think', 'too', 'with'], dtype=object)

In [15]:
# frequency of words used in bag of words 
cv.vocabulary_

{'python': 6,
 'is': 1,
 'pretty': 4,
 'cool': 0,
 'nice': 3,
 'programming': 5,
 'language': 2,
 'with': 11,
 'syntax': 8,
 'think': 9,
 'sql': 7,
 'too': 10}

In [19]:
# lambda function to determine proportionality of words 
bow.apply(lambda row:row/row.sum(), axis=1)

Unnamed: 0,cool,is,language,nice,pretty,programming,python,sql,syntax,think,too,with
0,0.25,0.25,0.0,0.0,0.25,0.0,0.25,0.0,0.0,0.0,0.0,0.0
1,0.0,0.125,0.125,0.25,0.0,0.125,0.125,0.0,0.125,0.0,0.0,0.125
2,0.2,0.2,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.2,0.2,0.0


In [22]:
# this will result in the tfidf matrix
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
bag_of_words = tfidf.fit_transform(data)
pprint(data)
pd.DataFrame(bag_of_words.todense(), 
             columns=tfidf.get_feature_names())

['Python is pretty cool',
 'Python is a nice programming language with nice syntax',
 'I think SQL is cool too']




Unnamed: 0,cool,is,language,nice,pretty,programming,python,sql,syntax,think,too,with
0,0.480458,0.373119,0.0,0.0,0.631745,0.0,0.480458,0.0,0.0,0.0,0.0,0.0
1,0.0,0.197673,0.334689,0.669378,0.0,0.334689,0.25454,0.0,0.334689,0.0,0.0,0.334689
2,0.38377,0.298032,0.0,0.0,0.0,0.0,0.0,0.504611,0.0,0.504611,0.504611,0.0


In [23]:
# zip: put these two things of the same length together
# dict: turn those two associated things into a k: v pair
# pd.Series: turn those keys into indeces, and the values into values
# HIGHER IDF means it provides more INFO, more relevant within single document 
pd.Series(
    dict(
        zip(
            tfidf.get_feature_names(), tfidf.idf_
        )
    )
)

# tells you the idf 



cool           1.287682
is             1.000000
language       1.693147
nice           1.693147
pretty         1.693147
programming    1.693147
python         1.287682
sql            1.693147
syntax         1.693147
think          1.693147
too            1.693147
with           1.693147
dtype: float64

# Bag of Ngrams

In [24]:
cv = CountVectorizer(ngram_range=(1, 3))
bag_of_grams = cv.fit_transform(data)

In [25]:
pprint(data)

['Python is pretty cool',
 'Python is a nice programming language with nice syntax',
 'I think SQL is cool too']


In [26]:
# make data frame from bag of grams 
pd.DataFrame(bag_of_grams.todense(),
            columns=cv.get_feature_names())



Unnamed: 0,cool,cool too,is,is cool,is cool too,is nice,is nice programming,is pretty,is pretty cool,language,...,sql is,sql is cool,syntax,think,think sql,think sql is,too,with,with nice,with nice syntax
0,1,0,1,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,1,1,0,0,1,...,0,0,1,0,0,0,0,1,1,1
2,1,1,1,1,1,0,0,0,0,0,...,1,1,0,1,1,1,1,0,0,0


In [28]:
from env import get_db_url
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

url = get_db_url('spam_db')
df = pd.read_sql('SELECT * FROM spam',url)
df

Unnamed: 0,id,label,text
0,0,ham,"Go until jurong point, crazy.. Available only ..."
1,1,ham,Ok lar... Joking wif u oni...
2,2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,3,ham,U dun say so early hor... U c already then say...
4,4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...,...
5567,5567,spam,This is the 2nd time we have tried 2 contact u...
5568,5568,ham,Will Ì_ b going to esplanade fr home?
5569,5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,5570,ham,The guy did some bitching but I acted like i'd...


In [33]:
df['clean_text'] = df.text.apply(clean).apply(' '.join)

In [34]:
# defining variables for modeling 
X = df.clean_text
y = df.label
X_train, X_test, y_train, y_test = \
train_test_split(X, y, 
                 test_size=0.2, 
                 random_state=1000)

In [35]:
X_train.head()

1889                           gotta collect da car 6 lei
3499    dorothykiefercom bank granite issue strongbuy ...
2693         nice new shirt thing wear nudist themed _ mu
405         haha get used driving usf man know lot stoner
1214               ill text creepy like wont think forgot
Name: clean_text, dtype: object

In [36]:
y_train.head()

1889     ham
3499    spam
2693     ham
405      ham
1214     ham
Name: label, dtype: object

In [37]:
# Whatever transformations we apply to X_train need to be applied to X_test
# this is model results 
cv = CountVectorizer()
X_bow = cv.fit_transform(X_train)
tree = DecisionTreeClassifier(max_depth=3)
tree.fit(X_bow, y_train)
tree.score(X_bow, y_train)

0.9216962082118016

In [38]:
# as with any other sklearn transformation, 
# transform only on our validate and/or test, 
# only fit on train
X_test_bow = cv.transform(X_test)
# tree.score(X_test_bow, y_test)

In [39]:
pd.DataFrame(X_bow.todense())


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7714,7715,7716,7717,7718,7719,7720,7721,7722,7723
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4452,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4453,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4454,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4455,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# MODELING RESULTS 

In [40]:
pd.Series(
    dict(
    zip(cv.get_feature_names_out(), 
    tree.feature_importances_))).sort_values().tail()

claim    0.035786
later    0.073497
text     0.085482
txt      0.375805
call     0.421967
dtype: float64

# Try other models 


In [49]:
from sklearn.naive_bayes import ComplementNB
classifier=ComplementNB()
classifier.fit(X_bow,y_train)

ComplementNB()

In [50]:
# create predictions 
y_pred_train=classifier.predict(X_bow)
y_pred_test=classifier.predict(X_test_bow)

In [52]:
# test results
classifier.score(X_bow,y_train)

0.9753197217859547

In [58]:
# test results 
classifier.score(X_test_bow,y_test)

0.9551569506726457

In [59]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred_test)
print(cm)
accuracy_score(y_test, y_pred_test)

[[905  38]
 [ 12 160]]


0.9551569506726457

### How models compare when trained on term frequency data alone, instead of TF-IDF Values?

In [62]:
# tdfidf data 
tfidf = TfidfVectorizer()
X1 = tfidf.fit_transform(X_train)

In [63]:
X1

<4457x7724 sparse matrix of type '<class 'numpy.float64'>'
	with 37269 stored elements in Compressed Sparse Row format>

In [64]:
# decision tree classifier model and results 
tree2 = DecisionTreeClassifier(max_depth=3)
tree2.fit(X1, y_train)
tree2.score(X1, y_train)

0.9358312766434822

In [65]:
# trying complementNB model
classifier=ComplementNB()
classifier.fit(X_bow,y_train)
classifier.score(X_bow,y_train)

0.9753197217859547

In [None]:
Complement NB performs better 