In [5]:
# natural language processing
# 1. collect data
# 2. featurize
# 3. compare features

# vectors can be created out of bads of words
# cosine similarity can be used on vectors to determine 
# similarity

# Term Frequency (TR) - 
# import of the term within that document
# TF(d,t) = number of occurences of term t in document d

# Inverse Document Frequency (TR) - 
# importance of the term in the corpus
# IDF(t) = log(D/t) where
# D = total number of documents
# t = number of ducments with the team

In [None]:
import nltk

In [None]:
nltk.download_shell()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------


In [None]:
fpath = '/Users/bcutrell/Python-Data-Science-and-Machine-Learning-Bootcamp/Machine Learning Sections/Natural-Language-Processing/smsspamcollection/SMSSpamCollection'
messages = [ line.rstrip() for line in open(fpath, encoding='utf-8')]

In [None]:
len(messages)

In [None]:
messages[50]

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
messages = pd.read_csv(fpath, sep='\t', names=['label', 'message'])

In [None]:
messages.describe()

In [None]:
messages.groupby('label').describe()

In [None]:
messages['length'] = messages['message'].apply(len)
messages['length'].plot.hist(bins=100)

In [None]:
messages['length'].describe()

In [None]:
messages[messages['length'] == 910]['message'].iloc[0]

In [None]:
messages.hist(column='length', by='label', bins=60, figsize=(12,4))

In [None]:
# Spam messages tend to have more characters
# so length appears to be a good feature

In [None]:
import string

In [None]:
mess = 'Sample message! Notice: it has punctuation.'

In [None]:
nopunc = [c for c in mess if c not in string.punctuation]

In [None]:
from nltk.corpus import stopwords # stopwords.words('en')

In [None]:
nopunc = ''.join(nopunc)

In [None]:
clean_mess = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [None]:
clean_mess

In [None]:
def text_process(mess):
    '''
    1. remove punc
    2. remove stop words
    3. return list of clean text words
    '''
    nopunc = [c for c in mess if c not in string.punctuation]
    nopunc = ''.join(nopunc)
    
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [None]:
messages.head()

In [None]:
messages['message'].head(5).apply(text_process)

In [None]:
# stemming can also be used to combine 
# like words (running, ran, run)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
bow_transformer = CountVectorizer(analyzer=text_process).fit(messages['message'])

In [None]:
mess4 = messages['message'][3]

In [None]:
print(mess4)

In [None]:
bow4 = bow_transformer.transform([mess4])

In [None]:
print(bow4)

In [None]:
print(bow4.shape)

In [None]:
bow_transformer.get_feature_names()[4068]

In [None]:
messages_bow = bow_transformer.transform(messages['message'])

In [None]:
print('Shape of Sparse Matrix: ', messages_bow.shape)

In [None]:
messages_bow.nnz

In [None]:
sparsity = (100.0 * messages_bow.nnz / (messages_bow.shape[0] * messages_bow.shape[1]))
print('sparsity: {}'.format(sparsity))

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
tfidf_transformer = TfidfTransformer().fit(messages_bow)

In [None]:
tfidf4 = tfidf_transformer.transform(bow4)

In [None]:
print(tfidf4)

In [None]:
tfidf_transformer.idf_[bow_transformer.vocabulary_['university']]

In [None]:
messages_tfidf = tfidf_transformer.transform(messages_bow)

In [None]:
# messages are now numerical vectors

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
spam_detect_model = MultinomialNB().fit(messages_tfidf, messages['label'])

In [None]:
spam_detect_model.predict(tfidf4)[0]

In [None]:
messages['label'][3]

In [None]:
all_pred = spam_detect_model.predict(messages_tfidf)

In [None]:
from sklearn.cross_validation import train_test_split

In [None]:
X = messages['message']
y = messages['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB())
])

In [None]:
pipeline.fit(X_train, y_train)

In [None]:
predictions = pipeline.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_test, predictions))

In [None]:
# precision (also called positive predictive value) 
# is the fraction of relevant instances among the retrieved instances
# true positives / false positives

# recall (also known as sensitivity) is the fraction of relevant instances 
# that have been retrieved over the total amount of relevant instances.
# true positives / total relevant

In [None]:
##############################
# Project
##############################

In [None]:
fpath = '/Users/bcutrell/Python-Data-Science-and-Machine-Learning-Bootcamp/Machine Learning Sections/Natural-Language-Processing/yelp.csv'
yelp = pd.read_csv(messages)