In [0]:
# NATURAL LANGUAGE PROCESSING

In [0]:
## Key concepts, text data cleaning

In [1]:

import numpy as np
import pandas as pd
from collections import Counter
import nltk                                              # used for preprocessing
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import string
from scipy.spatial.distance import pdist, squareform
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline

In [2]:
# how do we turn a corpus of a text (list of texts / documents) into a feature matrix?
stops = set(nltk.corpus.stopwords.words('english'))
corpus = ["Jeff stole my octopus sandwich.",
         "'Help!' I sobbed, sandwichlessly.",
         "'Drop the sandwiches!' Said the sandwich police."]

In [3]:
# tokenizer --> converts list as words
def our_tokenizer(doc, stops=None, stemmer=None):
    doc = word_tokenize(doc.lower())
    tokens = [''.join([char for char in tok if char not in string.punctuation]) for tok in doc]
    tokens = [tok for tok in tokens if tok]
    if stops:
        tokens = [tok for tok in tokens if (tok not in stops)]
    if stemmer:
        tokens = [stemmer.stem(tok) for tok in tokens]
    return tokens

In [4]:
# a list of lists: a list of with each sentence's words in another list
tokenized_docs = [our_tokenizer(doc) for doc in corpus]
tokenized_docs

[['jeff', 'stole', 'my', 'octopus', 'sandwich'],
 ['help', 'i', 'sobbed', 'sandwichlessly'],
 ['drop', 'the', 'sandwiches', 'said', 'the', 'sandwich', 'police']]

In [5]:
stopwords = set(nltk.corpus.stopwords.words('english'))   # a set because the values are not duplicated
stopwords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [7]:
"i" in stopwords         # checks if i is in stopwords

True

In [6]:
# in this case we filter the original sentence and exclude elements which are in stopwords
tokenized_docs = [our_tokenizer(doc, stops=stopwords) for doc in corpus]
tokenized_docs

[['jeff', 'stole', 'octopus', 'sandwich'],
 ['help', 'sobbed', 'sandwichlessly'],
 ['drop', 'sandwiches', 'said', 'sandwich', 'police']]

In [7]:
# in this case we filter the original sentence and exclude elements which are in stopwords AND we get the stem part of each word (no suffixes or prefixes)
tokenized_docs = [our_tokenizer(doc, stops=stopwords, stemmer=SnowballStemmer('english')) for doc in corpus]   
tokenized_docs

[['jeff', 'stole', 'octopus', 'sandwich'],
 ['help', 'sob', 'sandwichless'],
 ['drop', 'sandwich', 'said', 'sandwich', 'polic']]

In [8]:
# set of the stemmed vocabulary in the documents
vocab_set = set()
for doc in tokenized_docs:
    vocab_set.update(doc)

In [9]:
# convert the list to set
vocab = sorted(list(vocab_set))
print(vocab)

['drop', 'help', 'jeff', 'octopus', 'polic', 'said', 'sandwich', 'sandwichless', 'sob', 'stole']


In [0]:
## Count Vectorizer, TFIDF

In [0]:
# A way to count frequency of words on the document list --> count vectorizer --> it just counts!
# Another way to do so: TFIDF --> it allows to give weight to the word even though it can be appear rarely --> it has two components: 
    # term frequency (TF) [number of times word appears on document/total number of words on document]
    # inverse document frequency (IDF) --> log(document frequency)
        # document frequency --> number of documents containing word/total number of documents
    ## TFIDF = TF*IDF
    
#With these methods we turn documents into vectors and we can put them into any ML algorithm

In [2]:
# looks at the angled distance between two vectors --> the lower the value, the most distant they are
cosine_similarity([[0, 0, 0.275, 0.275, 0, 0, 0.101, 0, 0, 0.275], [0.22, 0, 0, 0, 0.22, 0.22, 0.162, 0, 0, 0]])

array([[1.        , 0.08115802],
       [0.08115802, 1.        ]])

In [10]:
# Example with spam data
df = pd.read_table('SMSSpamCollection', header=None)

In [13]:
df.columns=['spam', 'msg']
df.head(10)

Unnamed: 0,spam,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [14]:
# cleanse the data from stopwords and punctuation: we split words (x.split() is a list of words) by space from df.msg, ''.join allows to join words as total string again

stopwords_set = set(stopwords)
punctuation_set = set(string.punctuation)
df['msg_cleaned'] = df.msg.apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords_set and word not in punctuation_set]))

In [15]:
df['msg_cleaned'] = df.msg_cleaned.str.lower()
df.head(2)

Unnamed: 0,spam,msg,msg_cleaned
0,ham,"Go until jurong point, crazy.. Available only ...","go jurong point, crazy.. available bugis n gre..."
1,ham,Ok lar... Joking wif u oni...,ok lar... joking wif u oni...


In [16]:
count_vect = CountVectorizer()
X = count_vect.fit_transform(df.msg_cleaned)
X.shape    # gives number of features --> each word is a column

(5572, 8703)

In [17]:
y = df.spam

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [19]:
lg = LogisticRegression()     #classification problem
lg.fit(X_train,y_train)
y_pred = lg.predict(X_test)
lg.score(X_test,y_test)

0.9856424982053122

In [20]:
confusion_matrix(y_test, y_pred)

array([[1187,    3],
       [  17,  186]])

In [0]:
## Tweak model with spam data --> see what is the best model

In [27]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.msg_cleaned)
y = df.spam
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [31]:
# random forest classifier
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
rf.score(X_test,y_test)

0.9748743718592965

In [33]:
confusion_matrix(y_test,y_pred)

array([[1216,    0],
       [  35,  142]])

In [34]:
# gradient boost classifier
gb = GradientBoostingClassifier()
gb.fit(X_train,y_train)
y_pred = gb.predict(X_test)
gb.score(X_test,y_test)

0.9669777458722182

In [35]:
confusion_matrix(y_test,y_pred)

array([[1210,    6],
       [  40,  137]])

In [36]:
# Try tfidf with bigrams and trigrams --> two or three words per token which are considered as one --> range=(1,3)
tfidf = TfidfVectorizer(ngram_range=(1,3))
X = tfidf.fit_transform(df.msg_cleaned)
y = df.spam
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [37]:
# gradient boost classifier
gb = GradientBoostingClassifier()
gb.fit(X_train,y_train)
y_pred = gb.predict(X_test)
gb.score(X_test,y_test)

0.9705671213208902

In [38]:
confusion_matrix(y_test,y_pred)

array([[1220,    9],
       [  32,  132]])

In [39]:
# logistic regression
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.msg_cleaned)
y = df.spam
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [40]:
lg = LogisticRegression()     #classification problem
lg.fit(X_train,y_train)
y_pred = lg.predict(X_test)
lg.score(X_test,y_test)

0.9526202440775305

In [41]:
confusion_matrix(y_test,y_pred)

array([[1212,    2],
       [  64,  115]])

In [0]:
## Pipelining with Spam Data

In [22]:
# streams the code to use it many times and avoid repetition --> we pass a list of tuples into the pipeline
# preprocessing : takes one column (each cell is a clean version of the message), then we fit through TfidfVectorizer to vectorize it --> each word becomes a column (feature)
pipeline = Pipeline([# ('count_vect', CountVectorizer(stop_words = stopwords_set)), \
                    ('tfidf', TfidfVectorizer(stop_words = stopwords_set)), \
                    ('rf', RandomForestClassifier())])   # spam data is fit into a model

In [23]:
X = df.msg_cleaned   # we pass the cleaned msg to the pipeline
y = df.spam
X_train, X_test, y_train, y_test = train_test_split(X,y)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(pipeline.score(X_test, y_test))
print(confusion_matrix(y_test, y_pred))

0.9755922469490309
[[1202    0]
 [  34  157]]


In [24]:
# streams the code to use it many times and avoid repetition --> we pass a list of tuples into the pipeline
# preprocessing : takes one column (each cell is a clean version of the message), then we fit through CountVectorizer to vectorize it --> each word becomes a column (feature)
pipeline = Pipeline([ ('count_vect', CountVectorizer(stop_words = stopwords_set)), \
                    #('tfidf', TfidfVectorizer(stop_words = stopwords_set)), \
                    ('lg', LogisticRegression())])   # spam data is fit into a model

In [25]:
X = df.msg_cleaned   # we pass the cleaned msg to the pipeline
y = df.spam
X_train, X_test, y_train, y_test = train_test_split(X,y)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(pipeline.score(X_test, y_test))
print(confusion_matrix(y_test, y_pred))

0.9798994974874372
[[1214    3]
 [  25  151]]
