# NLP (Natural language processing)

In [1]:
import numpy as np
from collections import Counter
import pandas as pd
import nltk
#new line
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.stem import SnowballStemmer
import string
from scipy.spatial.distance import pdist, squareform
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression

# from sklearn.cross_validation import train_test_split
# New module is
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline  import Pipeline, FeatureUnion, make_pipeline

print("Imported Modules")

Imported Modules


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alwinsolair/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
stops = set(nltk.corpus.stopwords.words('english'))

In [3]:
#stops

In [4]:
corpus = ["Jeff stole my octopus sandwich.", 
    "'Help!' I sobbed, sandwichlessly.", 
    "'Drop the sandwiches!' said the sandwich police."]

## How do I turn a corpus of documents into a feature matrix?

**Words --> numbers?????**

**Corpus: list of documents**

 [
     "Jeff stole my octopus sandwich.", 
     "'Help!' I sobbed, sandwichlessly.", 
     "'Drop the sandwiches!' said the sandwich police."
 ]

In [5]:
def our_tokenizer(doc, stops=None, stemmer=None):
    doc = word_tokenize(doc.lower())
    tokens = [''.join([char for char in tok if char not in string.punctuation]) for tok in doc]
    tokens = [tok for tok in tokens if tok]
    if stops:
        tokens = [tok for tok in tokens if (tok not in stops)]
    if stemmer:
        tokens = [stemmer.stem(tok) for tok in tokens]
    return tokens

In [6]:
tokenized_docs = [our_tokenizer(doc) for doc in corpus]
tokenized_docs

[['jeff', 'stole', 'my', 'octopus', 'sandwich'],
 ['help', 'i', 'sobbed', 'sandwichlessly'],
 ['drop', 'the', 'sandwiches', 'said', 'the', 'sandwich', 'police']]

**Step 1: lowercase, lose punction, split into tokens**

    [
     ['jeff', 'stole', 'my', 'octopus', 'sandwich'],
     ['help', 'i', 'sobbed', 'sandwichlessly'],
     ['drop', 'the', 'sandwiches', 'said', 'the', 'sandwich', 'police']
    ]

In [7]:
stopwords = set(nltk.corpus.stopwords.words('english'))

In [8]:
'i' in stopwords

True

In [9]:
tokenized_docs = [our_tokenizer(doc, stops=stopwords) for doc in corpus]
tokenized_docs

[['jeff', 'stole', 'octopus', 'sandwich'],
 ['help', 'sobbed', 'sandwichlessly'],
 ['drop', 'sandwiches', 'said', 'sandwich', 'police']]

**Step 2: remove stop words**

    [
     ['jeff', 'stole', 'octopus', 'sandwich'],
     ['help', 'sobbed', 'sandwichlessly'],
     ['drop', 'sandwiches', 'said', 'sandwich', 'police']
    ]

In [10]:
tokenized_docs = [our_tokenizer(doc, stops=stopwords, stemmer=SnowballStemmer('english')) for doc in corpus]
tokenized_docs

[['jeff', 'stole', 'octopus', 'sandwich'],
 ['help', 'sob', 'sandwichless'],
 ['drop', 'sandwich', 'said', 'sandwich', 'polic']]

**Step 3: Stemming/Lemmatization**

    [
     ['jeff', 'stole', 'octopus', 'sandwich'],
     ['help', 'sobbed', 'sandwichlessly'],
     ['drop', u'sandwich', 'said', 'sandwich', 'police']
    ]

**OK now what?**

Vocabulary:

    ['drop', 'help', 'jeff', 'octopus', 'police', 'said', 'sandwich', 'sandwichlessly', 'sobbed', 'stole']


In [11]:
vocab_set = set()

In [12]:
for doc in tokenized_docs:
    vocab_set.update(doc)

In [13]:
vocab = sorted(list(vocab_set))
print(vocab)

['drop', 'help', 'jeff', 'octopus', 'polic', 'said', 'sandwich', 'sandwichless', 'sob', 'stole']


### Count Vectorizer, TFIDF

Count vectorization

***Vocabulary:***

    ['drop', 'help', 'jeff', 'octopus', 'police', 'said', 'sandwich', 'sandwichlessly', 'sobbed', 'stole']
    ['jeff', 'stole', 'octopus', 'sandwich']
    [0, 0, 1, 1, 0, 0, 1, 0, 0, 1]

    ['help', 'sobbed', 'sandwichlessly']
    [0, 1, 0, 0, 0, 0, 0, 1, 1, 0]

    ['drop', u'sandwich', 'said', 'sandwich', 'police']
    [1, 0, 0, 0, 1, 1, 2, 0, 0, 0]
    
**Term frequency**

$$TF_{word,document} = \frac{\#\_of\_times\_word\_appears\_in\_document}{total\_\#\_of\_words\_in\_document}$$

    ['jeff', 'stole', 'octopus', 'sandwich']
    [0, 0, 1/4, 1/4, 0, 0, 1/4, 0, 0, 1/4]

    ['help', 'sobbed', 'sandwichlessly']
    [0, 1/3, 0, 0, 0, 0, 0, 1/3, 1/3, 0]

    ['drop', u'sandwich', 'said', 'sandwich', 'police']
    [1/5, 0, 0, 0, 1/5, 1/5, 2/5, 0, 0, 0]

### Document frequency

$$ DF_{word} = \frac{\#\_of\_documents\_containing\_word}{total\_\#\_of\_documents} $$
Vocabulary:

    ['drop', 'help', 'jeff', 'octopus', 'police', 'said', 'sandwich', 'sandwichlessly', 'sobbed', 'stole']

**Document frequency for each word:**

    [1/3, 1/3, 1/3, 1/3, 1/3, 1/3, 2/3, 1/3, 1/3, 1/3]

### Inverse document frequency

$$ IDF_{word} = \log\left(\frac{total\_\#\_of\_documents}{\#\_of\_documents\_containing\_word}\right) $$

**Vocabulary:**

    ['drop', 'help', 'jeff', 'octopus', 'police', 'said', 'sandwich', 'sandwichlessly', 'sobbed', 'stole']

**IDF for each word:**

    [1.099, 1.099, 1.099, 1.099, 1.099, 1.099, 0.405, 1.099, 1.099, 1.099]

### TFIDF

**Vocabulary:**

    ['drop', 'help', 'jeff', 'octopus', 'police', 'said', 'sandwich', 'sandwichlessly', 'sobbed', 'stole']

**TF * IDF:**

    ['jeff', 'stole', 'octopus', 'sandwich']
    [0, 0, 0.275, 0.275, 0, 0, 0.101, 0, 0, 0.275]

    ['help', 'sobbed', 'sandwichlessly']
    [0, 0.366, 0, 0, 0, 0, 0, 0.366, 0.366, 0]

    ['drop', u'sandwich', 'said', 'sandwich', 'police']
    [0.22, 0, 0, 0, 0.22, 0.22, 0.162, 0, 0, 0]

Now that we have turned our DOCUMENTS into VECTORS, we can put them into whatever machine learning algorithm we want! 
We can use whatever kind of similarity measure we please!

Wow!

In [14]:
cosine_similarity([[0, 0, 0.275, 0.275, 0, 0, 0.101, 0, 0, 0.275],  [0.22, 0, 0, 0, 0.22, 0.22, 0.162, 0, 0, 0]])

array([[1.        , 0.08115802],
       [0.08115802, 1.        ]])

In [15]:
cosine_similarity([[0, 0.366, 0, 0, 0, 0, 0, 0.366, 0.366, 0],  [0.22, 0, 0, 0, 0.22, 0.22, 0.162, 0, 0, 0]])

array([[1., 0.],
       [0., 1.]])

### Example with Spam data

In [16]:
#revisit spam ham example

In [17]:
df= pd.read_table('data/SMSSpamCollection', header=None)

In [18]:
df.head(3)

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [19]:
df.columns=['spam', 'msg']

In [20]:
df.head(2)

Unnamed: 0,spam,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...


In [21]:
stopwords_set=set(stopwords)

punctuation_set=set(string.punctuation)

In [22]:
len(stopwords_set)

179

In [23]:
len(punctuation_set)

32

In [24]:
df['msg_cleaned']= df.msg.apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords_set \
                                                   and word not in punctuation_set]))

In [25]:
str1='Go until jurong point, crazy'.split()
' '.join(str1)

'Go until jurong point, crazy'

In [26]:
df.head(2)

Unnamed: 0,spam,msg,msg_cleaned
0,ham,"Go until jurong point, crazy.. Available only ...","Go jurong point, crazy.. Available bugis n gre..."
1,ham,Ok lar... Joking wif u oni...,Ok lar... Joking wif u oni...


In [27]:
df['msg_cleaned']= df.msg_cleaned.str.lower()

In [28]:
df.head(2)

Unnamed: 0,spam,msg,msg_cleaned
0,ham,"Go until jurong point, crazy.. Available only ...","go jurong point, crazy.. available bugis n gre..."
1,ham,Ok lar... Joking wif u oni...,ok lar... joking wif u oni...


In [29]:
count_vect= CountVectorizer()

In [30]:
X= count_vect.fit_transform(df.msg_cleaned)

In [31]:
X.shape

(5572, 8703)

In [32]:
y=df.spam

In [33]:
X_train, X_test, y_train, y_test= train_test_split(X,y)

In [34]:
lg= LogisticRegression()

lg.fit(X_train,y_train)
y_pred=lg.predict(X_test)
lg.score(X_test,y_test)

0.9842067480258435

In [35]:
confusion_matrix(y_test, y_pred)

array([[1212,    1],
       [  21,  159]])

In [36]:
y_pred

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

### Tweak model with Spam data

In [37]:
## try tfidf  

tfidf= TfidfVectorizer()

In [38]:
df.head(2)

Unnamed: 0,spam,msg,msg_cleaned
0,ham,"Go until jurong point, crazy.. Available only ...","go jurong point, crazy.. available bugis n gre..."
1,ham,Ok lar... Joking wif u oni...,ok lar... joking wif u oni...


In [39]:
X= tfidf.fit_transform(df.msg_cleaned)
y=df.spam 
X_train, X_test, y_train, y_test= train_test_split(X,y)

In [40]:
## try random forest 
rf= RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred=rf.predict(X_test)
rf.score(X_test,y_test)

0.9763101220387652

In [41]:
confusion_matrix(y_test, y_pred)

array([[1215,    2],
       [  31,  145]])

In [42]:
#try gradient boost 
gb= GradientBoostingClassifier()
gb.fit(X_train,y_train)
y_pred=gb.predict(X_test)
gb.score(X_test,y_test)

0.9641062455132807

In [43]:
confusion_matrix(y_test, y_pred)

array([[1202,   15],
       [  35,  141]])

In [44]:
# Try tfidf with bigrams & trigrams 
tfidf=TfidfVectorizer(ngram_range=(1,3))

In [45]:
X= tfidf.fit_transform(df.msg_cleaned)
y=df.spam
X_train, X_test, y_train, y_test= train_test_split(X,y)

In [46]:
#try gradient boost 
gb= GradientBoostingClassifier()
gb.fit(X_train,y_train)
y_pred=gb.predict(X_test)
gb.score(X_test,y_test)

0.9705671213208902

In [47]:
confusion_matrix(y_test, y_pred)

array([[1198,   11],
       [  30,  154]])

In [48]:
tfidf=TfidfVectorizer()

In [49]:
X=tfidf.fit_transform(df.msg_cleaned)
y=df.spam
X_train, X_test, y_train, y_test=  train_test_split(X,y)

In [50]:
lg= LogisticRegression()
lg.fit(X_train,y_train)
y_pred=lg.predict(X_test)
lg.score(X_test,y_test)

0.9504666188083274

In [51]:
confusion_matrix(y_test, y_pred)

array([[1198,    1],
       [  68,  126]])

### Pipeline with Spam data

In [52]:
pipeline= Pipeline([('countvect', CountVectorizer(stop_words=stopwords_set)),\
                    #('tfidf', TfidfVectorizer(stop_words=stopwords_set)),\
                    ('lg',  LogisticRegression())])

In [53]:
X=df.msg_cleaned #note we are passing the cleaned msg to the pipeline 
y=df.spam
X_train, X_test, y_train, y_test= train_test_split(X,y) 


pipeline.fit(X_train, y_train) 
y_pred= pipeline.predict(X_test)
print(pipeline.score(X_test, y_test))
print(confusion_matrix(y_test, y_pred))

0.9863603732950467
[[1224    0]
 [  19  150]]


In [54]:
pipeline= Pipeline([#('countvect', CountVectorizer(stop_words=stopwords_set)),\
                    ('countvect', CountVectorizer(stop_words=stopwords_set)),\
                    ('rf',  RandomForestClassifier())])

In [55]:
X=df.msg_cleaned #note we are passing the cleaned msg to the pipeline 
y=df.spam
X_train, X_test, y_train, y_test= train_test_split(X,y) 


pipeline.fit(X_train, y_train) 
y_pred= pipeline.predict(X_test)
print(pipeline.score(X_test, y_test))
print(confusion_matrix(y_test, y_pred))  

# the best one so far!

0.9798994974874372
[[1221    0]
 [  28  144]]


In [56]:
pipeline= Pipeline([#('countvect', CountVectorizer(stop_words=stopwords_set)),\
                    ('countvect', CountVectorizer(stop_words=stopwords_set, ngram_range=(1,3))),\
                    ('rf',  RandomForestClassifier())])

In [57]:
X=df.msg_cleaned #note we are passing the cleaned msg to the pipeline 
y=df.spam
X_train, X_test, y_train, y_test= train_test_split(X,y) 

pipeline.fit(X_train, y_train) 
y_pred= pipeline.predict(X_test)
print(pipeline.score(X_test, y_test))
print(confusion_matrix(y_test, y_pred))

0.9626704953338119
[[1202    2]
 [  50  139]]
