# NLP (Natural Language Processing)

In [1]:
# Standard library
import string
from collections import Counter

# Third-party
import nltk
import numpy as np
import pandas as pd
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from scipy.spatial.distance import pdist, squareform
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
# train_test_split is imported from sklearn.model_selection
# (it used to live in sklearn.cross_validation).
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Fetch the NLTK resources this notebook relies on so that it also runs on a
# machine with an empty nltk_data directory:
#   - 'stopwords'          : the English stop-word list
#   - 'punkt' / 'punkt_tab': tokenizer models loaded by word_tokenize
#     (older NLTK releases read 'punkt', newer ones read 'punkt_tab')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

print("Imported Modules")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alwinsolair/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Imported Modules


In [2]:
# English stop words from NLTK, stored as a set for fast membership tests.
stops = set(nltk.corpus.stopwords.words('english'))

In [3]:
# Uncomment the next line to inspect the full set of English stop words.
# stops

In [4]:
# Toy corpus: three one-sentence documents used throughout the walkthrough.
corpus = [
    "Jeff stole my octopus sandwich.",
    "'Help!' I sobbed, sandwichlessly.",
    "'Drop the sandwiches!' said the sandwich police.",
]

## How do I turn a corpus of documents into a feature matrix?

**Words --> numbers?????**

**Corpus: list of documents**

 [
     "Jeff stole my octopus sandwich.", 
     "'Help!' I sobbed, sandwichlessly.", 
     "'Drop the sandwiches!' said the sandwich police."
 ]

In [5]:
def our_tokenizer(doc, stops=None, stemmer=None):
    """Turn one document string into a list of cleaned, lowercase tokens.

    Steps, in order: lowercase the text and split it with ``word_tokenize``;
    delete every ``string.punctuation`` character from each token; drop
    tokens left empty; optionally drop stop words; optionally stem the rest.

    Parameters
    ----------
    doc : str
        Raw text of a single document.
    stops : collection of str, optional
        Tokens to discard. The filter is skipped when this is falsy
        (``None`` or empty).
    stemmer : object with a ``stem(token)`` method, optional
        Applied to every surviving token. Skipped when falsy.

    Returns
    -------
    list of str
        The processed tokens, in document order.
    """
    # Translation table that deletes all ASCII punctuation characters.
    strip_punct = str.maketrans('', '', string.punctuation)

    cleaned = []
    for raw in word_tokenize(doc.lower()):
        word = raw.translate(strip_punct)
        if not word:
            continue  # the token consisted of punctuation only
        if stops and word in stops:
            continue
        cleaned.append(word)

    if not stemmer:
        return cleaned
    return [stemmer.stem(word) for word in cleaned]

In [6]:
# Baseline pass: tokenize every document with no stop-word removal or stemming.
tokenized_docs = list(map(our_tokenizer, corpus))
tokenized_docs

[['jeff', 'stole', 'my', 'octopus', 'sandwich'],
 ['help', 'i', 'sobbed', 'sandwichlessly'],
 ['drop', 'the', 'sandwiches', 'said', 'the', 'sandwich', 'police']]

**Step 1: lowercase, strip punctuation, split into tokens**

    [
     ['jeff', 'stole', 'my', 'octopus', 'sandwich'],
     ['help', 'i', 'sobbed', 'sandwichlessly'],
     ['drop', 'the', 'sandwiches', 'said', 'the', 'sandwich', 'police']
    ]

In [7]:
# Same construction as `stops` earlier in the notebook: the NLTK English
# stop-word list as a set. This is the name passed as `stops=` to
# our_tokenizer in the cells that follow.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [8]:
# Sanity check: is the lowercase token 'i' in the stop-word set?
'i' in stopwords

True

In [9]:
# Second pass: same tokenization, but English stop words are filtered out.
tokenized_docs = [
    our_tokenizer(text, stops=stopwords)
    for text in corpus
]
tokenized_docs

[['jeff', 'stole', 'octopus', 'sandwich'],
 ['help', 'sobbed', 'sandwichlessly'],
 ['drop', 'sandwiches', 'said', 'sandwich', 'police']]

**Step 2: remove stop words**

    [
     ['jeff', 'stole', 'octopus', 'sandwich'],
     ['help', 'sobbed', 'sandwichlessly'],
     ['drop', 'sandwiches', 'said', 'sandwich', 'police']
    ]

In [10]:
# Third pass: stop-word removal plus Snowball stemming.
# The stemmer is built once and shared across documents instead of
# constructing a new SnowballStemmer for every document in the comprehension.
snowball_stemmer = SnowballStemmer('english')
tokenized_docs = [
    our_tokenizer(doc, stops=stopwords, stemmer=snowball_stemmer)
    for doc in corpus
]
tokenized_docs

[['jeff', 'stole', 'octopus', 'sandwich'],
 ['help', 'sob', 'sandwichless'],
 ['drop', 'sandwich', 'said', 'sandwich', 'polic']]

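**Step 3: Stemming/Lemmatization**

The list below shows a lighter, lemmatization-style result in which only `sandwiches` is reduced to `sandwich`. The Snowball stemmer used in the cell above is more aggressive and also produces `sob`, `sandwichless`, and `polic`.

    [
     ['jeff', 'stole', 'octopus', 'sandwich'],
     ['help', 'sobbed', 'sandwichlessly'],
     ['drop', 'sandwich', 'said', 'sandwich', 'police']
    ]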

**OK now what?**

Vocabulary (the sorted set of unique tokens across the token lists above):

    ['drop', 'help', 'jeff', 'octopus', 'police', 'said', 'sandwich', 'sandwichlessly', 'sobbed', 'stole']


In [11]:
# Start with an empty set; it will hold the unique tokens across all documents.
vocab_set = set()

In [12]:
# Rebuild the vocabulary from the current tokenized_docs instead of adding to
# whatever vocab_set already holds, so that re-running this cell after
# re-tokenizing cannot leave stale tokens behind.
vocab_set = set().union(*tokenized_docs)

In [14]:
# sorted() accepts any iterable and always returns a new list, so the
# intermediate list() copy is unnecessary.
vocab = sorted(vocab_set)
print(vocab)

['drop', 'help', 'jeff', 'octopus', 'polic', 'said', 'sandwich', 'sandwichless', 'sob', 'stole']
