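### What is Natural Language Processing (NLP)?

NLP is the area of computing concerned with processing and analyzing human language. Typical applications include: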

* Text Classification
* Spam Detection
* Sentiment Analysis
* Financial Markets
* Automatic Summarization
* Translation
* Chatbots
* Speech 
* Generating Captions for Photographs
* ...

### How to process data?


#### Bag of Words (BoW)

* Tokens
* Stop words
* Stemming 
* Lemmatization


In [6]:
# Toy corpus: two short documents used throughout the bag-of-words walkthrough.
docs = [
    'the quick brown fox',
    'jumped over the lazy dog',
]

In [7]:
%matplotlib inline
from collections import Counter
import pandas as pd

In [8]:
# Tokenize each document on whitespace: one list of tokens per document.
bows = list(map(str.split, docs))
print(bows)

[['the', 'quick', 'brown', 'fox'], ['jumped', 'over', 'the', 'lazy', 'dog']]


In [9]:
# Turn every token list into a token -> occurrence-count mapping.
bows = list(map(Counter, bows))
print(bows)

[Counter({'the': 1, 'quick': 1, 'brown': 1, 'fox': 1}), Counter({'jumped': 1, 'over': 1, 'the': 1, 'lazy': 1, 'dog': 1})]


In [10]:
# One row per document, one column per distinct word; a word that does not
# occur in a document shows up as NaN in that row.
df = pd.DataFrame(data=bows)
df

Unnamed: 0,brown,dog,fox,jumped,lazy,over,quick,the
0,1.0,,1.0,,,,1.0,1
1,,1.0,,1.0,1.0,1.0,,1


In [11]:
# A missing word means "occurred zero times", so replace every NaN with 0.
df = df.fillna(value=0)
df

Unnamed: 0,brown,dog,fox,jumped,lazy,over,quick,the
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1
1,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1


In [12]:
# Per-column summary statistics of the word counts.
df.describe()

Unnamed: 0,brown,dog,fox,jumped,lazy,over,quick,the
count,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
mean,0.5,0.5,0.5,0.5,0.5,0.5,0.5,1.0
std,0.707107,0.707107,0.707107,0.707107,0.707107,0.707107,0.707107,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,0.25,0.25,0.25,0.25,0.25,0.25,0.25,1.0
50%,0.5,0.5,0.5,0.5,0.5,0.5,0.5,1.0
75%,0.75,0.75,0.75,0.75,0.75,0.75,0.75,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [13]:
# Attach a hand-written label per document (first document = spam, second = not spam).
df = df.assign(is_spam=[1, 0])

In [14]:
from sklearn import linear_model
# Linear regression model that is later fitted on the word counts and the is_spam column.
# NOTE(review): is_spam is a 0/1 label, so a classifier (e.g. LogisticRegression) may be
# the better choice — confirm the intent before reusing this beyond the demo.
model = linear_model.LinearRegression()

In [15]:
# Select the features by dropping the label by name. The positional slice
# df.columns[:-1] is only correct while 'is_spam' happens to be the last column.
X_train = df.drop(columns=['is_spam'])
# Keep the target as a one-column DataFrame (2-D), exactly as before.
y_train = df[['is_spam']]
# fit() is the last expression, so the cell displays the fitted estimator.
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

In [17]:
docs = [
    'the quick brown fox',
    'jumped over the lazy dog',
]
# CountVectorizer takes over the job done by hand with collections.Counter above:
# it counts the words of each document against a shared vocabulary.
vectorizer = CountVectorizer()

In [18]:
# Show the vectorizer's configuration (constructed with no arguments, so all defaults).
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [19]:
# Learn the vocabulary from the two documents.
vectorizer.fit(docs)
# Display the corpus; the output shows it is unchanged by fitting.
docs

['the quick brown fox', 'jumped over the lazy dog']

In [20]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favor of
# get_feature_names_out(); use whichever this installation provides. .tolist() keeps the
# displayed value a plain list of str, as with the old method.
(
    vectorizer.get_feature_names_out().tolist()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)

['brown', 'dog', 'fox', 'jumped', 'lazy', 'over', 'quick', 'the']

In [21]:
# Second vectorizer: ngram_range=(1, 2) asks for single words and adjacent word pairs.
vectorizer2 = CountVectorizer(ngram_range=(1, 2))
# Display the corpus that will be used to fit it.
docs

['the quick brown fox', 'jumped over the lazy dog']

In [22]:
# Learn the unigram + bigram vocabulary from the same two documents.
vectorizer2.fit(docs)
# Display the corpus; the output shows it is unchanged by fitting.
docs

['the quick brown fox', 'jumped over the lazy dog']

In [23]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favor of
# get_feature_names_out(); use whichever this installation provides. .tolist() keeps the
# displayed value a plain list of str, as with the old method.
(
    vectorizer2.get_feature_names_out().tolist()
    if hasattr(vectorizer2, 'get_feature_names_out')
    else vectorizer2.get_feature_names()
)

['brown',
 'brown fox',
 'dog',
 'fox',
 'jumped',
 'jumped over',
 'lazy',
 'lazy dog',
 'over',
 'over the',
 'quick',
 'quick brown',
 'the',
 'the lazy',
 'the quick']

In [24]:
# A new, unseen document to encode with the already-fitted vectorizer.
docs2 = [
    'I am too clumsy to jump over the cat',
]
docs2

['I am too clumsy to jump over the cat']

In [25]:
# Encode the new document against the vocabulary learned from `docs`.
# The result is a sparse count matrix with one row per document.
vectors = vectorizer.transform(docs2)
vectors

<1x8 sparse matrix of type '<class 'numpy.int64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [26]:
# Build the dense matrix straight from the vectorizer instead of re-assigning `vectors`
# from itself: `vectors = vectors.todense()` fails when the cell is run a second time,
# because the dense result no longer has a .todense() method.
vectors = vectorizer.transform(docs2).todense()
vectors

matrix([[0, 0, 0, 0, 0, 1, 0, 1]], dtype=int64)

In [27]:
# Combined corpus: the unseen document first, followed by the two training documents.
docs3 = [*docs2, *docs]
docs3

['I am too clumsy to jump over the cat',
 'the quick brown fox',
 'jumped over the lazy dog']

In [28]:
# Encode all three documents against the vocabulary learned from `docs`.
# The result is a sparse count matrix with one row per document.
vectors = vectorizer.transform(docs3)
vectors

<3x8 sparse matrix of type '<class 'numpy.int64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [29]:
# Build the dense matrix straight from the vectorizer instead of re-assigning `vectors`
# from itself: `vectors = vectors.todense()` fails when the cell is run a second time,
# because the dense result no longer has a .todense() method.
vectors = vectorizer.transform(docs3).todense()
vectors

matrix([[0, 0, 0, 0, 0, 1, 0, 1],
        [1, 0, 1, 0, 0, 0, 1, 1],
        [0, 1, 0, 1, 1, 1, 0, 1]], dtype=int64)

In [30]:
# Wrap the dense count matrix in a DataFrame; at this point the columns are
# still positional integers rather than words.
doc_vectors = pd.DataFrame(data=vectors)
doc_vectors

Unnamed: 0,0,1,2,3,4,5,6,7
0,0,0,0,0,0,1,0,1
1,1,0,1,0,0,0,1,1
2,0,1,0,1,1,1,0,1


In [31]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favor of
# get_feature_names_out(); use whichever this installation provides. .tolist() keeps the
# displayed value a plain list of str, as with the old method.
(
    vectorizer.get_feature_names_out().tolist()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)

['brown', 'dog', 'fox', 'jumped', 'lazy', 'over', 'quick', 'the']

In [32]:
# Label each count column with the word it represents.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favor of
# get_feature_names_out(); use whichever this installation provides.
doc_vectors.columns = (
    vectorizer.get_feature_names_out().tolist()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)
doc_vectors

Unnamed: 0,brown,dog,fox,jumped,lazy,over,quick,the
0,0,0,0,0,0,1,0,1
1,1,0,1,0,0,0,1,1
2,0,1,0,1,1,1,0,1


In [33]:
# Put the original text next to its count vector so each row can be read at a glance.
doc_vectors = doc_vectors.assign(text=docs3)
doc_vectors

Unnamed: 0,brown,dog,fox,jumped,lazy,over,quick,the,text
0,0,0,0,0,0,1,0,1,I am too clumsy to jump over the cat
1,1,0,1,0,0,0,1,1,the quick brown fox
2,0,1,0,1,1,1,0,1,jumped over the lazy dog
