# Natural Language Processing 2 - Additional Topics

### Reference Links -
1. CountVectorizer Function Details - https://kavita-ganesan.com/how-to-use-countvectorizer/#.Xr5vkmgzY2w
2. Text PreProcessing Step - https://medium.com/@datamonsters/text-preprocessing-in-python-steps-tools-and-examples-bf025f872908
3. Word Clouds - https://www.datacamp.com/community/tutorials/wordcloud-python
4. Naive Bayes - https://www.datacamp.com/community/tutorials/naive-bayes-scikit-learn
5. NB for NLP - https://towardsdatascience.com/implementing-a-naive-bayes-classifier-for-text-categorization-in-five-steps-f9192cdd54c3

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the SMS dataset: tab-separated, no header row, so the two column names are supplied here.
# NOTE(review): pandas' default quoting treats '"' as a quote character, so messages that
# contain double quotes may be merged across lines. Compare the row count with the dataset's
# documented size and consider quoting=csv.QUOTE_NONE if they differ - TODO confirm.
data = pd.read_csv('data/SMSSpamCollection', sep='\t' , names=['label','message'])

In [None]:
# Preview the first rows.
data.head()

In [None]:
# Column dtypes and non-null counts.
data.info()

#### **Exploratory Data Analysis (EDA)**

In [None]:
# Summary statistics for the whole frame.
data.describe()

In [None]:
# The same summary, computed separately for each label.
data.groupby('label').describe()

### Visual  EDA

In [None]:
# Character count of every message, stored as a new column.
data['msgLen'] = data['message'].map(len)
data.head()

In [None]:
data[data['label']=='spam'].msgLen.plot(bins=50,kind='hist')

In [None]:
data[data['label']=='ham'].msgLen.plot(bins=50,kind='hist')

In [None]:
data[data['label']=='spam'].msgLen.max()

In [None]:
data[data['label']=='spam'].msgLen.mean()

In [None]:
data[data['label']=='spam'].msgLen.min()

In [None]:
data[data['label']=='ham'].msgLen.max()

In [None]:
data[data['label']=='ham'].msgLen.mean()

In [None]:
data[data['label']=='ham'].msgLen.min()

In [None]:
data.groupby('label').msgLen.describe()

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _englishStopwords():
    """Return NLTK's English stopword list as a frozenset, loaded only once."""
    from nltk.corpus import stopwords
    return frozenset(stopwords.words('english'))


def textPreprocessing(data):
    """Tokenise one message: strip punctuation, split on whitespace, drop stopwords.

    Parameters
    ----------
    data : str
        Raw message text.

    Returns
    -------
    list of str
        The remaining words in their original order and original casing;
        only the stopword comparison is case-insensitive.
    """
    import string

    # Delete every ASCII punctuation character in a single pass.
    withoutPunctuation = data.translate(str.maketrans('', '', string.punctuation))

    # The stopword set is built once and cached. Previously the stopword list was
    # rebuilt and scanned linearly for every single word of every message.
    englishStopwords = _englishStopwords()

    return [word for word in withoutPunctuation.split()
            if word.lower() not in englishStopwords]

In [None]:
# Try the preprocessing on the first five messages; the result is only displayed, not stored.
data['message'].head().apply(textPreprocessing)

In [None]:
# The frame itself is unchanged by the preview above.
data.head()

In [None]:
# Create Bag of Words - sklearn package CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
?CountVectorizer
# With its default analyzer, CountVectorizer:
#1. lowercases your text (set lowercase=False if you don’t want lowercasing)
#2. performs tokenization (converts raw text to smaller units of text)
#3. uses word level tokenization (meaning each word is treated as a separate token)
#4. ignores single characters during tokenization (say goodbye to words like ‘a’ and ‘I’)
# When a callable is passed as `analyzer`, it receives the raw text and these steps are skipped.

In [None]:
# Bag-of-words vocabulary built from our own tokeniser; tokens seen in fewer than 3 messages are dropped.
bow = CountVectorizer(min_df=3, analyzer=textPreprocessing)
bow = bow.fit(data['message'])

In [None]:
# Show the resulting vocabulary; the numbers are not counts, they are the position in the sparse vector.
bow.vocabulary_

In [None]:
# Number of distinct tokens kept.
len(bow.vocabulary_)

In [None]:
# Encode every message as a row of token counts (sparse matrix).
message_bow = bow.transform(data['message'])

In [None]:
# (number of messages, vocabulary size)
message_bow.shape

In [None]:
# Dense view of the first rows, for inspection only.
pd.DataFrame(message_bow.toarray()).head()

**Working With N-Grams - Word level – bigrams only**

In [None]:
bow2gram = CountVectorizer(min_df=3,ngram_range=(2,2)).fit(data['message'])

In [None]:
bow2gram.vocabulary_

In [None]:
len(bow.vocabulary_)

In [None]:
# Word Cloud - https://www.datacamp.com/community/tutorials/wordcloud-python
## conda install -c conda-forge wordcloud
## conda update --all
import wordcloud

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
# Stopword set shipped with the wordcloud package; read as a global by show_wordcloud below.
stopwords = set(STOPWORDS)

def show_wordcloud(data, title = None):
    """Render a word cloud for some text.

    Parameters
    ----------
    data : str or iterable of str
        A single string, or a collection of strings (e.g. a pandas Series of
        messages) whose items are joined with spaces before the cloud is built.
    title : str, optional
        Figure title; omitted when falsy.
    """
    # str() of a pandas Series is its display repr (index labels, truncated
    # rows, "Name:/dtype:" footer), not the text of the messages, so join the
    # items explicitly instead of stringifying the container.
    if isinstance(data, str):
        text = data
    else:
        try:
            text = ' '.join(str(item) for item in data)
        except TypeError:
            # Not iterable: fall back to the plain string form.
            text = str(data)

    cloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        max_words=200,
        max_font_size=40,
        scale=3,
        random_state=1 # chosen at random by flipping a coin; it was heads
    ).generate(text)

    fig = plt.figure(1, figsize=(12, 12))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(cloud)
    plt.show()

In [None]:
# Word cloud of the spam messages.
show_wordcloud(data[data['label']=='spam'].message)

In [None]:
# Word cloud of the ham messages.
show_wordcloud(data[data['label']=='ham'].message)

In [None]:
#TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
?TfidfTransformer

In [None]:
# Term-frequency only (no idf weighting). Both names are reassigned by the next cell,
# so this result is for demonstration and is not used further.
tfidfData = TfidfTransformer(use_idf=False).fit(message_bow)
tfidfDataFinal = tfidfData.transform(message_bow)

In [None]:
# Full TF-IDF weighting; this is the transformer and matrix the rest of the notebook uses.
tfidfData = TfidfTransformer(use_idf=True).fit(message_bow)
tfidfDataFinal = tfidfData.transform(message_bow)

In [None]:
# idf weight learned for the token 'WINNER'. The lookup is case-sensitive because
# textPreprocessing keeps the original casing; raises KeyError if the token is not in the vocabulary.
print(tfidfData.idf_[bow.vocabulary_['WINNER']])

In [None]:
# Same shape as message_bow: (number of messages, vocabulary size).
tfidfDataFinal.shape 

In [None]:
# Dense view of the last rows, for inspection only.
pd.DataFrame(tfidfDataFinal.toarray()).tail()

### Classification Machine Learning Model 

In [None]:
# Data is ready to be supplied in a machine learning algo

In [None]:
# Train a multinomial Naive Bayes classifier on the TF-IDF features, using the labels as targets.
# The model is fitted on the full dataset; no rows are held out.
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB().fit(tfidfDataFinal,data['label'])

In [None]:
model

In [None]:
# Predictions for the same rows the model was trained on.
trainPred=model.predict(tfidfDataFinal)

In [None]:
# Confusion table (rows: true label, columns: predicted). Because it is computed on the
# training data, it is an optimistic estimate of how the model performs on unseen messages.
pd.crosstab(data['label'],trainPred)

In [None]:
inputData = 'Win Lottery Guaranteed'
l1 = textPreprocessing(inputData)
l2 = bow.transform(l1)
l3 = tfidfData.transform(l2)

In [None]:
prediction = model.predict_proba(l3[0])
prediction