In [0]:
import nltk

In [0]:
dir(nltk)

In [0]:
from nltk.corpus import stopwords

In [0]:
nltk.download('stopwords')

In [0]:
stopwords.words('english')[0:10]

In [0]:
from google.colab import files
uploaded = files.upload()

In [0]:
# Read the whole file through a context manager so the file handle is closed
# as soon as the contents are in memory.
with open('SMSSpamCollection.tsv') as raw_file:
  raw = raw_file.read()

In [0]:
raw[0:500]

In [0]:
# Here we are replacing the '\t' parameter with '\n' and the splitting the result at '\n'

New_data = raw.replace('\t','\n').split('\n')

In [0]:
New_data[0:5] # First 5 sample data

In [0]:
# Here we are going to separate the Label and Text details into diferent variables

labelList = New_data[0::2] # Its starting from position 0 and running upto the end and including every 2nd variable into labelList.
textList = New_data[1::2] # Its starting from position 1 and running upto the end and including every 2nd variable into textList.

In [0]:
print(labelList[0:5])
print(textList[0:5])

In [0]:
print(len(labelList))
print(len(textList))

In [0]:
# We are creating a Dataframe for the above created list.

import pandas as pd

newdata_df = pd.DataFrame({
    
    'label':labelList[:-1], # We are excluding the last entry as its a blank one. and this helps to match the length of both the lists.
    'bodylist':textList
})

In [0]:
newdata_df.head()

In [0]:
# We can use the pd.read_csv command to read the tsv file as well. 
# This is shortcut for the above process.

dataset = pd.read_csv('SMSSpamCollection.tsv', sep = '\t', header = None) # Header is None as the dataset is not having any column headers
dataset.columns = ['label','body_text']

In [0]:
dataset.head()

## Exploring the *dataset*

In [0]:
print('The input data has {} rows and {} columns'.format(len(dataset),len(dataset.columns))) # Printing the output in a formal statment.

In [0]:
# To find the number of spam/ham records.

print('Out of the {} rows, {} are spam and {} are ham records'.format(len(dataset),
                                                                       len(dataset[dataset['label'] == 'spam']),
                                                                       len(dataset[dataset['label'] == 'ham'])))

In [0]:
# Inorder to find the number of null records.

print('Number of null records in label column: {}'.format(dataset['label'].isnull().sum()))
print('Number of null records in body_text column: {}'.format(dataset['body_text'].isnull().sum()))

### Exploring the functions of the 're' module

In [0]:
import re

In [0]:
re_test = 'This is a made up string to test 2 different regex methods'
re_test_1 = 'This     is    a  made up     string   to test   2 different regex methods'
re_test_2 = 'This-is---a@made>>>>>>>up.string=======to.test.....2.different?regex=methods'

In [0]:
re.split(r'\s', re_test) # Split on every single whitespace character; '\s' matches exactly one whitespace character. Raw string avoids the invalid-escape warning.

In [0]:
re.split(r'\s+', re_test_1) # '\s+' matches a run of one or more whitespace characters between words.

In [0]:
re.split(r'\W+', re_test_2) # '\W+' splits the string wherever one or more non-word characters appear.

In [0]:
# A different approach: instead of splitting on separators, find the text itself.
# Note: only the value of the last expression in the cell is displayed.

re.findall(r'\S+', re_test) # '\S+' matches runs of non-whitespace characters, i.e. the words between the spaces.
re.findall(r'\S+', re_test_1)

In [0]:
  re.findall(r'\w+', re_test_2) # test_2 has no spaces between the words, so we match runs of word characters with '\w+'.

In [0]:
pep8_text = 'I try to follow PEP8 guidelines.'
pep7_text = 'I try to follow PEP7 guidelines.'
peep8_text = 'I try to follow PEEP8 guidelines.'

In [0]:
# Here we are trying to find which guideline the process is following.

print(re.findall('[A-Z]+[0-9]+', pep8_text))
print(re.findall('[A-Z]+[0-9]+', pep7_text))
print(re.findall('[A-Z]+[0-9]+', peep8_text))

In [0]:
# Functionality of the RE substitute function.
# The pattern is: one or more capital letters followed by one or more digits.
re.sub(r'[A-Z]+[0-9]+', 'PEP8 Python Styleguide', pep8_text)

# Similarly it can be done for the remaining texts.


# NLP Basics:

1) Remove punctuations
2) Tokenization
3) Remove stopwords

### 1) Remove Punctuations

In [0]:
import string
string.punctuation # Gives the list of punctuations that needs to be consdiered in any texts

In [0]:
# Creating a function to remove any punctuations from any text variables.

def remove_punt(text):
  """Return `text` with every character found in `string.punctuation` dropped.

  The remaining characters are joined back together, so words stay intact
  instead of being returned as individual characters.
  """
  kept_chars = []
  for ch in text:
    if ch in string.punctuation:
      continue
    kept_chars.append(ch)
  return "".join(kept_chars)

In [0]:
dataset['body_text_clean'] = dataset['body_text'].apply(lambda x: remove_punt(x)) # The lambda function here helps us to call the function for each column values individually.

In [0]:
dataset.head() # All the punctuations are removed in the new column

### 2) Tokenization

In [0]:
# Functions to create tokens
def tokenize(text):
  """Split `text` into tokens on runs of one or more non-word characters.

  Returns the list produced by `re.split`; a leading or trailing separator
  in `text` therefore yields an empty string at that end of the list.
  """
  # Raw string: '\W' is not a valid Python string escape, so a plain literal
  # triggers an invalid-escape-sequence warning.
  return re.split(r'\W+', text)

In [0]:
dataset['body_text_clean_tokens'] = dataset['body_text_clean'].apply(lambda x: tokenize(x.lower()))

In [0]:
dataset.head()

### 3) Remove Stopwords

In [0]:
stopword = nltk.corpus.stopwords.words('english') # Saving all the list of stopwords into a single variable

In [0]:
def cleaned_data(text):
  """Return the items of `text` that are not in the notebook-level `stopword` list.

  `text` is iterated item by item and the original order is preserved.
  """
  kept = []
  for token in text:
    if token not in stopword:
      kept.append(token)
  return kept

In [0]:
dataset['clean_data'] = dataset['body_text_clean_tokens'].apply(lambda x: cleaned_data(x))

In [0]:
dataset.head()

## Supplement Data Cleaning

## Stemming:
 Reducing the derived words to their root words (removing the prefix or suffix of the words)

In [0]:
## Introducing the Porter Stemmer

ps = nltk.PorterStemmer()

In [0]:
# All the functionalities inside the Porter Stemmer

dir(ps)

In [0]:
# Here we are focusing on the 'stem' fuctionality of the Porter Stemmer

print(ps.stem('grows'))
print(ps.stem('growing'))

print(ps.stem('grown'))


# stem - Indentifies the words with same meaning.

In [0]:
dataset.head()

In [0]:
def stemming(text_data):
  """Apply the Porter stemmer `ps` to each item of `text_data` and return the results as a list."""
  stems = []
  for token in text_data:
    stems.append(ps.stem(token))
  return stems

In [0]:
dataset['Stem_data'] = dataset['clean_data'].apply(lambda x: stemming(x))

In [0]:
dataset.head(10)

## Lemmatizing:
It uses a more informed, dictionary-based analysis to group words with a similar meaning under a common base form.

The most common lemmatizer is the WordNet lemmatizer. WordNet is a collection of nouns, verbs, adjectives and adverbs grouped by meaning.

In [0]:
# Calling the lemmatizer function

wn = nltk.WordNetLemmatizer()

In [0]:
# Functions under the WordNet Lemmatizer

dir(wn)

In [0]:
nltk.download('wordnet')

In [0]:
print(wn.lemmatize('grown'))

In [0]:
# Differnce i n using a Porter Stemmer and a Lemmatizer

print(ps.stem('goose'))
print(ps.stem('geese'))
print(wn.lemmatize('goose'))
print(wn.lemmatize('geese'))

In [0]:
def lematize_word(text):
  """Run the WordNet lemmatizer `wn` over each item of `text` and return the lemmas as a list."""
  lemmas = []
  for token in text:
    lemmas.append(wn.lemmatize(token))
  return lemmas

In [0]:
dataset['lemmatize_data'] = dataset['clean_data'].apply(lambda x: lematize_word(x))

In [0]:
dataset.head(10)

Note: The lemmatizer is generally more accurate than the stemmer, but it takes longer to execute.

Important points:
Lemmatization: based on its usage, the machine looks up the appropriate dictionary form of the word.
Stemming: characters are removed from the end of the word by following language-specific rules.

================================================================================================================================================================

Consolidating all the NLTK basic function into a single function

In [0]:
df = pd.read_csv('SMSSpamCollection.tsv', sep = '\t', header = None)
df.columns = ['label', 'body_text']

In [0]:
def clean_data(text):
  """Lower-case `text`, drop punctuation, tokenize and remove stopwords.

  Steps:
    1. Keep every character that is not in `string.punctuation`, lower-cased.
    2. Split the result on runs of non-word characters.
    3. Drop tokens found in the notebook-level `stopword` list.

  Returns the remaining tokens as a list. Empty strings produced by
  `re.split` at the ends of the text are not stopwords, so they are kept.
  """
  no_punct = "".join([char.lower() for char in text if char not in string.punctuation])
  tokens = re.split(r'\W+', no_punct)  # raw string avoids the invalid '\W' escape warning
  # Use a name other than `clean_data` so the local does not shadow this function.
  kept_tokens = [word for word in tokens if word not in stopword]
  return kept_tokens

In [0]:
df['clean_data'] = df['body_text'].apply(lambda x: clean_data(x))

In [0]:
df.head()

================================================================================================================================================================

### Vectorizing:
Process of converting text into integers.

### Feature Vector:
An n-dimensional vector of numerical features that represents a particular object.

Working: It finds all the unique words in the corpus, and then for each row it counts the number of times each of these unique words appears.

### Different types of Vectorization:
1) Count Vectorization
2) N-grams
3) Term frequency - inverse document frequency

1) Count Vectorization

In [0]:
df = pd.read_csv('SMSSpamCollection.tsv', sep = '\t', header = None)
df.columns = ['label', 'body_text']

In [0]:
df.head()

In [0]:
def cleaning_data(text):
  """Analyzer for the vectorizers: lower-case, strip punctuation, tokenize, drop stopwords, stem.

  Characters in `string.punctuation` are removed, the rest is lower-cased and
  split on runs of non-word characters. Tokens in the notebook-level
  `stopword` list are dropped and the remaining ones are stemmed with the
  Porter stemmer `ps`. Returns the stems as a list.
  """
  no_punct = "".join([char.lower() for char in text if char not in string.punctuation])
  tokens = re.split(r'\W+', no_punct)  # raw string avoids the invalid '\W' escape warning
  clean = [ps.stem(word) for word in tokens if word not in stopword] # We are using the Porter Stemmer to reduce every remaining word to its stem.
  return clean

In [0]:
# Instead of using the 'lambda' functionality, we are using the inbuild feature of CountVectorizer to fit and transform the data.

from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer = cleaning_data) # Here we are calling the above function
x_counts = count_vect.fit_transform(df['body_text']) # The 'fit_transfor' command helps us to fit as well as transfor all the words into its root form for analysis.
print(x_counts.shape)
print(count_vect.get_feature_names()) # To print all the unique words from the main text.

Because the full dataset yields 8107 unique words across 5568 messages (a matrix with 5568 rows and 8107 columns), we create a small sample of 20 records so that the result is easier to inspect.

In [0]:
df_sample = df[0:20]

In [0]:
count_vect_sample = CountVectorizer(analyzer = cleaning_data)
x_counts_sample = count_vect_sample.fit_transform(df_sample['body_text'])
print(x_counts_sample.shape)
print(count_vect_sample.get_feature_names())

Concept of Sparse Matrix

In [0]:
x_counts_sample

Here the sparse matrix stores only the non-zero entries (together with their positions) instead of every cell.

In order to print the full matrix, we convert the above variable into an array and then make it a DataFrame.


In [0]:
x_df = pd.DataFrame(x_counts_sample.toarray())

In [0]:
x_df

In [0]:
# Assign the list of feature names directly: wrapping it in another list makes
# pandas build a one-level MultiIndex instead of a plain column Index.
x_df.columns = count_vect_sample.get_feature_names()
x_df

In [0]:
dataset.head()

In [0]:
def cleaning_data1(text):
  """Clean `text` like `cleaning_data` does, but return one space-joined string.

  The punctuation removal, lower-casing, tokenizing, stopword filtering and
  Porter stemming are delegated to `cleaning_data` (defined earlier in this
  notebook) so the two functions cannot drift apart. The stems are joined
  back into a sentence so the result can be fed to a vectorizer that does
  its own tokenizing.
  """
  return " ".join(cleaning_data(text))

In [0]:
df['clean_data'] = df['body_text'].apply(lambda x: cleaning_data1(x))

In [0]:
df.head()

2) N-Grams

In [0]:
# ngram is a function provided by CountVectorizer

ngram_sample = CountVectorizer(ngram_range=(2,2))
x_counts = ngram_sample.fit_transform(df['clean_data'])
print(x_counts.shape)
print(ngram_sample.get_feature_names())

In [0]:
df_sample = df[0:20]

ngram_sample1 = CountVectorizer(ngram_range=(2,2))
x_counts1 = ngram_sample1.fit_transform(df_sample['clean_data'])
print(x_counts1.shape)
print(ngram_sample1.get_feature_names())

In [0]:
# Converting the ngram matrix into a dataframe.

x_counts1_df = pd.DataFrame(x_counts1.toarray())
x_counts1_df.columns = ngram_sample1.get_feature_names()
x_counts1_df

3) Term frequency - inverse document frequency

Here, each row of the matrix represents one text and each column represents a unique word, but each cell holds the weight of that word in that text, which is calculated by a formula.

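Formula:

$$\text{TF-IDF} = \text{TF} \times \text{IDF}$$

Where:

$$\text{TF} = \frac{\text{number of times the word occurs in the text}}{\text{total number of words in the text}}$$

$$\text{IDF} = \log\left(\frac{\text{total number of documents}}{\text{number of documents with word } t \text{ in it}}\right)$$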



Note: The rarer a word is across the documents, the more weight is assigned to it.

In [0]:
# NOTE(review): this re-declares the `cleaning_data` analyzer already defined in the
# Count Vectorization section with the same logic; consider defining it only once.
def cleaning_data(text):
  """Analyzer for the vectorizers: lower-case, strip punctuation, tokenize, drop stopwords, stem.

  Characters in `string.punctuation` are removed, the rest is lower-cased and
  split on runs of non-word characters. Tokens in the notebook-level
  `stopword` list are dropped and the remaining ones are stemmed with the
  Porter stemmer `ps`. Returns the stems as a list.
  """
  no_punct = "".join([char.lower() for char in text if char not in string.punctuation])
  tokens = re.split(r'\W+', no_punct)  # raw string avoids the invalid '\W' escape warning
  clean = [ps.stem(word) for word in tokens if word not in stopword] # We are using the Porter Stemmer to reduce every remaining word to its stem.
  return clean

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer


tfidf_vector = TfidfVectorizer(analyzer = cleaning_data)
x_tfidf = tfidf_vector.fit_transform(df['body_text'])
print(x_tfidf.shape)
print(tfidf_vector.get_feature_names())

In [0]:

tfidf_vector1 = TfidfVectorizer(analyzer = cleaning_data)
x_tfidf1 = tfidf_vector1.fit_transform(df_sample['body_text'])
print(x_tfidf1.shape)
print(tfidf_vector1.get_feature_names())

In [0]:
x_tfidf_df = pd.DataFrame(x_tfidf.toarray())
x_tfidf_df.columns = tfidf_vector.get_feature_names()
x_tfidf_df

## Feature Engineering

We are trying to find new features that can be added to  the standard analysis

In [0]:
df.head()

The first feature is the length of each text.

The hypothesis here is that spam messages tend to be longer than ham messages.

In [0]:
df['body_text_len'] = df['body_text'].apply(lambda x: len(x) - x.count(" ")) # Here we are calculating the length of all the words and excluding the space characters.
df.head(10)

The second feature is the percentage of punctuation characters in the provided text.

The hypothesis here is that spam messages have a larger percentage of punctuation.

In [0]:
def count_punct(text):
  """Return the percentage of non-space characters in `text` that are punctuation.

  Args:
    text: the string to inspect.

  Returns:
    A float between 0 and 100, rounded to one decimal place. Returns 0.0 when
    `text` is empty or consists only of spaces, where the ratio is undefined.
  """
  non_space_len = len(text) - text.count(" ")
  if non_space_len == 0:
    # Nothing but spaces (or nothing at all): avoid dividing by zero.
    return 0.0
  punct_count = sum(1 for char in text if char in string.punctuation)
  # Round after scaling to a percentage so the result carries no float noise
  # (e.g. 33.3 rather than 33.300000000000004).
  return round(100 * punct_count / non_space_len, 1)

In [0]:
df['punct%'] = df['body_text'].apply(lambda x: count_punct(x))

In [0]:
df.head()

We evaluate the features introduced above by plotting them.

In [0]:
from  matplotlib import pyplot
import numpy as np
%matplotlib inline

In [0]:
#@title Needs to be checked
bins = np.linspace(0,200,40)

# `density = True` normalises each histogram so the spam and ham shapes are comparable
# even though the classes differ in size. It replaces the `normed` keyword, which
# newer matplotlib releases no longer accept.
pyplot.hist(df[df['label'] == 'spam']['body_text_len'], bins, alpha = 0.5, density = True , label = 'spam')
pyplot.hist(df[df['label'] == 'ham']['body_text_len'], bins, alpha = 0.5,  density = True, label = 'ham')
pyplot.legend(loc = 'upper left')
pyplot.show()

In [0]:
#@title Needs to be checked
bins = np.linspace(0, 50, 40)

# `density = True` normalises each histogram so the spam and ham shapes are comparable
# even though the classes differ in size. It replaces the `normed` keyword, which
# newer matplotlib releases no longer accept.
pyplot.hist(df[df['label'] == 'spam']['punct%'], bins, alpha = 0.5, density = True , label = 'spam')
pyplot.hist(df[df['label'] == 'ham']['punct%'], bins, alpha = 0.5,  density = True, label = 'ham')
pyplot.legend(loc = 'upper left')
pyplot.show()

In [0]:
df.head()

From the above analysis:

1) Our first hypothesis - spam body text is longer than ham text - is true.
2) Our second hypothesis - punctuation is more frequent in spam - is false.

In [0]:
# Body Length distribution

bins = np.linspace(0, 200, 40)

pyplot.hist(df['body_text_len'], bins)
pyplot.title('Body Length Distribution', color = 'White')
pyplot.show()

In [0]:
bins = np.linspace(0, 50, 40)

pyplot.hist(df['punct%'], bins)
pyplot.title('Punctuation percent distribution', color = 'White')
pyplot.show()

From the above plots, the punctuation distribution appears to be more skewed than the body length distribution.

Hence we will consider it for the transformation.

# Transformation:

The process of altering each data point in a systematic way so that they can be used for the model.

### Box-Cox Power transformation

The common range of exponents used in this transformation is from -2 to 2.

In [0]:
for i in [1,2,3,4,5]:
  pyplot.hist((df['punct%'])**(1/i), bins = 40)
  pyplot.title('Transformation : 1/{}'.format(str(i)), color = 'White')
  pyplot.show()

The main reason for using a transformation is to make a skewed distribution closer to a normal one.

## Machine Learning classifiers

In [0]:
sample_df = pd.concat([df['body_text_len'], df['punct%'], x_tfidf_df],axis = 1)

In [0]:
sample_df.head()

## Exploring the Random Forest Classifiers

In [0]:
from sklearn.ensemble import RandomForestClassifier

In [0]:
# All the functionalities that are being provided by Random Forest.

print(dir(RandomForestClassifier))

In [0]:
print(RandomForestClassifier())

### Cross Validation

In [0]:
from sklearn.model_selection import KFold, cross_val_score


In [0]:
rf = RandomForestClassifier(n_jobs=-1) # Here we are passing the n_job parameter to -1 in order to increase the speed and to create all the decesion trees parallely.
k_fold = KFold(n_splits=5) # Here we are splitting the whole dataset into 5 parts.
cross_val_score(rf, sample_df, df['label'], cv = k_fold, scoring = 'accuracy', n_jobs=-1)

### RandomForestClassifier through Hold out set

In [0]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [0]:
X_train, X_test, y_train, y_test = train_test_split(sample_df, df['label'], test_size = 0.2)

In [0]:
X_train.shape

In [0]:
rf = RandomForestClassifier(n_estimators =50, max_depth = 20, n_jobs = -1 )
rf_model = rf.fit(X_train, y_train)

In [0]:
sorted(zip(rf_model.feature_importances_, X_train.columns), reverse = True)[0:10]

In [0]:
y_pred = rf_model.predict(X_test)
presicion, recall, fscore, support = score(y_test, y_pred, pos_label = 'spam', average = 'binary')

In [0]:
print('Presicion: {}, Recall: {}, Accuracy: {}'.format(round(presicion, 3),
                                                       round(recall, 3),
                                                       round((y_pred == y_test).sum()/ len(y_pred), 3) ))

Based on the above figures:

Precision is 100% - Every mail that the model identified as spam was actually spam.

Recall 53.7% - Only about 54% of the actual spam mails were correctly placed in the spam folder, while the remaining 46% went to the inbox.

Accuracy 94.3% - The model classified about 94% of all mails (spam and ham) correctly.

===================================================================


### Random Forest model with grid search

In [0]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(sample_df, df['label'], test_size = 0.2)

In [0]:
def train_RF(n_est, depth):
  """Fit a random forest with the given settings and print its hold-out metrics.

  Uses the notebook-level `X_train`, `y_train`, `X_test` and `y_test` split.

  Args:
    n_est: passed to RandomForestClassifier as `n_estimators`.
    depth: passed to RandomForestClassifier as `max_depth`.

  Prints precision and recall for the 'spam' label plus the overall accuracy,
  each rounded to three decimals. Returns nothing.
  """
  forest = RandomForestClassifier(n_estimators=n_est, max_depth=depth, n_jobs=-1)
  fitted_forest = forest.fit(X_train, y_train)
  predictions = fitted_forest.predict(X_test)

  spam_precision, spam_recall, _fscore, _support = score(
      y_test, predictions, pos_label='spam', average='binary')

  # Accuracy: share of hold-out rows whose predicted label equals the true label.
  accuracy = (y_test == predictions).sum() / len(predictions)

  summary = 'Est: {}, Depth: {} ------- Prescision: {} / Recall: {} / Accuracy: {}'
  print(summary.format(n_est, depth,
                       round(spam_precision, 3),
                       round(spam_recall, 3),
                       round(accuracy, 3)))

In [0]:
for n_est in [10, 50, 100]:
  for depth in [10, 20, 30, None]:
    train_RF(n_est, depth)

### Random Forest Classifier with Grid Search CV

### Grid Search:
Search all the possible parameters in a given grid to check the best model.

### Cross-Validation:
Divide the data into different subsets and repeat the holdout method.

In [0]:
from sklearn.model_selection import GridSearchCV

In [0]:
rf = RandomForestClassifier()
param = { 'n_estimators': [10,150,300],
         'max_depth': [30, 60, 90, None]}

gs = GridSearchCV(rf, param, cv = 5, n_jobs=-1)
gs_fit = gs.fit(sample_df, df['label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending = False)[0:5]

## Gradient Boosting

Gradient boosting is an ensemble learning method that takes an iterative approach. It builds a strong model by focusing on the mistakes of the prior iterations.

In this method, the models cannot be trained in parallel, unlike in Random Forest.
Here each model depends on the output of the previous one, in order to increase the weight of the examples it got wrong.

Though gradient-boosted models are harder to train and can easily overfit, they are very strong models.

In [0]:
from sklearn.ensemble import GradientBoostingClassifier

In [0]:
print(dir(GradientBoostingClassifier))

In [0]:
print(GradientBoostingClassifier())