## Loading the standard libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Loading the data

In [2]:
# Read the SMS spam/ham corpus from the working directory and preview it.
data = pd.read_csv(filepath_or_buffer='spam.csv')
data.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(5572, 2)

In [4]:
# Put the feature column first and the target column last.
data = data.loc[:, ['Message', 'Category']]
data.head(5)

Unnamed: 0,Message,Category
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


## Converting the Category to numerical

In [5]:
# Encode the target as integers: ham -> 0, spam -> 1.
cat_labels = {'ham' : 0, 'spam' : 1}
# Cast explicitly instead of relying on `replace` to silently downcast the
# object column to integers: that implicit downcast is deprecated since
# pandas 2.2. The cast also fails loudly if any label other than
# 'ham'/'spam' (or a missing value) is left in the column.
data = data.replace({'Category' : cat_labels}).astype({'Category' : 'int64'})
data.head()

Unnamed: 0,Message,Category
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


## In order to apply Machine Learning Classification algorithms on this data, we see that Message feature is not numerical.

### So the task is to convert the Message column into a numerical column.

### We cannot apply Label Encoding or One Hot Encoding to the Message feature, because it is a free-text feature and not a categorical feature.

- Note: Label Encoding and One Hot Encoding are applied only to categorical features.

#### Then How to perform preprocessing of the Message Feature? - CountVectorizer

# Example for CountVectorizer

In [6]:
# Toy corpus of four short sentences used to demonstrate CountVectorizer.
text = [
    'It was the best of times',
    'The Times of India',
    'India is an incredible nation',
    'I am living in India',
]
text

['It was the best of times',
 'The Times of India',
 'India is an incredible nation',
 'I am living in India']

In [7]:
# Vectorizer with default settings, used for the toy corpus `text`.
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()

In [8]:
## Learn the vocabulary from the example sentences (no encoding happens yet)
vec.fit(text)

CountVectorizer()

In [9]:
# Learned mapping from each token to its column index in the count matrix.
vec.vocabulary_

{'it': 7,
 'was': 13,
 'the': 11,
 'best': 2,
 'of': 10,
 'times': 12,
 'india': 5,
 'is': 6,
 'an': 1,
 'incredible': 4,
 'nation': 9,
 'am': 0,
 'living': 8,
 'in': 3}

In [10]:
# Iterating the dict yields its keys, so this lists the tokens alphabetically.
sorted(vec.vocabulary_)

['am',
 'an',
 'best',
 'in',
 'incredible',
 'india',
 'is',
 'it',
 'living',
 'nation',
 'of',
 'the',
 'times',
 'was']

In [11]:
## Encode each sentence as a row of token counts using the fitted vocabulary;
## the result is a sparse matrix with one row per sentence and one column per token.

vec.transform(text)

<4x14 sparse matrix of type '<class 'numpy.int64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [12]:
# Densify the sparse count matrix so the individual counts can be inspected.
vec.transform(text).toarray()

array([[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0],
       [0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0],
       [1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0]], dtype=int64)

In [13]:
# Label the count matrix with the token each column stands for.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` and fall back only on older installations.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()
op = pd.DataFrame(vec.transform(text).toarray(), columns = feature_names)
op

Unnamed: 0,am,an,best,in,incredible,india,is,it,living,nation,of,the,times,was
0,0,0,1,0,0,0,0,1,0,0,1,1,1,1
1,0,0,0,0,0,1,0,0,0,0,1,1,1,0
2,0,1,0,0,1,1,1,0,0,1,0,0,0,0
3,1,0,0,1,0,1,0,0,1,0,0,0,0,0


In [14]:
## Encode a new, unseen document with the already-fitted vectorizer.
## Only tokens present in the fitted vocabulary are counted; any other
## token in the new document is ignored.
text2 = ['Indian is a great nation']
vec2 = vec.transform(text2)
vec2

<1x14 sparse matrix of type '<class 'numpy.int64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [15]:
# Dense view of the single encoded row.
vec2.toarray()

array([[0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0]], dtype=int64)

In [16]:
# Label the encoded new document with the token each column stands for.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` and fall back only on older installations.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()
op2 = pd.DataFrame(vec2.toarray(), columns = feature_names)
op2

Unnamed: 0,am,an,best,in,incredible,india,is,it,living,nation,of,the,times,was
0,0,0,0,0,0,0,1,0,0,1,0,0,0,0


# ======================================

## Spam Ham Classification

## Separate X and y

In [17]:
# Feature: raw message text. Target: numeric category label.
X, y = data['Message'], data['Category']

## Divide the data into train test split

In [18]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the messages for testing. The seed is fixed so that the
# split, and every score computed from it, is identical on each re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [19]:
# Sizes of the training and test partitions.
X_train.shape, X_test.shape

((3900,), (1672,))

## Apply CountVectorizer to the Message feature in both train and test data

In [20]:
# Separate vectorizer instance for the spam/ham messages, so the toy-example
# vectorizer `vec` is left untouched.
from sklearn.feature_extraction.text import CountVectorizer
ct_vec = CountVectorizer()

In [21]:
# Learn the vocabulary from the training messages only and encode them as a
# sparse count matrix in one step.
X_train_ct_vec = ct_vec.fit_transform(X_train)
X_train_ct_vec

<3900x7053 sparse matrix of type '<class 'numpy.int64'>'
	with 51668 stored elements in Compressed Sparse Row format>

In [22]:
# Encode the test messages with the vocabulary learned from the training set
# (transform only; the vectorizer is not refitted on test data).
X_test_ct_vec = ct_vec.transform(X_test)
X_test_ct_vec

<1672x7053 sparse matrix of type '<class 'numpy.int64'>'
	with 20578 stored elements in Compressed Sparse Row format>

## Apply Logistic Regression on X_train_ct_vec and y_train

In [23]:
# Logistic regression classifier with scikit-learn's default settings.
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()

In [24]:
# Train the classifier on the count-vectorized training messages.
log_reg.fit(X_train_ct_vec, y_train)

LogisticRegression()

In [38]:
# Two hand-picked messages for a quick sanity check of the fitted model.
# Each is wrapped in a list because the vectorizer expects an iterable of documents.
email_ham = [
    'Even my brother is not like to speak with me. They treat me like aids patent.'   ## ham
    ]
email_spam = ['England v Macedonia - dont miss the goals/team news. Txt ur national team to 87077 eg ENGLAND to 87077 Try:WALES, SCOTLAND 4txt/Ãº1.20 POBOXox36504W45WQ 16+']   ## spam

In [39]:
# Vectorize the ham example with the fitted vectorizer, then classify it
# (labels follow cat_labels: 0 = ham, 1 = spam).
log_reg.predict(ct_vec.transform(email_ham))

array([0], dtype=int64)

In [40]:
# Vectorize the spam example with the fitted vectorizer, then classify it
# (labels follow cat_labels: 0 = ham, 1 = spam).
log_reg.predict(ct_vec.transform(email_spam))

array([1], dtype=int64)

## Prediction on the test data

In [28]:
# Predicted label for every message in the held-out test set.
y_pred = log_reg.predict(X_test_ct_vec)
y_pred

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

## Evaluation step

In [29]:
from sklearn.metrics import accuracy_score
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy happens
# to be symmetric, but keeping the documented order avoids wrong results if
# this call is later swapped for an asymmetric metric such as precision/recall.
accuracy_score(y_test, y_pred)

0.9796650717703349

## Pipeline Approach

#### Define the steps

In [41]:
# (name, estimator) pairs for the pipeline. Fresh, unfitted estimators are
# used instead of the already-fitted `ct_vec` / `log_reg`: Pipeline.fit fits
# its steps in place, so reusing those objects would silently refit and
# overwrite the models trained in the manual workflow above.
steps = [('ct_vectorizer', CountVectorizer()), ('logisticregression', LogisticRegression())]

#### Create a pipeline from steps

In [42]:
# Chain the named steps into a single estimator that accepts raw text.
from sklearn.pipeline import Pipeline
pipe = Pipeline(steps)

### Fit the pipeline on data

In [43]:
# Fit every step in order on the raw training messages: the vectorizer is
# fitted and applied first, then the classifier is trained on its output.
pipe.fit(X_train, y_train)

Pipeline(steps=[('ct_vectorizer', CountVectorizer()),
                ('logisticregression', LogisticRegression())])

In [44]:
# Accuracy on the raw test messages; the pipeline vectorizes them internally.
pipe.score(X_test, y_test)

0.9796650717703349

In [45]:
# The pipeline takes raw text directly; no manual transform step is needed.
pipe.predict(email_ham)

array([0], dtype=int64)

In [46]:
# The pipeline takes raw text directly; no manual transform step is needed.
pipe.predict(email_spam)

array([1], dtype=int64)

## Shortcut method using make_pipeline

In [47]:
# make_pipeline builds the same kind of Pipeline without explicit step names;
# each step is named automatically after its lowercased class name.
from sklearn.pipeline import make_pipeline
mk_pipe = make_pipeline(CountVectorizer(), LogisticRegression())
mk_pipe.fit(X_train, y_train)

Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('logisticregression', LogisticRegression())])

In [48]:
# Accuracy of the make_pipeline model on the raw test messages.
mk_pipe.score(X_test, y_test)

0.9796650717703349

### How to use make pipeline

In [49]:
## Candidate steps for a longer pipeline:
## 1. SimpleImputer
## 2. StandardScaler
## 3. LabelEncoder
## 4. CountVectorizer
## 5. LogisticRegression

In [50]:
# Placeholder only: five empty tuples standing in for the five (name, estimator)
# pairs a hand-written Pipeline would need. Note that this rebinds `steps`.
steps = [(), (), (), (), ()]

In [52]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [53]:
from sklearn.pipeline import make_pipeline
# A longer pipeline that can actually be fitted on the raw messages:
#   * CountVectorizer comes first, because the imputer and scaler need numeric
#     input and cannot consume raw text.
#   * SimpleImputer is kept for illustration; count matrices contain no
#     missing values, so it passes the data through unchanged.
#   * StandardScaler uses with_mean=False because centering is not supported
#     on the sparse matrix produced by CountVectorizer.
#   * LabelEncoder is left out: it encodes target labels (its fit takes only
#     y) and is not a feature transformer, so it cannot be a pipeline step.
#     The target was already encoded via `cat_labels`.
pipe = make_pipeline(CountVectorizer(), SimpleImputer(), StandardScaler(with_mean = False), LogisticRegression())
# fit() requires the training data; calling it with no arguments raises a TypeError.
pipe.fit(X_train, y_train)