# Spam detector

Supervised Learning. Binary classification

Goal: Predict the probability that a given email is a spam email

Data from [Applied Text Mining in Python | Coursera](https://www.coursera.org/learn/python-text-mining/)

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import helper

# NOTE(review): seeding is disabled here; presumably helper.reproducible fixes the random
# seeds of the libraries in use — confirm, and re-enable it if reproducible runs are needed.
# helper.reproducible(seed=9)
sns.set()  # apply seaborn's default plot styling

## 1. Data Processing and Exploratory Data Analysis

In [2]:
# Location of the raw dataset and the name(s) of the column to predict
data_path = 'data/spam.csv'
target = ['target']

# Load the raw data and report its dimensions together with the target column(s)
df_original = pd.read_csv(data_path)
n_rows, n_cols = df_original.shape
print("{} rows \n{} columns \ntarget: {}".format(n_rows, n_cols, target))

# Preview the first rows
df_original.head(3)

5572 rows 
2 columns 
target: ['target']


Unnamed: 0,text,target
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam


### Explore and Clean the target

In [3]:
# Class distribution of the target; missing values (if any) are counted as their own class
target_counts = df_original[target].squeeze().value_counts(dropna=False)
print(target_counts)

ham     4825
spam     747
Name: target, dtype: int64


In [4]:
# Encode the target as integers: 'ham' -> 0, 'spam' -> 1.
# The dtype guard keeps this cell idempotent: without it, running the cell a second time
# would compare the already-encoded 0/1 values against 'spam' and silently overwrite
# every label with 0.
if not pd.api.types.is_numeric_dtype(df_original['target']):
    df_original['target'] = np.where(df_original['target'] == 'spam', 1, 0)

print('Ratio of email spam: {:.3f}'.format(np.mean(df_original['target'])))

Ratio of email spam: 0.134


### Split original data into training and test sets

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. Stratifying on the target keeps the class
# proportions equal in both subsets; the fixed random_state makes the split repeatable.
df, df_test = train_test_split(
    df_original, test_size=0.2, stratify=df_original[target], random_state=0)


### Show training data

In [6]:
# Preview the first three rows of the training split
df.head(3)

Unnamed: 0,text,target
1257,Am also doing in cbe only. But have to pay.,0
5461,Ok i thk i got it. Then u wan me 2 come now or...,0
1612,RT-KIng Pro Video Club>> Need help? info@ringt...,1


#### Non-numerical Data

In [7]:
# Summary of the object-dtype columns: count, number of unique values,
# most frequent value and its frequency
df.describe(include=['O'])

Unnamed: 0,text
count,4457
unique,4182
top,"Sorry, I'll call later"
freq,24


#### Missing values

In [8]:
# NOTE(review): presumably reports the columns whose fraction of missing values exceeds
# `limit` (0.4 here) and returns them — confirm against helper.missing
high_missing = helper.missing(df, limit=0.4)

No missing values found


### Transform data

#### Enhance and add new features

#### Classify features
Convert categorical variables to dtype 'category' and sort the columns: numerical + categorical + target

In [9]:
# Names of the columns that currently have a numeric dtype
num = list(df.select_dtypes(include=[np.number]))

# NOTE(review): helper.classify_data presumably casts the non-numerical features to
# 'category' and reorders the columns (numerical, categorical, target) — confirm in helper
df = helper.classify_data(df, target, numerical=num)

# One-row table ("Type") listing the dtype of every column, in column order
pd.DataFrame(dict(df.dtypes), index=["Type"])[df.columns].head() # fancy-show data types

n numerical:   0
n categorical: 1


Unnamed: 0,text,target
Type,category,float32


In [11]:
# Checkpoint: keep an independent copy of the processed frame, then release the working name
copy_df = df.copy()
del df

## 2. Neural Network model

### Select the features

In [13]:
df = copy_df.copy() # Restore checkpoint

# Work on a separate copy so that the restored frame is left untouched
data = df.copy()
data.head(3)

Unnamed: 0,text,target
1257,Am also doing in cbe only. But have to pay.,0.0
5461,Ok i thk i got it. Then u wan me 2 come now or...,0.0
1612,RT-KIng Pro Video Club>> Need help? info@ringt...,1.0


### Split the data into training and validation sets

In [14]:
def validation_split(data, val_size=0.2, random_state=0, target_cols=None):
    """Split a DataFrame into training and validation arrays.

    The rows are shuffled and split with stratification on the target column(s),
    so both subsets keep the same class proportions.

    Args:
        data: DataFrame holding the feature columns and the target column(s).
        val_size: Size of the validation set; forwarded to ``train_test_split``
            as ``test_size`` (default 0.2).
        random_state: Seed used for the shuffle (default 0, the value that was
            previously hard-coded).
        target_cols: List with the name(s) of the target column(s). When omitted,
            the notebook-level ``target`` list is used.

    Returns:
        Tuple ``(x_train, y_train, x_val, y_val)`` of NumPy arrays
        (x = features, y = target).
    """
    if target_cols is None:
        target_cols = target  # fall back to the notebook-level target definition

    train, val = train_test_split(
        data, test_size=val_size, random_state=random_state, shuffle=True,
        stratify=data[target_cols])

    # Separate the data into features and target (x=features, y=target)
    x_train, y_train = train.drop(target_cols, axis=1).values, train[target_cols].values
    x_val, y_val = val.drop(target_cols, axis=1).values, val[target_cols].values

    return x_train, y_train, x_val, y_val


# Stratified split of the training data: 20% of the rows go to the validation set
x_train, y_train, x_val, y_val = validation_split(data, val_size=0.2)

### One-hot encode the output

In [16]:
import keras


def one_hot_output(y_train, y_val):
    """One-hot encode the training and validation targets.

    The number of classes is counted over the labels of BOTH sets, so a class that
    appears only in the validation labels still gets its own column. Counting the
    training labels alone would make ``to_categorical`` fail on such a label.

    Args:
        y_train: Array of class labels for the training set; ``to_categorical``
            expects them to be integers in the range 0 .. num_classes - 1.
        y_val: Array of class labels for the validation set (same encoding).

    Returns:
        Tuple ``(y_train, y_val)`` of one-hot encoded arrays, one column per class.
    """
    # Flatten first so that (n, 1) column vectors and plain (n,) vectors are both accepted
    all_labels = np.concatenate([np.ravel(y_train), np.ravel(y_val)])
    num_classes = len(np.unique(all_labels))
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_val = keras.utils.to_categorical(y_val, num_classes)
    return y_train, y_val


# Overwrite the label arrays with their one-hot encoded versions.
# NOTE: not idempotent — re-running this line would feed the already encoded arrays
# back into one_hot_output; re-run the split cell first.
y_train, y_val = one_hot_output(y_train, y_val)

Using TensorFlow backend.


### Imbalanced target

In [None]:
# Optional (currently disabled): class weights to compensate for the imbalanced target.
# from sklearn.utils import class_weight

# y_plain = np.ravel(y_train[:,1])

# cw = class_weight.compute_class_weight('balanced', classes=np.unique(y_plain), y=y_plain)

# cw = {idx : value for idx, value in enumerate(cw)}

In [17]:
# Sanity check: shapes of the feature and target arrays after the split and the encoding
train_shapes = (x_train.shape, y_train.shape)
val_shapes = (x_val.shape, y_val.shape)
print("train size \t X:{} \t Y:{}".format(*train_shapes))
print("val size \t X:{} \t Y:{}".format(*val_shapes))

train size 	 X:(3565, 1) 	 Y:(3565, 2)
val size 	 X:(892, 1) 	 Y:(892, 2)
