# import required packages

In [42]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the legacy sklearn.cross_validation module was removed in 0.20.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# read the input dataset

In [46]:
# Read the raw file (decoded as latin-1) and discard the three unnamed
# columns in one chained expression.
csv_dataset = (
    pd.read_csv('../data/spam.csv', encoding='latin-1')
    .drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
)

# The two remaining columns are the label and the message text.
csv_dataset.columns = ['class', 'data']

def table(df, col):
    """Return per-group counts for `df` grouped by `col`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    col : str or list of str
        Column name(s) to group by.

    Returns
    -------
    pandas.DataFrame
        Indexed by the distinct values of `col`; each remaining column holds
        the number of non-null entries in that group.
    """
    grouped = df.groupby(col)
    return grouped.count()

### Imbalanced class proportion observed in distribution of response variable

In [8]:
# Number of messages per label.
table(csv_dataset, col="class")

Unnamed: 0_level_0,data
class,Unnamed: 1_level_1
ham,4825
spam,747


In [9]:
# Plain-text preview of the first five rows.
print(csv_dataset.head(n=5))

  class                                               data
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [10]:
# converting responses to int
# Translate the labels through a lookup table instead of writing ints into the
# string column, so 'class' ends up with an integer dtype.
# The 0/1 keys make a re-run of this cell a no-op rather than turning the
# already-encoded labels into NaN.
LABEL_TO_INT = {'ham': 0, 'spam': 1, 0: 0, 1: 1}
encoded_class = csv_dataset['class'].map(LABEL_TO_INT)

# Anything outside the lookup table maps to NaN; fail loudly instead of
# silently carrying unknown labels into the split.
if encoded_class.isna().any():
    unknown_labels = sorted(set(csv_dataset.loc[encoded_class.isna(), 'class'].astype(str)))
    raise ValueError("unexpected label(s) in 'class' column: " + repr(unknown_labels))

csv_dataset['class'] = encoded_class.astype(int)

print(csv_dataset.head())

  class                                               data
0     0  Go until jurong point, crazy.. Available only ...
1     0                      Ok lar... Joking wif u oni...
2     1  Free entry in 2 a wkly comp to win FA Cup fina...
3     0  U dun say so early hor... U c already then say...
4     0  Nah I don't think he goes to usf, he lives aro...


In [11]:
## The "SPAM" class has far fewer examples than the "HAM" class, so the split below is stratified on the label.
## separating dependent and independent variables
# Read the label column rather than pop()-ing it: pop() removes 'class' from
# csv_dataset, so running this cell a second time raised KeyError.
y_class = csv_dataset['class']
X_data  = csv_dataset['data'].str.strip()

## splitting dataset
# A fixed seed makes the 70/30 split, and every metric computed from it,
# reproducible across kernel restarts.
RANDOM_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    X_data, y_class, test_size=0.3, stratify=y_class, random_state=RANDOM_SEED
)

In [12]:
separator = "======================="

# Class counts of the training labels, then of the test labels.
print(separator)
for split_labels in (y_train, y_test):
    print(split_labels.groupby(split_labels).count())
print(separator)

# Type and shape of the training messages, before and after to_frame().
print(type(x_train).__name__)
print(x_train.shape)
train_as_frame = x_train.to_frame()
print(type(train_as_frame).__name__)
print(train_as_frame.shape)

# Keep handles on the four splits under separate names.
x_train_1, x_test_1 = x_train, x_test
y_train_1, y_test_1 = y_train, y_test



class
0    3377
1     523
Name: class, dtype: int64
class
0    1448
1     224
Name: class, dtype: int64
Series
(3900,)
DataFrame
(3900, 1)


In [13]:
# Wrap every split in a one-column DataFrame. pd.DataFrame(...) accepts a
# Series as well as a DataFrame, so this cell can be re-run safely; the
# previous `x = x.to_frame()` form raised AttributeError on a second run
# because the names then already held DataFrames.
x_train = pd.DataFrame(x_train)
x_test = pd.DataFrame(x_test)
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)


# Join on the index so each message sits next to its label.
train = x_train.join(y_train)
test = x_test.join(y_test)

# One row per class, with all training messages of that class joined by '.'.
x_train_merged_doc = train.groupby('class')['data'].apply('.'.join).reset_index()


In [14]:
# Dimensions of the training frame as (rows, columns).
x_train.shape

(3900, 1)


# Null accuracy : accuracy by always predicting the most frequent class #

In [15]:
# Class counts in the test labels.
test_labels = y_test["class"]
print(test_labels.value_counts())

#calculate null accuracy for binary classifier
# With 0/1 labels the mean is the share of class 1; the null accuracy is the
# larger of that share and its complement.
positive_share = test_labels.mean()
print("\nNull Accuracy : " + str(max(positive_share, 1 - positive_share)))

0    1448
1     224
Name: class, dtype: int64

Null Accuracy : 0.866028708134


In [16]:
## creating TFIDF features
# Unigrams and bigrams, sublinear tf scaling, English stop words removed, and
# terms that appear in more than half of the training documents ignored.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True, stop_words='english', max_df=0.5)

# Fit the vocabulary on the training messages only, then reuse it for the
# test messages.
features_train_transformed = vectorizer.fit_transform(raw_documents=x_train['data'])
# The `copy` keyword of TfidfVectorizer.transform was deprecated in
# scikit-learn 0.22 and removed in 0.24, so it is no longer passed.
features_test_transformed = vectorizer.transform(raw_documents=x_test['data'])

print('=======================')
print(features_train_transformed.shape)
print('=======================')
print(features_test_transformed.shape)


(3900, 28303)
(1672, 28303)



## Using the classic Naive Bayes classifier which is a proven classification technique for SPAM / HAM problem


In [43]:
# alpha is MultinomialNB's additive-smoothing parameter; at 1e-10 smoothing is
# effectively switched off. fit_prior=True learns the class priors from the data.
mnb = MultinomialNB(alpha=1e-10, fit_prior=True)

# Labels are rendered as the strings '0' / '1' for both fitting and scoring.
mnb.fit(X=features_train_transformed, y=np.char.mod('%d', y_train['class'].values))
pred_y = mnb.predict(features_test_transformed)
actual_y = np.char.mod('%d', y_test['class'].values)

# Pin the row/column order explicitly. Without `labels`, the matrix only
# contains the classes that actually occur, so a missing class would shrink
# it and the positional lookups below would fail or point at the wrong cell.
cf_matrix = metrics.confusion_matrix(actual_y, pred_y, labels=['0', '1'])

# Rows are actual classes, columns are predicted classes; '1' is the positive class.
# Row-major order of the 2x2 matrix is [0,0], [0,1], [1,0], [1,1].
TN, FP, FN, TP = cf_matrix.ravel()

1
True
1
True
1
True
1
True
0
True
0
True
0
True
0
True


### Classification Accuracy

In [24]:
#metrics.accuracy_score(actual_y, pred_y)
# Share of all test predictions that were correct, as a percentage.
n_predictions = float(TP + TN + FP + FN)
print("Classification accuracy : " + str(((TP + TN) / n_predictions) * 100) + "%")

accuracy : 98.1459330144%


### Classification Error

In [25]:
# 1-metrics.accuracy_score(actual_y, pred_y)
# Share of all test predictions that were wrong, as a percentage.
n_predictions = float(TP + TN + FP + FN)
print("Classification Error : " + str(((FP + FN) / n_predictions) * 100) + "%")

Classification Error : 1.85406698565%


### Sensitivity : When actual value is +ve, how often is the prediction correct

In [33]:
#print(metrics.recall_score(actual_y, pred_y))
# Correctly predicted positives over all actual positives, as a percentage.
actual_positives = float(TP + FN)
print("Sensitivity : " + str((TP / actual_positives) * 100) + "%")

Sensitivity : 91.9642857143%


### Specificity : When actual value is -ve, how often is the prediction correct

In [36]:
# Correctly predicted negatives over all actual negatives, as a fraction.
actual_negatives = float(TN + FP)
print("Specificity : " + str(TN / actual_negatives))

Specificity : 0.991022099448


#### > The model is very likely to predict the -ve instance correctly compared to +ve instances.

#### > Hence, we can define our classifier as highly Specific but not highly Sensitive

#### > The primary reason is the imbalanced distribution of the response variable. We suspect that the class priors computed in the training phase have high variance from sample to sample, which would account for the low Sensitivity.



### Precision : When the prediction is +ve, how often is it correct i.e. how trustworthy the classifier's +ve predictions are

In [41]:
#print("Precision : " + str(metrics.precision_score(actual_y, pred_y)))
# Correctly predicted positives over everything predicted positive, as a fraction.
predicted_positives = float(TP + FP)
print("Precision : " + str(TP / predicted_positives))


Precision : 0.940639269406


### Recall : Ability to find +ve examples i.e. how good the classifier is in finding the +ve examples

In [44]:
# float() forces true division, matching the other metric cells; without it
# the integer counts would truncate to 0 under Python 2 division rules.
print("Recall : " + str(TP/float(TP+FN)))

Recall : 0.919642857143


## balanced F-measure (F1) : harmonic mean of the precision and recall

In [45]:
# Both ratios use float() so the division is a true division regardless of the
# Python version; `recall` previously divided two integer counts directly.
precision = TP/float(TP+FP)
recall = TP/float(TP+FN)

# F1 is the harmonic mean of precision and recall.
F1 = 2*precision*recall/(precision + recall)

print("F1 score : " + str(F1))

F1 score : 0.930022573363
