#  Spam Classification in SciKit-Learn
data from https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
Data processing was inspired by https://www.kaggle.com/overflow012/d/uciml/sms-spam-collection-dataset/text-preprocessing-classification

upgrade SciKit-Learn to 0.19.1.

In [1]:
! pip install -U scikit-learn

Exception:
Traceback (most recent call last):
  File "C:\Users\GSX\Anaconda3\lib\site-packages\pip\basecommand.py", line 215, in main
    status = self.run(options, args)
  File "C:\Users\GSX\Anaconda3\lib\site-packages\pip\commands\install.py", line 335, in run
    wb.build(autobuilding=True)
  File "C:\Users\GSX\Anaconda3\lib\site-packages\pip\wheel.py", line 749, in build
    self.requirement_set.prepare_files(self.finder)
  File "C:\Users\GSX\Anaconda3\lib\site-packages\pip\req\req_set.py", line 380, in prepare_files
    ignore_dependencies=self.ignore_dependencies))
  File "C:\Users\GSX\Anaconda3\lib\site-packages\pip\req\req_set.py", line 487, in _prepare_file
    req_to_install, finder)
  File "C:\Users\GSX\Anaconda3\lib\site-packages\pip\req\req_set.py", line 428, in _check_skip_installed
    req_to_install, upgrade_allowed)
  File "C:\Users\GSX\Anaconda3\lib\site-packages\pip\index.py", line 465, in find_requirement
    all_candidates = self.find_all_candidates(req.name)
  Fil

In [2]:
import pandas as pd

####
#  Return the k most frequently appearing keywords in the dataframe
def top_k(data_df, vec, k):
    X = vec.fit_transform(data_df['sms'].values)
    labels = vec.get_feature_names()
    
    return pd.DataFrame(columns = labels, data = X.toarray()).sum().sort_values(ascending = False)[:k]



sms_df = pd.read_csv('spam.csv', encoding='latin-1')
sms_df.columns = ['class', 'sms', 'a', 'b', 'c']


In [3]:
## Data wrangling / cleaning
sms_df2=sms_df.drop(labels=['a','b','c'],axis=1)
sms_df=sms_df2.apply(lambda x: x.astype(str).str.lower())


## Results

In [4]:
sms_df

Unnamed: 0,class,sms
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."
5,spam,freemsg hey there darling it's been 3 week's n...
6,ham,even my brother is not like to speak with me. ...
7,ham,as per your request 'melle melle (oru minnamin...
8,spam,winner!! as a valued network customer you have...
9,spam,had your mobile 11 months or more? u r entitle...


In [5]:
sms_df.groupby('class').describe()

Unnamed: 0_level_0,sms,sms,sms,sms
Unnamed: 0_level_1,count,unique,top,freq
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4515,"sorry, i'll call later",30
spam,747,653,please call our customer service representativ...,4


## Vectorizing the Text

In [6]:
## Generate feature vectors
from sklearn.feature_extraction.text import CountVectorizer
my_vec=CountVectorizer(decode_error = 'ignore', stop_words = 'english')
X=my_vec.fit_transform(sms_df['sms'].values)



## most frequent terms in spam

In [7]:
top_spam = top_k(sms_df[sms_df['class'] == 'spam'], my_vec, 30)

top_spam

free          224
txt           163
ur            144
mobile        127
text          125
stop          121
claim         113
reply         104
www            98
prize          93
just           78
cash           76
won            76
uk             74
150p           71
send           70
new            69
nokia          67
win            64
urgent         63
tone           60
week           60
50             57
contact        56
service        56
msg            54
com            54
18             51
16             51
guaranteed     50
dtype: int64

## Vs ham...

In [8]:
top_ham = top_k(sms_df[sms_df['class'] == 'ham'], my_vec, 30)

top_ham

gt       318
lt       316
just     293
ok       287
ll       265
ur       241
know     236
good     233
got      232
like     232
come     227
day      209
time     201
love     199
going    169
home     165
want     164
lor      162
need     158
sorry    157
don      151
da       150
today    139
later    135
dont     132
did      129
send     129
think    128
pls      123
hi       122
dtype: int64

## Regularize URLs and Numbers

Import _regularize_ here, and use *regularize_urls* and *regularize_numbers*
on the columns.

In [9]:
# Regularize/tokenize URLs and numbers

import regularize
from regularize import regularize_urls
from regularize import regularize_numbers
sms_df['sms']=regularize_numbers(regularize_urls(sms_df['sms']))


In [10]:
# Top-30 spam terms
my_vec=CountVectorizer(decode_error = 'ignore', stop_words = 'english')
X=my_vec.fit_transform(sms_df['sms'].values)
top_spam = top_k(sms_df[sms_df['class'] == 'spam'], my_vec, 30)

top_spam


_num_         3289
free           228
txt            165
ur             144
_url_          141
mobile         129
stop           126
text           125
claim          113
reply          104
prize           92
just            78
won             76
cash            76
nokia           71
send            70
win             70
new             69
urgent          63
week            60
tone            59
box             57
msg             56
service         56
contact         56
guaranteed      50
ppm             49
customer        49
mins            47
phone           46
dtype: int64

In [11]:
# Vector of 'important' words
from collections import OrderedDict
from itertools import repeat
top_words=top_ham.index.tolist()+top_spam.index.tolist()
top_words = list(OrderedDict(zip(top_words, repeat(None))))
relevant_vec=CountVectorizer(decode_error = 'ignore', vocabulary = top_words, stop_words = 'english')

In [12]:
import sklearn.model_selection as ms
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np

# X is the feature array, based off relevant words
X = relevant_vec.fit_transform(sms_df['sms'].values).toarray()

# Compute the length of each sms message, normalized
# by max length
Xlen = np.zeros((X.shape[0],1))
inx = 0
for v in sms_df['sms'].values:
        Xlen[inx,0] = len(v)
        inx += 1
Xlen = Xlen / max(Xlen)
# Add the length as another feature
X = np.hstack((X, Xlen))

y = np.array((sms_df['class'] == 'spam').astype(int))

# Now we split...
X_train, X_test, y_train, y_test = ms.train_test_split(X, 
                                                    y, test_size=0.2, random_state=42)

X_train=X_train.astype(np.float32)
X_test=X_test.astype(np.float32)
X_train

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.09110867],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.16684961],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.05049396],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.04939627],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.02854007],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.03841932]], dtype=float32)

## Classifier Evaluation

In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import sklearn.model_selection as ms
from sklearn.linear_model import LogisticRegression
import numpy as np

# Results, as a list of dictionaries
classifier_results = []

In [14]:
## Sample depth-2 decision tree
max_depth=5
for i in range(1,max_depth+1):
    dt_model = DecisionTreeClassifier(max_depth=i)
    dt_model.fit(X_train, y_train)
    y_pred_test = dt_model.predict(X_test)
    test_score = dt_model.score(X_test, y_test)
    classifier_results.append({'Classifier': 'DecTree', 'Depth': i, 'Score': test_score})

# Code for creating and testing classifiers
#L1 penalty logistic
LogReg_L1 = LogisticRegression(penalty='l1', random_state=42,solver='liblinear')
LogReg_L1.fit(X_train, y_train)
y_pred_test = LogReg_L1.predict(X_test)
test_score = LogReg_L1.score(X_test, y_test)
classifier_results.append({'Classifier': 'LogReg-L1', 'Score': test_score})

#L2 penalty logistic
LogReg_L2 = LogisticRegression(penalty='l2', random_state=42, solver='liblinear')
LogReg_L2.fit(X_train, y_train)
y_pred_test = LogReg_L2.predict(X_test)
test_score = LogReg_L2.score(X_test, y_test)
classifier_results.append({'Classifier': 'LogReg-L2', 'Score': test_score})

#SVM
SVC=SVC(random_state=42)
SVC.fit(X_train, y_train)
y_pred_test = SVC.predict(X_test)
test_score = SVC.score(X_test, y_test)
classifier_results.append({'Classifier': 'SVC', 'Score': test_score})

In [15]:
pd.DataFrame(classifier_results)

Unnamed: 0,Classifier,Depth,Score
0,DecTree,1.0,0.93991
1,DecTree,2.0,0.93991
2,DecTree,3.0,0.947085
3,DecTree,4.0,0.950673
4,DecTree,5.0,0.960538
5,LogReg-L1,,0.973094
6,LogReg-L2,,0.9713
7,SVC,,0.9713


## Ensembles

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier

## Compute ensemble classifier results here

In [17]:
# Code for classifier construction and testing mentioned in St
RFClassifier = RandomForestClassifier(n_estimators=31, random_state=314)
RFClassifier.fit(X_train, y_train)
y_pred_test = RFClassifier.predict(X_test)
test_score = RFClassifier.score(X_test, y_test)
classifier_results.append({'Classifier': 'RandomForestClassifier', 'Count':31,'Score': test_score})

Bag_DecTree = BaggingClassifier(n_estimators=31, random_state=42)
Bag_DecTree.fit(X_train, y_train)
y_pred_test = Bag_DecTree.predict(XBag_LogReg_L1 = BaggingClassifier(base_estimator=LogReg_L1, n_estimators=31, random_state=314)
Bag_LogReg_L1.fit(X_train, y_train)
y_pred_test = Bag_LogReg_L1.predict(X_test)
test_score = Bag_LogReg_L1.score(X_test, y_test)_test)
test_score = Bag_DecTree.score(X_test, y_test)
classifier_results.append({'Classifier': 'Bag-DecTree', 'Count':31, 'Score': test_score})


classifier_results.append({'Classifier': 'Bag-LogReg-L1', 'Count':31,'Score': test_score})

Bag_LogReg_L2 = BaggingClassifier(base_estimator=LogReg_L2, n_estimators=31, random_state=314)
Bag_LogReg_L2.fit(X_train, y_train)
y_pred_test = Bag_LogReg_L2.predict(X_test)
test_score = Bag_LogReg_L2.score(X_test, y_test)
classifier_results.append({'Classifier': 'Bag-LogReg-L2', 'Count':31,'Score': test_score})

Bag_SVM = BaggingClassifier(base_estimator=SVC, n_estimators=31, random_state=314)
Bag_SVM.fit(X_train, y_train)
y_pred_test = Bag_SVM.predict(X_test)
test_score = Bag_SVM.score(X_test, y_test)
classifier_results.append({'Classifier': 'Bag-SVM', 'Count':31,'Score': test_score})

Boost_DecTree = AdaBoostClassifier(n_estimators=31, random_state=42)
Boost_DecTree.fit(X_train, y_train)
y_pred_test = Boost_DecTree.predict(X_test)
test_score = Boost_DecTree.score(X_test, y_test)
classifier_results.append({'Classifier': 'Boost-DecTree', 'Count':31, 'Score': test_score})

Boost_LogReg_L1 = AdaBoostClassifier(base_estimator=LogReg_L1, n_estimators=31, random_state=314)
Boost_LogReg_L1.fit(X_train, y_train)
y_pred_test = Boost_LogReg_L1.predict(X_test)
test_score = Boost_LogReg_L1.score(X_test, y_test)
classifier_results.append({'Classifier': 'Boost-LogReg-L1', 'Count':31,'Score': test_score})

Boost_LogReg_L2 = AdaBoostClassifier(base_estimator=LogReg_L2, n_estimators=31, random_state=314)
Boost_LogReg_L2.fit(X_train, y_train)
y_pred_test = Boost_LogReg_L2.predict(X_test)
test_score = Boost_LogReg_L2.score(X_test, y_test)
classifier_results.append({'Classifier': 'Boost-LogReg-L2', 'Count':31,'Score': test_score})

Boost_SVM = AdaBoostClassifier(algorithm='SAMME', n_estimators=31, random_state=314)
Boost_SVM.fit(X_train, y_train)
y_pred_test = Boost_SVM.predict(X_test)
test_score = Boost_SVM.score(X_test, y_test)
classifier_results.append({'Classifier': 'Boost-SVM', 'Count':31,'Score': test_score})





In [18]:
pd.DataFrame(classifier_results)

Unnamed: 0,Classifier,Count,Depth,Score
0,DecTree,,1.0,0.93991
1,DecTree,,2.0,0.93991
2,DecTree,,3.0,0.947085
3,DecTree,,4.0,0.950673
4,DecTree,,5.0,0.960538
5,LogReg-L1,,,0.973094
6,LogReg-L2,,,0.9713
7,SVC,,,0.9713
8,RandomForestClassifier,31.0,,0.982063
9,Bag-DecTree,31.0,,0.978475


## Neural Networks

In [19]:
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier

In [20]:
#Code for classifier construction and testing mentioned in Step 3.0 of HW document
Percep = Perceptron(random_state=42, max_iter=5, tol=None)
Percep.fit(X_train, y_train)
y_pred_test = Percep.predict(X_test)
test_score = Percep.score(X_test, y_test)
classifier_results.append({'Classifier': 'Perceptron','Score': test_score})

MLP3 = MLPClassifier(hidden_layer_sizes = (3,),random_state=42)
MLP3.fit(X_train, y_train)
y_pred_test = MLP3.predict(X_test)
test_score = MLP3.score(X_test, y_test)
classifier_results.append({'Classifier': 'MLPClassifier','Hidden':(3,),'Score': test_score})

MLP10 = MLPClassifier(hidden_layer_sizes = (10,),random_state=42)
MLP10.fit(X_train, y_train)
y_pred_test = MLP10.predict(X_test)
test_score = MLP10.score(X_test, y_test)
classifier_results.append({'Classifier': 'MLPClassifier','Hidden':(10,),'Score': test_score})

MLP30 = MLPClassifier(hidden_layer_sizes = (10,10,10),random_state=42)
MLP30.fit(X_train, y_train)
y_pred_test = MLP30.predict(X_test)
test_score = MLP30.score(X_test, y_test)
classifier_results.append({'Classifier': 'MLPClassifier','Hidden':(10,10,10),'Score': test_score})


In [21]:
pd.DataFrame(classifier_results)

Unnamed: 0,Classifier,Count,Depth,Hidden,Score
0,DecTree,,1.0,,0.93991
1,DecTree,,2.0,,0.93991
2,DecTree,,3.0,,0.947085
3,DecTree,,4.0,,0.950673
4,DecTree,,5.0,,0.960538
5,LogReg-L1,,,,0.973094
6,LogReg-L2,,,,0.9713
7,SVC,,,,0.9713
8,RandomForestClassifier,31.0,,,0.982063
9,Bag-DecTree,31.0,,,0.978475


## TensorFlow

In [22]:


#Define TensorFlow columns
columns=[]
for word in top_words:
    columns.append(tf.contrib.layers.real_valued_column(word))
columns.append(tf.contrib.layers.real_valued_column('length'))




You are using pip version 9.0.1, however version 9.0.3 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


Instructions for updating:
Use the retry module or similar alternatives.


In [23]:
# Create function input_fn(x,y)
def input_fn(x,y):
    n=len(columns)
    tensor_x={v: tf.constant(x[:,k]) for k,v in enumerate(columns)}
    tensor_y=tf.constant(y)
    
    return tensor_x, tensor_y
    
        

# Create function train_input_fn()
def train_input_fn():
    return input_fn(X_train, y_train)
# Create function test_input_fn()
def test_input_fn():
    return input_fn(X_test, y_test)


In [24]:
#Create DNNClassifier
tf.set_random_seed(42)
DNNClass = tf.estimator.DNNClassifier(feature_columns=columns, hidden_units=[5,5])


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\GSX\\AppData\\Local\\Temp\\tmp6ugganci', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000000011C09320>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [25]:
# train
train_dat=DNNClass.train(input_fn=train_input_fn, steps=1000)


INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into C:\Users\GSX\AppData\Local\Temp\tmp6ugganci\model.ckpt.
INFO:tensorflow:loss = 3017.84, step = 1
INFO:tensorflow:global_step/sec: 64.8636
INFO:tensorflow:loss = 532.8, step = 101 (1.546 sec)
INFO:tensorflow:global_step/sec: 90.5682
INFO:tensorflow:loss = 455.428, step = 201 (1.105 sec)
INFO:tensorflow:global_step/sec: 89.5141
INFO:tensorflow:loss = 407.266, step = 301 (1.117 sec)
INFO:tensorflow:global_step/sec: 92.538
INFO:tensorflow:loss = 373.386, step = 401 (1.081 sec)
INFO:tensorflow:global_step/sec: 92.1542
INFO:tensorflow:loss = 347.786, step = 501 (1.085 sec)
INFO:tensorflow:global_step/sec: 90.0381
INFO:tensorflow:loss = 327.089, step = 601 (1.111 sec)
INFO:tensorflow:global_step/sec: 93.1414
INFO:t

In [26]:
#evaluate
test_dat=DNNClass.evaluate(input_fn=test_input_fn, steps=1)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-04-13-16:47:19
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\GSX\AppData\Local\Temp\tmp6ugganci\model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2018-04-13-16:47:21
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.973094, accuracy_baseline = 0.865471, auc = 0.981523, auc_precision_recall = 0.951545, average_loss = 0.121954, global_step = 1000, label/mean = 0.134529, loss = 135.979, prediction/mean = 0.111127


In [27]:
# results
for key in sorted(test_dat):
  print('%s: %s' % (key, test_dat[key]))

accuracy: 0.973094
accuracy_baseline: 0.865471
auc: 0.981523
auc_precision_recall: 0.951545
average_loss: 0.121954
global_step: 1000
label/mean: 0.134529
loss: 135.979
prediction/mean: 0.111127


In [28]:
# Create LinearClassifier
tf.set_random_seed(42)
LinearClass = tf.estimator.LinearClassifier(feature_columns=columns)


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\GSX\\AppData\\Local\\Temp\\tmp8_zo5y4e', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000000011C09E48>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [29]:
# train
train_dat_l=LinearClass.train(input_fn=train_input_fn,steps=1000)


INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into C:\Users\GSX\AppData\Local\Temp\tmp8_zo5y4e\model.ckpt.
INFO:tensorflow:loss = 3089.35, step = 1
INFO:tensorflow:global_step/sec: 50.6137
INFO:tensorflow:loss = 469.83, step = 101 (1.979 sec)
INFO:tensorflow:global_step/sec: 137.629
INFO:tensorflow:loss = 381.65, step = 201 (0.727 sec)
INFO:tensorflow:global_step/sec: 144.699
INFO:tensorflow:loss = 347.852, step = 301 (0.691 sec)
INFO:tensorflow:global_step/sec: 140.629
INFO:tensorflow:loss = 330.052, step = 401 (0.712 sec)
INFO:tensorflow:global_step/sec: 140.432
INFO:tensorflow:loss = 319.123, step = 501 (0.712 sec)
INFO:tensorflow:global_step/sec: 147.148
INFO:tensorflow:loss = 311.758, step = 601 (0.680 sec)
INFO:tensorflow:global_step/sec: 146.609
INFO:

In [30]:
# evaluate
test_dat_l=LinearClass.evaluate(input_fn=test_input_fn, steps=1)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-04-13-16:48:27
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\GSX\AppData\Local\Temp\tmp8_zo5y4e\model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2018-04-13-16:48:30
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.973991, accuracy_baseline = 0.865471, auc = 0.97772, auc_precision_recall = 0.95136, average_loss = 0.090108, global_step = 1000, label/mean = 0.134529, loss = 100.47, prediction/mean = 0.13526


In [31]:
# results
for key in sorted(test_dat_l):
  print('%s: %s' % (key, test_dat_l[key]))

accuracy: 0.973991
accuracy_baseline: 0.865471
auc: 0.97772
auc_precision_recall: 0.95136
average_loss: 0.090108
global_step: 1000
label/mean: 0.134529
loss: 100.47
prediction/mean: 0.13526
