### Load Datasets

In [1]:
import pandas as pd

data = pd.read_csv('spam.csv')
data.head(15)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [2]:
data.shape

(5572, 2)

In [3]:
data.isnull().sum()

label    0
text     0
dtype: int64

In [4]:
data.label.unique()

array(['ham', 'spam'], dtype=object)

In [73]:
from sklearn import preprocessing

label_encoding = preprocessing.LabelEncoder()

data['label'] = label_encoding.fit_transform(data['label'].astype(str))

In [74]:
data.label.value_counts()

0    4825
1     747
Name: label, dtype: int64

## Feature Extraction

In [75]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()

x_train_tfidf = tfidf_vectorizer.fit_transform(data['text'])

tfidf_vectorizer.vocabulary_

{'go': 3550,
 'until': 8030,
 'jurong': 4350,
 'point': 5920,
 'crazy': 2327,
 'available': 1303,
 'only': 5537,
 'in': 4087,
 'bugis': 1751,
 'great': 3634,
 'world': 8489,
 'la': 4476,
 'buffet': 1749,
 'cine': 2048,
 'there': 7645,
 'got': 3594,
 'amore': 1069,
 'wat': 8267,
 'ok': 5504,
 'lar': 4512,
 'joking': 4318,
 'wif': 8392,
 'oni': 5533,
 'free': 3358,
 'entry': 2949,
 'wkly': 8447,
 'comp': 2165,
 'to': 7756,
 'win': 8405,
 'fa': 3087,
 'cup': 2386,
 'final': 3207,
 'tkts': 7743,
 '21st': 411,
 'may': 4930,
 '2005': 402,
 'text': 7595,
 '87121': 784,
 'receive': 6297,
 'question': 6190,
 'std': 7230,
 'txt': 7933,
 'rate': 6242,
 'apply': 1156,
 '08452810075over18': 77,
 'dun': 2802,
 'say': 6633,
 'so': 7024,
 'early': 2823,
 'hor': 3927,
 'already': 1042,
 'then': 7640,
 'nah': 5238,
 'don': 2712,
 'think': 7660,
 'he': 3781,
 'goes': 3558,
 'usf': 8075,
 'lives': 4665,
 'around': 1207,
 'here': 3831,
 'though': 7680,
 'freemsg': 3365,
 'hey': 3841,
 'darling': 2443,
 'it

In [76]:
print(x_train_tfidf[20])

  (0, 6743)	0.4867303258073519
  (0, 7621)	0.22605624014747247
  (0, 5244)	0.38387153321200107
  (0, 3864)	0.3654615976273315
  (0, 7127)	0.5476316191742238
  (0, 3953)	0.2622388614389741
  (0, 4206)	0.19985701675873627
  (0, 8609)	0.14986725780873528


In [8]:
x_train_tfidf.shape

(5572, 8673)

### Model

In [77]:
from sklearn.model_selection import train_test_split

Y = data['label']

x_train, x_test, y_train, y_test = train_test_split(x_train_tfidf, Y, test_size=0.2)

In [78]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(max_depth=15)
clf.fit(x_train, y_train)


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=15,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [79]:
y_pred = clf.predict(x_test)

In [80]:
y_pred[0:4]

array([0, 0, 0, 0])

#### Evaluating First model 

In [81]:
from sklearn.metrics import accuracy_score
print("Accuracy Score : ", accuracy_score(y_test, y_pred))

Accuracy Score :  0.967713004484305


In [82]:
df_y = pd.DataFrame({'y_test': y_test, 'y_prd': y_pred})
df_y.sample(10)

Unnamed: 0,y_test,y_prd
3118,0,0
660,0,0
5400,0,0
5286,0,0
2221,0,0
3463,0,0
2226,0,0
3991,0,0
827,0,0
666,0,0


## Second Model (Ensemble)

In [83]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier

In [84]:
estimator = []

estimator.append(('LR', LogisticRegression(C=1, solver='liblinear',max_iter = 200)))
estimator.append(('SVC', SVC(kernel='linear',gamma ='auto', probability = True)))
estimator.append(('DTC', DecisionTreeClassifier(max_depth=15)))

In [85]:
# Voting Classifier with hard voting
vot_hard = VotingClassifier(estimators = estimator, voting ='hard')

In [86]:
Y = data['label']
x_train, x_test, y_train, y_test = train_test_split(x_train_tfidf, Y, test_size=0.2)

In [87]:
vot_hard.fit(x_train, y_train)

VotingClassifier(estimators=[('LR',
                              LogisticRegression(C=1, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=200,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='liblinear', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('SVC',
                              SVC(C=1.0, cache_size=200, class_weight=None,
                                  coef0=0.0, decision_functio...
                                  verbose=False)),
                             ('DTC',
                      

In [88]:
y_pred = vot_hard.predict(x_test)

In [89]:
y_pred.shape

(1115,)

In [90]:
from sklearn.metrics import accuracy_score
print("Accuracy Score : ", accuracy_score(y_test, y_pred))

Accuracy Score :  0.9757847533632287


In [91]:
# Voting Classifier with soft voting
vot_soft = VotingClassifier(estimators = estimator, voting ='soft')
vot_soft.fit(x_train, y_train)
y_pred = vot_soft.predict(x_test)

In [92]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score : ", accuracy)

Accuracy Score :  0.97847533632287


## Serializing 

In [93]:
import joblib
import sklearn

In [94]:
filename = 'models/voting_clf_soft_model.joblib'
scikit_learn_version = sklearn.__version__

model_params = {}

model_params['preprocessing'] = tfidf_vectorizer
model_params['model'] = vot_soft
model_params['sklearn_version'] = scikit_learn_version
model_params['accuracy'] = accuracy



joblib.dump(model_params, filename)

['models/voting_clf_soft_model.joblib']

### Load Joblib model and test it

In [95]:
clf_checkpoint = joblib.load(filename)

In [96]:
reloaded_vect = clf_checkpoint['preprocessing']
reloaded_vect

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [97]:
clf_model = clf_checkpoint['model']
clf_model

VotingClassifier(estimators=[('LR',
                              LogisticRegression(C=1, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=200,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='liblinear', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('SVC',
                              SVC(C=1.0, cache_size=200, class_weight=None,
                                  coef0=0.0, decision_functio...
                                  verbose=False)),
                             ('DTC',
                      

In [98]:
x_test_trans_new = reloaded_vect.fit_transform(x_test)
y_pred = clf_model.predict(x_test_trans_new)
y_pred

AttributeError: lower not found