In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn import metrics
import spacy 

In [2]:
# Load the training and test splits; both CSV files are decoded as latin-1.
train_data_df = pd.read_csv('spam_train.csv', encoding='latin-1')
test_data_df = pd.read_csv('spam_test.csv', encoding='latin-1')

# Column summary of the test split, then a preview of its first rows.
test_data_df.info()
test_data_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 2 columns):
Label      299 non-null object
Message    299 non-null object
dtypes: object(2)
memory usage: 4.8+ KB


Unnamed: 0,Label,Message
0,ham,i will meet mary in the morning 4:45-5:00 am
1,info,"Hi Customer, Booking ID: W6YT81I. Seats: SILVE..."
2,info,"DEAR Tyler, THANK YOU FOR YOUR BOOKING (ORDER ..."
3,spam,Thank You for registering for Mirchi Duathlon....
4,info,Thank you for using your HDFC bank Debitcard e...


In [3]:
# Column summary of the training split, then a preview of its first rows.
train_data_df.info()
train_data_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 2 columns):
Label      699 non-null object
Message    699 non-null object
dtypes: object(2)
memory usage: 11.0+ KB


Unnamed: 0,Label,Message
0,ham,oh how abt 2 days before Christmas
1,info,"Welcome to OVATION HOLD R.No. 184, 114, 395, 3..."
2,info,Thank you for using your ICICI bank CREDITcard...
3,ham,schedule a meeting with the entire team in the...
4,ham,Tommy is my brother


In [4]:
# Data Preprocessing
# Load spaCy's 'en_core_web_sm' model once; text_preprocessing() reads this
# module-level `nlp` object every time it is called.
nlp = spacy.load('en_core_web_sm')
def text_preprocessing(msg, min_len=3):
    """Reduce a message to its informative alphabetic words.

    The message is tokenized with the module-level spaCy object ``nlp``.
    A token is kept only when it is purely alphabetic, is not a stop word
    and has at least ``min_len`` characters.

    Parameters
    ----------
    msg : str
        Raw message text.
    min_len : int, optional
        Minimum number of characters a token needs in order to be kept.
        Defaults to 3, the threshold that used to be hard-coded.

    Returns
    -------
    str
        The kept tokens, in their original order and casing, separated by
        single spaces. An empty string when no token passes the filter.
    """
    # Only token.text and token.is_stop are read below, and both are set by
    # the tokenizer, so there is no need to run the rest of the pipeline.
    doc = nlp.make_doc(msg)
    kept = [
        token.text
        for token in doc
        if token.text.isalpha()
        and not token.is_stop
        and len(token.text) >= min_len
    ]
    # join() avoids the stray leading space and the repeated string copies
    # that accumulating with `+` produced.
    return ' '.join(kept)
# Clean every training message with text_preprocessing().
# NOTE(review): the result overwrites the raw 'Message' column, so re-running
# this cell feeds already-cleaned text through text_preprocessing() again and
# the original messages are only recoverable by re-reading the CSV.
train_data_df['Message'] = train_data_df['Message'].apply(text_preprocessing)

In [5]:
# Preview the training rows after the 'Message' column has been cleaned.
train_data_df.head()

Unnamed: 0,Label,Message
0,ham,abt days Christmas
1,info,Welcome OVATION HOLD
2,info,Thank ICICI bank CREDITcard ending ALIKE Snap...
3,ham,schedule meeting entire team office tomorrow
4,ham,Tommy brother


In [6]:
# Two-stage text classifier: TF-IDF features feeding a multinomial
# Naive Bayes model.
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

In [7]:
# Features are the message text, targets are the class labels.
X_train, y_train = train_data_df['Message'], train_data_df['Label']

In [8]:
# Fit the whole pipeline on the training messages and their labels; the
# fitted pipeline's repr is shown as the cell output.
text_clf.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...rue,
        vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [9]:
# Labels are used as-is; messages go through the same text_preprocessing()
# step that was applied to the training messages.
y_test = test_data_df['Label']
X_test = test_data_df['Message'].apply(text_preprocessing)

# Classify the cleaned test messages with the fitted pipeline.
predictions = text_clf.predict(X_test)

In [10]:
# Confusion matrix of the true test labels against the predicted labels.
print(metrics.confusion_matrix(y_true=y_test, y_pred=predictions))
# Per-class precision, recall, F1 and support (not just overall accuracy).
print(metrics.classification_report(y_true=y_test, y_pred=predictions))

[[ 77  15   2]
 [  0 146   1]
 [  3   3  52]]
              precision    recall  f1-score   support

         ham       0.96      0.82      0.89        94
        info       0.89      0.99      0.94       147
        spam       0.95      0.90      0.92        58

   micro avg       0.92      0.92      0.92       299
   macro avg       0.93      0.90      0.91       299
weighted avg       0.92      0.92      0.92       299



In [11]:
# A single hand-written message used to try out the trained classifier.
sample_text = "Greetings, Paytm SELLER SERVICES wishes you a Happy Ganesh Chaturthi"

In [12]:
# Clean the sample the same way as the training data, then classify it.
# predict() expects an iterable of documents, hence the one-element list.
# NOTE(review): sample_text is rebound to its cleaned form here, so the raw
# sentence is only available again after re-running the cell that defines it.
sample_text = text_preprocessing(sample_text)
print(text_clf.predict([sample_text]))

['spam']
