In [1]:
# The original code can be downloaded from below github code repository
# https://github.com/PacktPublishing/Python-Machine-Learning-By-Example/tree/master/Chapter03
# The downloaded code is modified for a different data set and used Spacy for text cleaning
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from collections import defaultdict
import spacy 

In [2]:
messages = pd.read_csv('SMSSpamCollection', sep='\t', names=["label", "message"])
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Clean up function using Spacy
nlp = spacy.load('en_core_web_sm')
# Function to remove 'StopWords', 'Check for string only'
def text_preprocessing(msg):
    str_message = ''
    doc = nlp(msg) # Breaking message in to tokens
    for token in doc:
        if token.text.isalpha():
            if len(token.text)>=3:
                str_message = str_message + ' ' + token.text
    return str_message


In [4]:
# Convert Ham, Spam Categorical data in to numerical as 1,0
def to_numerical (cls_label):
    if cls_label == 'ham':
        return 1
    elif cls_label == 'spam':
        return 0
    

In [5]:
# Data Preprocessing
X = messages['message'].apply(text_preprocessing)
y = messages['label'].apply(to_numerical)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [7]:
#Starting With the Prior. we first group the data by label
def get_label_index(labels):
    label_index = defaultdict(list)
    for index, label in enumerate(labels):
        label_index[label].append(index)
    return label_index #label_index = {0:[10,2,3,4,5,60],1:[44,5,5,5,5,5]}


In [8]:
#Calculate Prior
def get_prior(label_index):
    """ Compute prior based on training samples
    Args:
        label_index (grouped sample indices by class/label)
    Returns:
        dictionary, with class label as key, corresponding prior as the value
    """
    prior = {label: len(index) for label, index in label_index.items()} #prior = {0: 6, 1: 6}
    total_count = sum(prior.values())
    for label in prior:
        prior[label] /= float(total_count)
    return prior

In [9]:
# Calculate Likelyhood
def get_likelihood(term_document_matrix, label_index, smoothing=0):
    """ Compute likelihood based on training samples
    Args:
        term_document_matrix (sparse matrix)
        label_index (grouped sample indices by class)
        smoothing (integer, additive Laplace smoothing parameter)
    Returns:
        dictionary, with class as key, corresponding conditional probability P(feature|class) vector as value
    """
    likelihood = {}
    for label, index in label_index.items():
        likelihood[label] = term_document_matrix[index, :].sum(axis=0) + smoothing
        likelihood[label] = np.asarray(likelihood[label])[0]
        total_count = likelihood[label].sum()
        likelihood[label] = likelihood[label] / float(total_count)
    return likelihood

In [10]:
# Calculate the Posterior. P (S/x)
def get_posterior(term_document_matrix, prior, likelihood):
    """ Compute posterior of testing samples, based on prior and likelihood
    Args:
        term_document_matrix (sparse matrix)
        prior (dictionary, with class label as key, corresponding prior as the value)
        likelihood (dictionary, with class label as key, corresponding conditional probability vector as value)
    Returns:
        dictionary, with class label as key, corresponding posterior as value
    """
    num_docs = term_document_matrix.shape[0]
    posteriors = []
    for i in range(num_docs):
        # posterior is proportional to prior * likelihood
        # = exp(log(prior * likelihood))
        # = exp(log(prior) + log(likelihood))
        posterior = {key: np.log(prior_label) for key, prior_label in prior.items()}
        for label, likelihood_label in likelihood.items():
            term_document_vector = term_document_matrix.getrow(i)
            counts = term_document_vector.data
            indices = term_document_vector.indices
            for count, index in zip(counts, indices):
                posterior[label] += np.log(likelihood_label[index]) * count
        # exp(-1000):exp(-999) will cause zero division error,
        # however it equates to exp(0):exp(1)
        min_log_posterior = min(posterior.values())
        for label in posterior:
            try:
                posterior[label] = np.exp(posterior[label] - min_log_posterior)
            except:
                # if one's log value is excessively large, assign it infinity
                posterior[label] = float('inf')
        # normalize so that all sums up to 1
        sum_posterior = sum(posterior.values())
        for label in posterior:
            if posterior[label] == float('inf'):
                posterior[label] = 1.0
            else:
                posterior[label] /= sum_posterior
        posteriors.append(posterior.copy())
    return posteriors


In [11]:
#Train the model
cv = CountVectorizer(stop_words="english", max_features=500) #Vector representation of words with term frequency
term_docs = cv.fit_transform(X_train)

label_index = get_label_index(y_train)
prior = get_prior(label_index)
print("Bayes prior = ",prior)

smoothing = 1
likelihood = get_likelihood(term_docs, label_index, smoothing)
print("Bayes likelihood = ",likelihood)

Bayes prior =  {1: 0.865897435897436, 0: 0.1341025641025641}
Bayes likelihood =  {1: array([1.21980971e-03, 1.62641295e-03, 8.13206473e-04, 1.78905424e-03,
       9.75847768e-04, 1.05716842e-03, 1.87037489e-03, 4.06603237e-04,
       2.27697812e-03, 1.62641295e-03, 5.36716272e-03, 1.13848906e-03,
       9.75847768e-04, 2.43961942e-04, 1.62641295e-04, 1.70773359e-03,
       4.71659754e-03, 1.38245100e-03, 2.43961942e-04, 8.13206473e-05,
       1.05716842e-03, 1.62641295e-04, 1.62641295e-04, 8.13206473e-05,
       1.05716842e-03, 1.21980971e-03, 3.98471172e-03, 2.11433683e-03,
       1.46377165e-03, 8.13206473e-04, 9.75847768e-04, 1.46377165e-03,
       1.87037489e-03, 1.05716842e-03, 2.68358136e-03, 1.70773359e-03,
       2.27697812e-03, 1.95169554e-03, 2.92754330e-03, 8.13206473e-05,
       9.75847768e-04, 1.38245100e-03, 1.13848906e-03, 3.25282589e-04,
       2.03301618e-03, 7.31885826e-04, 1.21980971e-03, 1.13848906e-03,
       1.05716842e-03, 2.03301618e-03, 1.13848906e-03, 3.984711

       0.00171065, 0.00146628, 0.00219941, 0.00024438, 0.00024438])}


In [12]:
#Pass test data to classifier pipeline
term_docs_test = cv.transform(X_test)
posterior = get_posterior(term_docs_test, prior, likelihood)

correct = 0.0
for pred, actual in zip(posterior, y_test):
    print(pred,actual)
    if actual == 1:
        if pred[1] >= 0.5:
            correct += 1
    elif pred[0] > 0.5:
        correct += 1

print('The accuracy on {0} testing samples is: {1:.1f}%'.format(len(y_test), correct/len(y_test)*100))


{1: 0.9950937079190871, 0: 0.004906292080912877} 1
{1: 0.8171458089054962, 0: 0.1828541910945038} 1
{1: 0.9999231324039412, 0: 7.686759605884258e-05} 1
{1: 0.9824144653443414, 0: 0.01758553465565863} 1
{1: 0.4263315883462188, 0: 0.5736684116537812} 1
{1: 0.9993715300495305, 0: 0.0006284699504695545} 1
{1: 0.9993765403055139, 0: 0.0006234596944860571} 1
{1: 0.9999999167267382, 0: 8.327326182922253e-08} 1
{1: 0.9992674468191595, 0: 0.0007325531808405091} 1
{1: 0.8658974358974358, 0: 0.1341025641025641} 1
{1: 0.9993898320203493, 0: 0.0006101679796507142} 1
{1: 0.9996861475937786, 0: 0.0003138524062214354} 1
{1: 0.9975943982420732, 0: 0.0024056017579268552} 1
{1: 0.9999930435844959, 0: 6.956415504059732e-06} 1
{1: 2.2706932417433892e-13, 0: 0.999999999999773} 0
{1: 0.9570350891920582, 0: 0.042964910807941806} 1
{1: 0.9999999489549702, 0: 5.104502981152857e-08} 1
{1: 0.7368740449157285, 0: 0.26312595508427156} 1
{1: 0.3743403878172993, 0: 0.6256596121827007} 1
{1: 0.9956692263776971, 0: 0.0

{1: 0.9966755851763375, 0: 0.003324414823662507} 0
{1: 0.788601495545944, 0: 0.21139850445405592} 1
{1: 0.9967860997179476, 0: 0.0032139002820524283} 1
{1: 0.9997273396315981, 0: 0.0002726603684018928} 1
{1: 0.9999122253697306, 0: 8.777463026942223e-05} 1
{1: 0.8658974358974358, 0: 0.1341025641025641} 1
{1: 0.0025697743380806173, 0: 0.9974302256619194} 0
{1: 0.20966218788881893, 0: 0.790337812111181} 0
{1: 0.9833425525765103, 0: 0.01665744742348977} 1
{1: 0.9999867256346754, 0: 1.3274365324628198e-05} 1
{1: 0.9483503010847396, 0: 0.05164969891526035} 1
{1: 0.9993616256278949, 0: 0.0006383743721051546} 1
{1: 0.8804800277383995, 0: 0.11951997226160055} 1
{1: 0.987900600157102, 0: 0.012099399842897993} 1
{1: 0.9817238997578821, 0: 0.018276100242117857} 1
{1: 0.9998187732159273, 0: 0.0001812267840727128} 1
{1: 0.9999768748831386, 0: 2.3125116861384786e-05} 1
{1: 0.9999999985256451, 0: 1.4743548779669952e-09} 1
{1: 0.9977401572790957, 0: 0.0022598427209042913} 1
{1: 0.8883478036566197, 0: 0

{1: 0.9924745920075667, 0: 0.0075254079924332905} 1
{1: 0.9760904910233598, 0: 0.023909508976640258} 1
{1: 0.999955145756618, 0: 4.485424338193457e-05} 1
{1: 0.7497600508283188, 0: 0.25023994917168124} 0
{1: 0.7846319954601536, 0: 0.21536800453984636} 1
{1: 0.3911458605531684, 0: 0.6088541394468316} 0
{1: 0.9868771183698765, 0: 0.013122881630123493} 1
{1: 0.999999995047599, 0: 4.952401068856448e-09} 1
{1: 0.9633426397320495, 0: 0.03665736026795052} 1
{1: 0.9927804947228666, 0: 0.0072195052771334145} 1
{1: 0.8658974358974358, 0: 0.1341025641025641} 1
{1: 0.9797706224325999, 0: 0.020229377567400095} 1
{1: 0.9908824891268508, 0: 0.00911751087314913} 1
{1: 0.9994614417121314, 0: 0.0005385582878686833} 1
{1: 0.9645009455521478, 0: 0.035499054447852174} 1
{1: 0.9985002513427628, 0: 0.001499748657237284} 1
{1: 0.9535153335369762, 0: 0.046484666463023856} 1
{1: 1.8808937991395148e-05, 0: 0.9999811910620086} 0
{1: 0.9999668999800442, 0: 3.310001995577884e-05} 1
{1: 0.9999476086712475, 0: 5.2391

{1: 0.9997663681259721, 0: 0.00023363187402788754} 1
{1: 0.999999420981429, 0: 5.790185709840747e-07} 1
{1: 0.9962977592155725, 0: 0.0037022407844274573} 1
{1: 0.9931114828549029, 0: 0.006888517145097129} 1
{1: 0.9670992043805049, 0: 0.032900795619495125} 1
{1: 0.9772587799018879, 0: 0.022741220098112032} 1
{1: 0.9884830760156889, 0: 0.01151692398431116} 1
{1: 1.87083512649981e-09, 0: 0.9999999981291648} 0
{1: 0.9991570365681058, 0: 0.0008429634318941871} 1
{1: 0.9996358517148457, 0: 0.00036414828515430654} 1
{1: 0.9991560968699719, 0: 0.0008439031300281526} 1
{1: 0.9928907744180285, 0: 0.007109225581971444} 1
{1: 1.3035616740893035e-10, 0: 0.9999999998696438} 0
{1: 0.8658974358974358, 0: 0.1341025641025641} 1
{1: 0.997712435887467, 0: 0.002287564112532934} 1
{1: 1.1700381021761358e-07, 0: 0.9999998829961898} 0
{1: 0.9966330808981718, 0: 0.003366919101828229} 1
{1: 0.9999881487452968, 0: 1.1851254703109373e-05} 1
{1: 0.9999273784912844, 0: 7.262150871558957e-05} 1
{1: 0.992890774418028

{1: 0.9985587226658608, 0: 0.0014412773341391837} 1
{1: 2.318622293506227e-06, 0: 0.9999976813777065} 0
{1: 0.9973630897760903, 0: 0.002636910223909635} 1
{1: 0.9999999942446968, 0: 5.755303198547051e-09} 1
{1: 0.9983965574312376, 0: 0.001603442568762417} 1
{1: 4.7897063862334086e-06, 0: 0.9999952102936137} 0
{1: 0.9883834748109838, 0: 0.011616525189016277} 1
{1: 0.9995501089726955, 0: 0.0004498910273045407} 1
{1: 0.9997745588725147, 0: 0.00022544112748540274} 1
{1: 0.9955339693486566, 0: 0.004466030651343425} 1
{1: 2.1961318477004268e-08, 0: 0.9999999780386816} 0
{1: 0.8658974358974358, 0: 0.1341025641025641} 1
{1: 0.9999139090865263, 0: 8.609091347371605e-05} 1
{1: 0.9995903177905711, 0: 0.0004096822094289885} 1
{1: 0.9758298322918141, 0: 0.024170167708185943} 1
{1: 0.9838267291834515, 0: 0.01617327081654855} 1
{1: 0.9997547620132828, 0: 0.0002452379867171326} 1
{1: 0.9981160881751757, 0: 0.0018839118248242745} 1
{1: 0.9654367440244531, 0: 0.03456325597554689} 1
{1: 0.933182904426747

{1: 2.881441001205461e-10, 0: 0.9999999997118559} 0
{1: 0.9989875380552283, 0: 0.0010124619447716787} 1
{1: 0.8658974358974358, 0: 0.1341025641025641} 1
{1: 0.9997663681259721, 0: 0.00023363187402788754} 1
{1: 0.9999999657855948, 0: 3.421440516308278e-08} 1
{1: 0.9983902281708322, 0: 0.0016097718291677463} 1
{1: 0.9973175733681071, 0: 0.002682426631892896} 1
{1: 0.9999919810446382, 0: 8.018955361819415e-06} 1
{1: 0.880264558456687, 0: 0.11973544154331289} 1
{1: 0.9583323847326695, 0: 0.04166761526733048} 1
{1: 0.9962719469898866, 0: 0.0037280530101133852} 1
{1: 0.9984654206117957, 0: 0.0015345793882043665} 1
{1: 0.999999945447094, 0: 5.455290602236543e-08} 1
{1: 9.832448357090158e-13, 0: 0.9999999999990168} 0
{1: 0.0008489406499255524, 0: 0.9991510593500744} 0
{1: 7.705790498801033e-10, 0: 0.999999999229421} 0
{1: 0.8804800277383995, 0: 0.11951997226160055} 1
{1: 1.2207118667544187e-11, 0: 0.9999999999877929} 0
{1: 0.9999577213597318, 0: 4.227864026821245e-05} 1
{1: 0.9995488519830104,

{1: 0.9733525637456759, 0: 0.02664743625432414} 1
{1: 0.9147114770115043, 0: 0.0852885229884957} 1
{1: 0.8658974358974358, 0: 0.1341025641025641} 1
{1: 0.9884986121547324, 0: 0.011501387845267505} 1
{1: 0.9950660465809764, 0: 0.0049339534190236374} 1
{1: 0.9956692263776971, 0: 0.004330773622302905} 1
{1: 0.961103846857754, 0: 0.03889615314224609} 1
{1: 0.9952005438278452, 0: 0.004799456172154843} 1
{1: 0.9966363667272586, 0: 0.0033636332727414045} 1
{1: 0.33178462039734546, 0: 0.6682153796026545} 1
{1: 0.9994425483102621, 0: 0.0005574516897379032} 1
{1: 0.0008194194453308326, 0: 0.9991805805546692} 0
{1: 0.9997663681259721, 0: 0.00023363187402788754} 1
{1: 3.934728786081414e-07, 0: 0.9999996065271214} 0
{1: 0.9999818676827322, 0: 1.8132317267721005e-05} 1
{1: 0.9999119783878333, 0: 8.802161216673357e-05} 1
{1: 0.9747956640411496, 0: 0.02520433595885036} 1
{1: 0.5468042524585033, 0: 0.45319574754149666} 1
{1: 0.998820560135443, 0: 0.0011794398645570095} 1
{1: 0.9999959789642998, 0: 4.02

{1: 1.279473817229256e-10, 0: 0.9999999998720526} 0
{1: 0.9998502356110758, 0: 0.0001497643889242746} 1
{1: 0.9883834748109837, 0: 0.011616525189016319} 1
{1: 0.9706652141955909, 0: 0.029334785804409062} 1
{1: 0.9999986377619327, 0: 1.3622380673424056e-06} 1
{1: 0.9956692263776971, 0: 0.004330773622302905} 1
{1: 0.999397463748151, 0: 0.0006025362518490734} 1
{1: 0.8658974358974358, 0: 0.1341025641025641} 1
{1: 0.003891380835063019, 0: 0.9961086191649371} 0
{1: 0.9890180957529933, 0: 0.010981904247006763} 1
{1: 0.9528475013326193, 0: 0.04715249866738064} 0
{1: 0.9974873579117773, 0: 0.0025126420882226994} 1
{1: 0.9772207636879827, 0: 0.022779236312017278} 1
{1: 0.9946661562211131, 0: 0.0053338437788869155} 1
{1: 0.9999999294070824, 0: 7.059291760305206e-08} 1
{1: 0.9532976131835382, 0: 0.046702386816461806} 1
{1: 0.9801661940521055, 0: 0.019833805947894495} 1
{1: 0.9992585800706557, 0: 0.000741419929344302} 1
{1: 0.9930844118082419, 0: 0.006915588191758104} 1
{1: 0.9772587799018879, 0: 

{1: 0.9854402499527999, 0: 0.014559750047200143} 1
{1: 0.9955870994758091, 0: 0.004412900524190808} 1
{1: 0.9999935847062971, 0: 6.415293702970145e-06} 1
{1: 1.227283164620821e-05, 0: 0.9999877271683538} 0
{1: 0.9457875621973142, 0: 0.05421243780268586} 1
{1: 0.9991919918917033, 0: 0.0008080081082967315} 1
{1: 0.9962497332116641, 0: 0.0037502667883358484} 1
{1: 0.9856156996550864, 0: 0.014384300344913507} 1
{1: 3.6156198330097958e-09, 0: 0.9999999963843802} 0
{1: 0.8658974358974358, 0: 0.1341025641025641} 1
{1: 0.7771086009187039, 0: 0.22289139908129607} 1
{1: 0.9907776758183364, 0: 0.009222324181663626} 1
{1: 0.009237356160879931, 0: 0.99076264383912} 0
{1: 0.9999999998733278, 0: 1.2667219506051725e-10} 1
{1: 0.9907054618249689, 0: 0.009294538175031154} 1
{1: 0.6628720612956743, 0: 0.3371279387043257} 1
{1: 0.7975377755517489, 0: 0.2024622244482511} 1
{1: 0.9998313341637532, 0: 0.00016866583624674866} 1
{1: 0.9971187127606943, 0: 0.002881287239305646} 1
{1: 0.9999999157748818, 0: 8.42

{1: 0.990875531964892, 0: 0.009124468035107972} 1
{1: 0.9999998245817401, 0: 1.7541825987723272e-07} 1
{1: 1.1213699277268318e-06, 0: 0.9999988786300723} 0
{1: 2.7309595325767096e-05, 0: 0.9999726904046742} 0
{1: 0.9818258540063374, 0: 0.018174145993662613} 1
{1: 6.336170739968256e-07, 0: 0.999999366382926} 0
{1: 0.9972068618039523, 0: 0.002793138196047617} 1
{1: 0.9989875380552283, 0: 0.0010124619447716787} 1
{1: 0.8804800277383995, 0: 0.11951997226160055} 1
{1: 0.8658974358974358, 0: 0.1341025641025641} 1
{1: 1.6786549412712166e-11, 0: 0.9999999999832134} 0
{1: 3.461247586100487e-07, 0: 0.9999996538752414} 0
{1: 0.9992897671310227, 0: 0.000710232868977298} 1
{1: 0.9927804947228666, 0: 0.0072195052771334145} 1
{1: 0.8658974358974358, 0: 0.1341025641025641} 1
{1: 0.9999997537179464, 0: 2.462820536926151e-07} 1
{1: 0.9692144685371855, 0: 0.030785531462814496} 1
{1: 1.9165455808617954e-06, 0: 0.9999980834544191} 0
{1: 0.9987022460295054, 0: 0.0012977539704945428} 1
{1: 0.9860928529283438

{1: 0.9982681102642803, 0: 0.0017318897357196839} 1
{1: 0.9624939090084292, 0: 0.03750609099157079} 1
{1: 0.9999842906915781, 0: 1.570930842186025e-05} 1
{1: 0.937244741418771, 0: 0.06275525858122899} 1
{1: 0.9998140816468638, 0: 0.00018591835313617612} 1
{1: 0.9759779792332094, 0: 0.02402202076679061} 1
{1: 0.9999995783148059, 0: 4.216851940589149e-07} 1
{1: 0.7013392490179237, 0: 0.2986607509820763} 1
{1: 0.9579986417012808, 0: 0.04200135829871919} 1
{1: 0.9882424169111012, 0: 0.011757583088898797} 1
{1: 0.9999052912302574, 0: 9.470876974259e-05} 1
{1: 0.0008063286017634719, 0: 0.9991936713982366} 0
{1: 0.9974166658506097, 0: 0.0025833341493902424} 1
{1: 0.9997361438478206, 0: 0.0002638561521793352} 1
{1: 0.9997958587958454, 0: 0.00020414120415455009} 1
{1: 0.9531425480092317, 0: 0.046857451990768344} 1
{1: 0.9999161961993974, 0: 8.380380060258202e-05} 1
{1: 0.993209854794308, 0: 0.006790145205692044} 1
{1: 0.9999999625942544, 0: 3.740574565633871e-08} 1
{1: 0.9889056405965997, 0: 0.