In [56]:
import nltk
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from nltk import wordnet
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from pandas_ml import ConfusionMatrix
import re

In [3]:
with open('data/spam/SMSSpamCollection','r') as f:
    text = f.read()

In [8]:
data = [value.split('\t') for value in text.split('\n')]
df = pd.DataFrame(data)
df.columns = ['label','text']
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Data Preprocessing
- Step-1 : Label Encoder (ham =0 , spam = 1)
- Step-2 : Text processing 

In [10]:
df['label'].replace(['ham','spam'],[0,1],inplace=True)

In [20]:
wnl = WordNetLemmatizer()

In [21]:
def poslemma(pos_tag):
    word, pos = pos_tag
    if pos.startswith('R'):
        pos = 'r'
    elif pos.startswith('V'):
        pos = 'v'
    elif pos.startswith('J'):
        pos ='a'
    else:
        pos ='n'
        
    return wnl.lemmatize(word,pos=pos)
    
def lematization(sent):
    tokens = sent.split()
    pos = pos_tag(tokens)
    return " ".join([poslemma(tag) for tag in pos])

In [47]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
def tokenization(text):
    text = text.split()
    text_list = [word for word in text if word not in stop]
    return " ".join(text_list)

In [48]:
tokenization('innomatics is in nizampet')

'innomatics nizampet'

In [49]:
def textprocessing(sample):
    sample = sample.lower()
    sample = re.sub(r'[^a-z0-9]'," ",sample)
    sample = tokenization(sample)
    sample = lematization(sample)
    return sample

In [50]:
df.dropna(axis=0,inplace=True)
df['clean_text'] = df['text'].apply(textprocessing)

In [51]:
df.head()

Unnamed: 0,label,text,clean_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,0,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think go usf life around though


In [57]:
cv = CountVectorizer() # less than 5000
tfidf = TfidfVectorizer() # less than 5000 
hashv = HashingVectorizer() # is prefered if feature are more than 5000 + 

In [59]:
x_cv = cv.fit_transform(df['clean_text']).toarray()
x_tfidf = tfidf.fit_transform(df['clean_text']).toarray()
#x_hashv = hashv.fit_transform(df['clean_text']).toarray()

In [62]:
len(cv.get_feature_names())

7626

In [78]:
df['label'] = df['label'].astype('int')
y = df['label'].values

In [79]:
from sklearn.model_selection import train_test_split
x_train, x_test,y_train,y_test = train_test_split(x_cv,y,test_size = 0.3)

In [80]:
model_nb = MultinomialNB()
model_dt = DecisionTreeClassifier()

In [82]:
model_nb.fit(x_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [83]:
# Evaluation 
y_pred = model_nb.predict(x_test)
cm = ConfusionMatrix(y_test,y_pred)
cm

Predicted  False  True  __all__
Actual                         
False       1441    13     1454
True           8   211      219
__all__     1449   224     1673

In [86]:
cm.stats_overall

  num = df[df > 1].dropna(axis=[0, 1], thresh=1).applymap(lambda n: choose(n, 2)).sum().sum() - np.float64(nis2 * njs2) / n2


OrderedDict([('Accuracy', 0.9874476987447699),
             ('95% CI', (0.98087617851218323, 0.99221359436006129)),
             ('No Information Rate', 'ToDo'),
             ('P-Value [Acc > NIR]', 3.1541413901400216e-74),
             ('Kappa', 0.94536310294902093),
             ("Mcnemar's Test P-Value", 'ToDo')])

In [87]:
cm.classification_report

Unnamed: 0_level_0,precision,recall,F1_score,support
Classes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,0.994479,0.991059,0.992766,1454
True,0.941964,0.96347,0.952596,219
__avg / total__,0.987605,0.987448,0.987508,1673


In [112]:
test = 'have free coffee dail 1245'

In [113]:
sample = textprocessing(test)

In [114]:
x_sample = cv.transform([sample]).toarray()

In [115]:
model_nb.predict_proba(x_sample)

array([[ 0.77452447,  0.22547553]])

In [116]:
sample

'free coffee dail 1245'