In [1]:
import pandas as pd
import csv

#Data Loading

messages = [line.rstrip() for line in open('dataset.csv')]

print len(messages)

5574


In [2]:
#Appending column headers
messages = pd.read_csv('dataset.csv', sep='\t', quoting=csv.QUOTE_NONE,names=["label", "message"])

In [3]:
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
data_size=messages.shape
print(data_size)

(5574, 2)


In [5]:
messages_col_names=list(messages.columns)
print(messages_col_names)

['label', 'message']


In [6]:
print(messages.groupby('label').describe())

      message                                                               
        count unique                                                top freq
label                                                                       
ham      4827   4518                             Sorry, I'll call later   30
spam      747    653  Please call our customer service representativ...    4


In [7]:
#Identifying the outcome/target variable.
message_target=messages['label'] 

print(message_target.unique())

['ham' 'spam']


### tokenization

In [8]:
import nltk
from nltk.tokenize import word_tokenize

In [9]:
def split_tokens(message):

  message=message.lower()

  message = unicode(message, 'utf8') #convert bytes into proper unicode

  word_tokens =word_tokenize(message)

  return word_tokens

In [10]:
messages['tokenized_message'] = messages.apply(lambda row: split_tokens(row['message']),axis=1)

In [17]:
#nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [11]:
list(messages.columns)

['label', 'message', 'tokenized_message']

In [12]:
messages['tokenized_message'].head()

0    [go, until, jurong, point, ,, crazy.., availab...
1             [ok, lar, ..., joking, wif, u, oni, ...]
2    [free, entry, in, 2, a, wkly, comp, to, win, f...
3    [u, dun, say, so, early, hor, ..., u, c, alrea...
4    [nah, i, do, n't, think, he, goes, to, usf, ,,...
Name: tokenized_message, dtype: object

In [13]:
len(list(messages['tokenized_message']))

5574

### Lemmatization : method to convert a word into its base/root form.

In [14]:
from nltk.stem.wordnet import WordNetLemmatizer

def split_into_lemmas(message):

    lemma = []

    lemmatizer = WordNetLemmatizer()

    for word in message:

        a=lemmatizer.lemmatize(word)

        lemma.append(a)

    return lemma


In [15]:
messages['lemmatized_message'] = messages.apply(lambda row: split_into_lemmas(row['tokenized_message']),axis=1)

In [16]:
print('Tokenized message:',messages['tokenized_message'][11])
print('Lemmatized message:',messages['lemmatized_message'][11])

('Tokenized message:', [u'six', u'chances', u'to', u'win', u'cash', u'!', u'from', u'100', u'to', u'20,000', u'pounds', u'txt', u'>', u'csh11', u'and', u'send', u'to', u'87575.', u'cost', u'150p/day', u',', u'6days', u',', u'16+', u'tsandcs', u'apply', u'reply', u'hl', u'4', u'info'])
('Lemmatized message:', [u'six', u'chance', u'to', u'win', u'cash', u'!', u'from', u'100', u'to', u'20,000', u'pound', u'txt', u'>', u'csh11', u'and', u'send', u'to', u'87575.', u'cost', u'150p/day', u',', u'6days', u',', u'16+', u'tsandcs', u'apply', u'reply', u'hl', u'4', u'info'])


In [17]:
list(messages.columns)

['label', 'message', 'tokenized_message', 'lemmatized_message']

### Stop Word Removal : 
#### commons words that do not add any relevance for classification (For eg. “the”, “a”, “an”, “in” etc.)

In [18]:
from nltk.corpus import stopwords

def stopword_removal(message):

    stop_words = set(stopwords.words('english'))

    filtered_sentence = []

    filtered_sentence = ' '.join([word for word in message if word not in stop_words])

    return filtered_sentence


In [19]:
messages['preprocessed_message'] = messages.apply(lambda row: stopword_removal(row['lemmatized_message']),axis=1)

In [20]:
list(messages.columns)

['label',
 'message',
 'tokenized_message',
 'lemmatized_message',
 'preprocessed_message']

In [21]:
messages['preprocessed_message'].head()

0    go jurong point , crazy.. available bugis n gr...
1                      ok lar ... joking wif u oni ...
2    free entry 2 wkly comp win fa cup final tkts 2...
3          u dun say early hor ... u c already say ...
4            nah n't think go usf , life around though
Name: preprocessed_message, dtype: object

In [22]:
Training_data=pd.Series(list(messages['preprocessed_message']))

Training_label=pd.Series(list(messages['label']))

In [26]:
print(len(Training_data))
print(len(Training_label))

5574
5574


<img src='bags_of_words.JPEG'>

In [27]:
#Term Document Matrix
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

tf_vectorizer = CountVectorizer(ngram_range=(1, 2),min_df = (1/len(Training_label)), max_df = 0.7)

Total_Dictionary_TDM = tf_vectorizer.fit(Training_data)

message_data_TDM = Total_Dictionary_TDM.transform(Training_data)

In [28]:
#Term Frequency Inverse Document Frequency (TFIDF)
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2),min_df = (1/len(Training_label)), max_df = 0.7)

Total_Dictionary_TFIDF = tfidf_vectorizer.fit(Training_data)

message_data_TFIDF = Total_Dictionary_TFIDF.transform(Training_data)

In [29]:
#Train and Test Data
from sklearn.cross_validation import train_test_split#Splitting the data for training and testing

train_data,test_data, train_label, test_label = train_test_split(message_data_TDM, Training_label, test_size=.1)



### Decision Tree Classification

In [30]:
from sklearn.tree import DecisionTreeClassifier#Creating a decision classifier model

classifier=DecisionTreeClassifier() #Model training

classifier = classifier.fit(train_data, train_label) #After being fitted, the model can then be used to predict the output.

message_predicted_target = classifier.predict(test_data)

score = classifier.score(test_data, test_label)

print('Decision Tree Classifier : ',score)

('Decision Tree Classifier : ', 0.96953405017921146)


### Stochastic Gradient Descent Classifier

In [31]:
seed=7

from sklearn.linear_model import SGDClassifier

classifier =  SGDClassifier(loss='modified_huber', shuffle=True,random_state=seed)

classifier = classifier.fit(train_data, train_label)

score = classifier.score(test_data, test_label)

print('SGD classifier : ',score)

('SGD classifier : ', 0.9838709677419355)




### Support Vector Machine

In [32]:
from sklearn.svm import SVC

classifier = SVC(kernel="linear", C=0.025,random_state=seed)

classifier = classifier.fit(train_data, train_label)

score = classifier.score(test_data, test_label)

print('SVM Classifier : ',score)

('SVM Classifier : ', 0.98028673835125446)


### Random Forest Classifier

In [39]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=10,random_state=seed)
#classifier = RandomForestClassifier(max_depth=10, n_estimators=25, max_features=60,random_state=seed)

classifier = classifier.fit(train_data, train_label)

score = classifier.score(test_data, test_label)

print('Random Forest Classifier : ',score)

('Random Forest Classifier : ', 0.85483870967741937)


In [40]:
#Cross validation
#Stratified Shuffle Split
seed=7

from sklearn.cross_validation import StratifiedShuffleSplit

#creating cross validation object with 10% test size 

cross_val = StratifiedShuffleSplit(Training_label,1, test_size=0.1,random_state=seed)

In [41]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn import svm

classifiers = [
    DecisionTreeClassifier(),
    SGDClassifier(loss='modified_huber', shuffle=True),
    SVC(kernel="linear", C=0.025),
    KNeighborsClassifier(),
    OneVsRestClassifier(svm.LinearSVC()),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=10),
   ]

for clf in classifiers:
    score=0
    for train_index, test_index in cross_val:
        X_train, X_test = message_data_TDM [train_index], message_data_TDM [test_index]
        y_train, y_test = Training_label[train_index], Training_label[test_index]
        clf.fit(X_train, y_train)
        score=score+clf.score(X_test, y_test)

    print(score)

0.971326164875
0.974910394265
0.969534050179
0.901433691756
0.974910394265
0.865591397849


### Classification Accuracy

In [42]:
from sklearn.metrics import accuracy_score

print('Accuracy Score',accuracy_score(test_label,message_predicted_target))  

classifier = classifier.fit(train_data, train_label)

score=classifier.score(test_data, test_label)

test_label.value_counts()

('Accuracy Score', 0.96953405017921146)


ham     477
spam     81
dtype: int64

### Confusion Matrix

In [44]:
from sklearn.metrics import confusion_matrix

print('Confusion Matrix')
print(confusion_matrix(test_label,message_predicted_target))

Confusion Matrix
[[471   6]
 [ 11  70]]


### Classification Report

In [45]:
from sklearn.metrics import classification_report

target_names = ['spam', 'ham']

print(classification_report(test_label, message_predicted_target, target_names=target_names))

             precision    recall  f1-score   support

       spam       0.98      0.99      0.98       477
        ham       0.92      0.86      0.89        81

avg / total       0.97      0.97      0.97       558



In [46]:
messages['label'].value_counts()

ham     4827
spam     747
Name: label, dtype: int64