In [2]:
import pandas as pd
pd.set_option('display.max_colwidth', -1)

In [3]:
df = pd.read_csv('SMSSpamCollection', 
                 sep='\t', names =['label','sms_message'])

In [4]:
df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [5]:
#Convert the values in the 'label' column to numerical values using map method as follows: {'ham':0, 'spam':1} This maps the 'ham' value to 0 and the 'spam' value to 1.
#Also, to get an idea of the size of the dataset we are dealing with, print out number of rows and columns using 'shape'.

In [6]:
df['label'] = df.label.map({'ham':0, 'spam':1})

In [7]:
print(df.shape)  # returns (rows, columns)

(5572, 2)


In [8]:
df.head()

Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives around here though"


In [9]:
#Implementing Bag of Words in scikit-learn
#Here we will look to create a frequency matrix on a smaller document set to make sure we understand how the 
#document-term matrix generation happens. We have created a sample document set 'documents'.
documents = ['Hello, how are you!',
                'Win money, win from home.',
                'Call me now.',
                'Hello, Call hello you tomorrow?']

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer()

In [11]:
print(count_vector)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)


In [12]:
#Fit your document dataset to the CountVectorizer object you have created using fit(), and 
#get the list of words which have been categorized as features using the get_feature_names() method.
count_vector.fit(documents)
#count_vector.get_feature_names()
count_vector.transform(documents).toarray()

array([[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 2, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1]], dtype=int64)

In [13]:
#Convert the array we obtained, loaded into 'doc_array', into a dataframe and 
#set the column names to the word names(which you computed earlier using get_feature_names(). 
#Call the dataframe 'frequency_matrix'.
frequency_matrix = pd.DataFrame(count_vector.transform(documents).toarray(), 
                                columns=count_vector.get_feature_names())
frequency_matrix

Unnamed: 0,are,call,from,hello,home,how,me,money,now,tomorrow,win,you
0,1,0,0,1,0,1,0,0,0,0,0,1
1,0,0,1,0,1,0,0,1,0,0,2,0
2,0,1,0,0,0,0,1,0,1,0,0,0
3,0,1,0,2,0,0,0,0,0,1,0,1


In [14]:
# split into training and testing sets
# USE from sklearn.model_selection import train_test_split.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['sms_message'], 
                                                    df['label'], 
                                                    random_state=1)

print('Number of rows in the total set: {}'.format(df.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393


In [15]:
#Firstly, we have to fit our training data (X_train) into CountVectorizer() and return the matrix.
#Secondly, we have to transform our testing data (X_test) to return the matrix.
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate the CountVectorizer method
count_vector = CountVectorizer()

# Fit the training data and then return the matrix
training_data = count_vector.fit_transform(X_train)

# Transform testing data and return the matrix. Note we are not fitting the testing data into the CountVectorizer()
testing_data = count_vector.transform(X_test)

In [16]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [17]:
predictions = naive_bayes.predict(testing_data)
predictions

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [18]:
# Import the Bagging, RandomForest, and AdaBoost Classifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier

In [19]:
# Instantiate a BaggingClassifier with:
# 200 weak learners (n_estimators) and everything else as default values
bag_mod = BaggingClassifier(n_estimators=200)


# Instantiate a RandomForestClassifier with:
# 200 weak learners (n_estimators) and everything else as default values
rf_mod = RandomForestClassifier(n_estimators=200)

# Instantiate an a AdaBoostClassifier with:
# With 300 weak learners (n_estimators) and a learning_rate of 0.2
ada_mod = AdaBoostClassifier(n_estimators=300, learning_rate=0.2)

In [20]:
# Fit your BaggingClassifier to the training data
bag_mod.fit(training_data, y_train)

# Fit your RandomForestClassifier to the training data
rf_mod.fit(training_data, y_train)

# Fit your AdaBoostClassifier to the training data
ada_mod.fit(training_data, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.2,
                   n_estimators=300, random_state=None)

In [21]:
# Predict using BaggingClassifier on the test data
bag_preds = bag_mod.predict(testing_data) 

# Predict using RandomForestClassifier on the test data
rf_preds = rf_mod.predict(testing_data)

# Predict using AdaBoostClassifier on the test data
ada_preds = ada_mod.predict(testing_data)

In [33]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [34]:
def print_metrics(y_true, preds, model_name=None):
    '''
    INPUT:
    y_true - the y values that are actually true in the dataset (numpy array or pandas series)
    preds - the predictions for those values from some model (numpy array or pandas series)
    model_name - (str - optional) a name associated with the model if you would like to add it to the print statements 
    
    OUTPUT:
    None - prints the accuracy, precision, recall, and F1 score
    '''
    if model_name == None:
        print('Accuracy score: ', format(accuracy_score(y_true, preds)))
        print('Precision score: ', format(precision_score(y_true, preds)))
        print('Recall score: ', format(recall_score(y_true, preds)))
        print('F1 score: ', format(f1_score(y_true, preds)))
        print('\n\n')
    
    else:
        print('Accuracy score for ' + model_name + ' :' , format(accuracy_score(y_true, preds)))
        print('Precision score ' + model_name + ' :', format(precision_score(y_true, preds)))
        print('Recall score ' + model_name + ' :', format(recall_score(y_true, preds)))
        print('F1 score ' + model_name + ' :', format(f1_score(y_true, preds)))
        print('\n\n')

In [35]:
# Print Bagging scores
print_metrics(y_test, bag_preds, 'bagging')

# Print Random Forest scores
print_metrics(y_test, rf_preds, 'random forest')

# Print AdaBoost scores
print_metrics(y_test, ada_preds, 'adaboost')

# Naive Bayes Classifier scores
print_metrics(y_test, predictions, 'naive bayes')

Accuracy score for bagging : 0.9748743718592965
Precision score bagging : 0.9166666666666666
Recall score bagging : 0.8918918918918919
F1 score bagging : 0.9041095890410958



Accuracy score for random forest : 0.9784637473079684
Precision score random forest : 1.0
Recall score random forest : 0.8378378378378378
F1 score random forest : 0.911764705882353



Accuracy score for adaboost : 0.9770279971284996
Precision score adaboost : 0.9693251533742331
Recall score adaboost : 0.8540540540540541
F1 score adaboost : 0.9080459770114943



Accuracy score for naive bayes : 0.9885139985642498
Precision score naive bayes : 0.9720670391061452
Recall score naive bayes : 0.9405405405405406
F1 score naive bayes : 0.9560439560439562





In [37]:
#Conclusion
#One of the major advantages that Naive Bayes has over other classification algorithms is 
#its ability to handle an extremely large number of features. In our case, each word is treated as a feature 
#and there are thousands of different words. Also, it performs well even with the presence of irrelevant features and 
#is relatively unaffected by them. The other major advantage it has is its relative simplicity. 
#Naive Bayes' works well right out of the box and tuning it's parameters is rarely ever necessary, except usually in cases 
#where the distribution of the data is known. It rarely ever overfits the data. 
#Another important advantage is that its model training and prediction times are very fast for the amount of data it can handle.