A simple NLP model to classify text messages into spam/ham

In [1]:
# Import required libraries
import pandas as pd
import string
from nltk.corpus import stopwords

In [2]:
# Import dataset
data = pd.read_csv('SpamCollection',sep='\t', names=['response','message'])
data.head()

Unnamed: 0,response,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Explore dataset
data.groupby('response').describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
response,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [4]:
# Function to remove punctuations and stopwords
def preprocess_func(mess):
    mess = [char for char in mess if char not in string.punctuation]
    mess = ''.join(mess)
    return [word for word in mess.split() if word not in stopwords.words('english')]

In [5]:
# Check if function is working fine
data['message'].head(5).apply(preprocess_func)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, I, dont, think, goes, usf, lives, around...
Name: message, dtype: object

In [6]:
# Create train and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data['message'], data['response'])

print('Ratio of train to test data: ', len(x_train)/len(x_test))

Ratio of train to test data:  3.0


In [7]:
# Converting text messages to bag of words
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer=preprocess_func).fit(x_train)

training_bag_of_words = vectorizer.transform(x_train)

In [8]:
print(type(training_bag_of_words))
print(training_bag_of_words.shape)

<class 'scipy.sparse.csr.csr_matrix'>
(4179, 9879)


In [10]:
# Convert to tf-idf
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(training_bag_of_words)

training_tfidf = tfidf_transformer.transform(training_bag_of_words)

In [11]:
# Import and use the Naive Bayes model for classification
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB().fit(training_tfidf, y_train)

So, our model is trained now

Now, we have to convert our test data into required format, i.e., tf-idf.
Then, we use the model on this data for classification

In [12]:
# Convert test data to required format, i.e., tf-idf
test_bag_of_words = vectorizer.transform(x_test)
test_message_tfidf = tfidf_transformer.transform(test_bag_of_words)

In [13]:
# Predict the test data using the trained model
predicted_response = classifier.predict(test_message_tfidf)

In [14]:
print(type(predicted_response))
print(predicted_response.shape)
print(predicted_response)

<class 'numpy.ndarray'>
(1393,)
['ham' 'ham' 'ham' ... 'ham' 'ham' 'ham']


So, we have made our predictions using our trained model. 
Now let's check how well our model performed

In [15]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# Let's check model accuracy
accuracy = accuracy_score(y_test, predicted_response)
print('accuracy: ', accuracy)
print()

# Let's check the confusion matrix
cm = confusion_matrix(y_test,predicted_response)
print('Confusion matrix: \n', cm)
print()

# Let's print the f1-score
f1 = f1_score(y_test,predicted_response, pos_label='spam')
print('f1_score:', f1)

accuracy:  0.95908111988514

Confusion matrix: 
 [[1218    0]
 [  57  118]]

f1_score: 0.8054607508532422
