1. [Loading and preparing data](#1)
1. [Stemming and Training](#2)
1. [Build The Model](#3)
1. [Test The Model](#4)
1. [Model Evaluation](#5)

<a id='1'></a>
# 1. Loading and Preparing Data

In [1]:
import numpy as np
import pandas as pd
import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Silence every Python warning for the rest of the notebook session.
import warnings
warnings.filterwarnings('ignore')

/kaggle/input/spam-text-message-classification/SPAM text message 20170820 - Data.csv


In [2]:
# Read the spam text-message CSV into a DataFrame and preview its first rows.
DATA_PATH = '/kaggle/input/spam-text-message-classification/SPAM text message 20170820 - Data.csv'
dataset = pd.read_csv(DATA_PATH)
dataset.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
import re
import nltk
# Fetch the NLTK 'stopwords' corpus used later to filter tokens out of the messages.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
from nltk.corpus import stopwords

In [5]:
from nltk.stem.porter import PorterStemmer
# Single Porter stemmer instance, reused for every token when the corpus is built.
ps=PorterStemmer()

In [6]:
# Show NLTK's English stop-word list; these are the words dropped from each message during preprocessing.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [7]:
# Build the cleaned corpus: one normalized string per message.
# Each message has its non-alphanumeric characters replaced by spaces, is
# lower-cased and split on whitespace; English stop words are dropped and the
# remaining tokens are Porter-stemmed and re-joined with single spaces.

# Load the stop-word list once, outside the loop: reading it per token is
# loop-invariant work that dominates the runtime, and a set makes each
# membership test O(1) instead of a linear scan.
english_stopwords = set(stopwords.words('english'))
non_alphanumeric = re.compile('[^a-zA-Z0-9]')

corpus = []
# Iterate over the column values directly; dataset['Message'][i] is a label
# lookup and would fail if the index were not the default 0..n-1 range.
for message in dataset['Message']:
    tokens = non_alphanumeric.sub(' ', message).lower().split()
    stemmed = [ps.stem(token) for token in tokens if token not in english_stopwords]
    corpus.append(' '.join(stemmed))

In [8]:
# Encode the target from the 'Category' column as dummy indicators, dropping the
# first category so that a two-class label collapses to a single 0/1 column.
# dtype=int keeps the labels as 0/1 on pandas versions whose dummies default to
# bool, and squeeze() turns the single-column frame into the 1-D target that
# scikit-learn estimators expect (a frame with several columns is left as is).
y = pd.get_dummies(dataset['Category'], drop_first=True, dtype=int).squeeze(axis='columns')

<a id = "3"></a><br>
# 3. Build the Model

In [9]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the messages for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    corpus,
    y,
    test_size=0.20,
    random_state=42,
)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features over unigrams and bigrams, capped at 2500 vocabulary entries.
# The vectorizer is fitted on the training messages only.
cv = CountVectorizer(ngram_range=(1, 2), max_features=2500)
train_counts = cv.fit_transform(X_train)
# Replace the raw training strings with their dense count matrix.
X_train = train_counts.toarray()

<a id = "4"></a><br>
# 4. Test the Model

In [11]:
# Vectorize the held-out messages with the vocabulary already fitted on the training set.
test_counts = cv.transform(X_test)
X_test = test_counts.toarray()

In [12]:
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest on the bag-of-words training features.
# A fixed random_state makes the forest's randomness, and therefore the metrics
# reported below, reproducible across kernel restarts; 42 matches the seed used
# for the train/test split.
classifier = RandomForestClassifier(random_state=42).fit(X_train, y_train)

In [13]:
# Predict a label for every held-out message using the fitted classifier.
y_pred=classifier.predict(X_test)

In [14]:
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

<a id='5'></a>
# 5. Model Evaluation

In [15]:
# Report test-set accuracy as a whole-number percentage.
# The '%' format spec scales and rounds in one step, avoiding the float
# artefacts of multiplying an already-rounded value by 100
# (e.g. round(0.57, 2) * 100 == 56.99999999999999), and it places the
# percent sign after the number.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy score is: {accuracy:.0%}")

Accuracy score is: %98.0


In [16]:
# Confusion matrix of the true test labels (y_test) against the predictions (y_pred).
confusion_matrix(y_test,y_pred)

array([[966,   0],
       [ 23, 126]])

In [17]:
# Per-class precision, recall, F1 and support on the test set, printed as a text table.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.85      0.92       149

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

