# SMS SPAM CLASSIFICATION

### Import and read the data using Pandas

In [1]:
import pandas as pd

In [2]:
# Load the tab-separated SMS file. It has no header row, so column names are
# supplied explicitly. `sep` is passed by keyword: pandas 2.0 made every
# read_csv argument after the path keyword-only, so the positional form fails.
data = pd.read_csv('SMSSpamCollection', sep='\t', names=["label", "message"])

### Clean the text: strip non-letters, lowercase, remove stopwords, lemmatize

In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [5]:
# The cleaning step needs the NLTK 'stopwords' and 'wordnet' corpora. Fetch them
# here so the notebook runs from a fresh kernel; nltk.download skips packages
# that are already installed and up to date.
# NOTE(review): some NLTK releases reportedly also need 'omw-1.4' for the
# WordNet lemmatizer - add it here if lemmatize() raises LookupError.
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource, quiet=True)

# Shared lemmatizer instance used by the text-cleaning step.
WL = WordNetLemmatizer()

In [6]:
# Will hold one cleaned, space-joined text string per SMS message.
corpus = []

In [8]:
# Build the stopword set once: it does not change between words, and rebuilding
# it for every token dominated the runtime of this cell.
english_stopwords = set(stopwords.words('english'))


def clean_message(message):
    """Return a cleaned version of one SMS message.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, English stopwords are dropped,
    and each remaining word is passed through the WordNet lemmatizer. The
    surviving words are joined back together with single spaces.
    """
    words = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    kept = (WL.lemmatize(word) for word in words if word not in english_stopwords)
    return ' '.join(kept)


# Rebuild the corpus from scratch (rather than appending to an existing list)
# so that re-running this cell does not duplicate every message.
corpus = [clean_message(message) for message in data['message']]

### Create TF-IDF features from the cleaned corpus

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Cap the vocabulary at 3000 terms to keep the dense feature matrix manageable.
TFV = TfidfVectorizer(max_features=3000)

# NOTE(review): the vectorizer is fit on the whole corpus before the
# train/test split, so vocabulary and IDF weights are learned from test
# messages as well - consider fitting on the training portion only.
tfidf_sparse = TFV.fit_transform(corpus)

# Dense (n_messages, n_terms) array of TF-IDF weights.
X = tfidf_sparse.toarray()

In [58]:
# One indicator column per distinct label value.
label_dummies = pd.get_dummies(data['label'])

# Use the second indicator column as the binary target.
# NOTE(review): presumably columns come out sorted ('ham', 'spam'), making
# column 1 the spam flag - confirm against the actual label values.
Y = label_dummies.iloc[:, 1].values

### Train-Test Split

In [59]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=0)
X_train, X_test, Y_train, Y_test = split_parts

### Use this data to train the Naive Bayes Classifier

In [60]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default hyper-parameters, trained on the
# training split. fit() returns the estimator itself.
nb_model = MultinomialNB()
Spam_Detector = nb_model.fit(X_train, Y_train)

In [61]:
# Predicted class label for every held-out message.
Y_prediction = Spam_Detector.predict(X=X_test)

### Compare using a Confusion Matrix

In [62]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate the true test labels against the model's predictions.
Confusion_mat = confusion_matrix(y_true=Y_test, y_pred=Y_prediction)

In [63]:
# Rows are the true classes and columns the predicted classes
# (scikit-learn's confusion_matrix convention).
Confusion_mat

array([[954,   1],
       [ 23, 137]], dtype=int64)

### Check the Accuracy

In [64]:
from sklearn.metrics import accuracy_score

# Share of held-out messages whose predicted label equals the true label.
Accuracy = accuracy_score(y_true=Y_test, y_pred=Y_prediction)

In [65]:
# Display the test-set accuracy computed above.
Accuracy

0.97847533632287