In [61]:
import pandas as pd
import sklearn
import itertools
import numpy as np
import seaborn as sb
import re
import nltk
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from matplotlib import pyplot as plt
from sklearn.linear_model import PassiveAggressiveClassifier
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [62]:
df = pd.read_csv('spam.csv')


In [63]:
df.shape

(5572, 2)

In [64]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [65]:
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [67]:
df.loc[df['Category'] == 'spam' , 'Category',] = 0

In [68]:
df.loc[df['Category'] == 'ham' , 'Category',] = 1

In [69]:
df

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will ü b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


In [70]:
X = df.Message

In [71]:
label_train = df.Category

In [72]:
y = df.Category

In [73]:
train_df = df.drop("Category", axis = 1)

In [74]:
lemmatizer = WordNetLemmatizer()
stpwrds = list(stopwords.words('english'))

In [75]:
stpwrds

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [76]:
for x in range(len(df)) :
    corpus = []
    review = df['Message'][x]
    review = re.sub(r'[^a-zA-Z\s]', '', review)
    review = review.lower()
    review = nltk.word_tokenize(review)
    for y in review :
        if y not in stpwrds :
            corpus.append(lemmatizer.lemmatize(y))
    review = ' '.join(corpus)
    df['Message'][x] = review   

In [77]:
train_df['Message'][2182]

'Ok.'

In [78]:
X_train, X_test, Y_train, Y_test = train_test_split(train_df['Message'], label_train, test_size=0.2, random_state=0)

In [79]:
X_train


1114    No I'm good for the movie, is it ok if I leave...
3589    If you were/are free i can give. Otherwise nal...
3095    Have you emigrated or something? Ok maybe 5.30...
1012          I just got home babe, are you still awake ?
3320                      Kay... Since we are out already
                              ...                        
4931    Hi, the SEXYCHAT girls are waiting for you to ...
3264                              So u gonna get deus ex?
1653    For ur chance to win a £250 cash every wk TXT:...
2607    R U &SAM P IN EACHOTHER. IF WE MEET WE CAN GO ...
2732    Mm feeling sleepy. today itself i shall get th...
Name: Message, Length: 4457, dtype: object

In [80]:


tfidf_v = TfidfVectorizer()
tfidf_X_train = tfidf_v.fit_transform(X_train)
tfidf_X_test = tfidf_v.transform(X_test)

In [81]:
print(tfidf_X_train,tfidf_X_test)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 59003 stored elements and shape (4457, 7788)>
  Coords	Values
  (0, 3529)	0.5175925398749917
  (0, 957)	0.29476022488012754
  (0, 3661)	0.17431982768913998
  (0, 4084)	0.3262953610451736
  (0, 3615)	0.22585328777532282
  (0, 4932)	0.23690832706203474
  (0, 3781)	0.19041173246070994
  (0, 3769)	0.18042801578406606
  (0, 4623)	0.39784723610540723
  (0, 6861)	0.15947646969493862
  (0, 2943)	0.1899305167946923
  (0, 3196)	0.24822101298759333
  (0, 4809)	0.22585328777532282
  (1, 3977)	0.3744245569147255
  (1, 4716)	0.3744245569147255
  (1, 2603)	0.3744245569147255
  (1, 816)	0.3744245569147255
  (1, 4696)	0.356993081574047
  (1, 5021)	0.30523271593524504
  (1, 3158)	0.21583480417583248
  (1, 1625)	0.15426678446395914
  (1, 2990)	0.1800269515465103
  (1, 1048)	0.15226099525528095
  (1, 7502)	0.23779586589483032
  (1, 7751)	0.09694093885116822
  :	:
  (4454, 1671)	0.18089314399986517
  (4454, 7564)	0.1877616697997559
  (4454, 1715

In [82]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [83]:
classifier = PassiveAggressiveClassifier()
classifier.fit(tfidf_X_train,Y_train)

ValueError: Unknown label type: (array([0, 1], dtype=object),)

In [21]:
acuracyt

0.967713004484305

In [22]:
import joblib


In [23]:
joblib.dump(model, 'model.pkl')

['model.pkl']

In [24]:
X_test_vec.shape

(1115, 7440)

In [25]:
X_train_vec.shape

(4457, 7440)

In [26]:
X_train.shape

(4457,)

In [27]:
X_test.shape

(1115,)