In [1]:
import pandas as pd

# DATA EXPLORATION

In [2]:
# Read the labelled message dataset from disk and preview its first rows.
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Summarise the messages per class: count, number of unique messages,
# the most frequent message and how often it occurs.
df.groupby(by='Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [4]:
# Encode the label as an integer target: 1 where Category is 'spam', 0 otherwise.
# A vectorised comparison replaces the row-by-row Python lambda; the resulting
# values and int64 dtype are the same.
df['Spam'] = (df['Category'] == 'spam').astype('int64')
df.head()

Unnamed: 0,Category,Message,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


# DATA CLEANSING AND DATA COLLECTION

In [5]:
# Number of rows whose message text is missing.
df['Message'].isna().sum()

0

In [6]:
# Number of rows whose numeric spam label is missing.
df['Spam'].isna().sum()

0

# MODEL TRAINING

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the messages for evaluation.
# random_state pins the shuffle so the split (and every score computed from it)
# is reproducible after a kernel restart. stratify keeps the spam/ham ratio the
# same in both partitions, which matters because spam is the minority class.
x_train, x_test, y_train, y_test = train_test_split(
    df.Message,
    df.Spam,
    test_size=0.25,
    random_state=42,
    stratify=df.Spam,
)

# VECTORISING THE TEXT DATA:
#### Converting each message into a vector of word counts (one feature per
#### unique word) so the classifier can separate spam from non-spam messages.

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training messages only and encode them as a
# sparse document-term matrix of token counts.
v = CountVectorizer()
x_train_count = v.fit_transform(x_train.values)
# Slice the sparse matrix first and densify only the two previewed rows;
# calling toarray() on the full matrix would allocate a dense
# (messages x vocabulary) array just to display two of its rows.
x_train_count[:2].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [9]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial naive Bayes classifier on the training count matrix.
# fit() returns the estimator itself, so construction and fitting are chained;
# the trailing expression keeps the estimator repr as the cell output.
model = MultinomialNB().fit(x_train_count, y_train)
model

MultinomialNB()

# TESTING THE MODEL AND ITS ACCURACY

In [10]:
# Two hand-written messages used as a quick sanity check of the classifier.
emails = [
    'Hey Rahul wanna watch a movie tonight the tickets are on 50% sale',
    "Upto 20% discount on parking,exclusive offer just for you.Don't miss this reward.",
]

# Encode them with the already-fitted vectoriser (transform only, no refit)
# and predict a label for each message.
emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

In [11]:
# Encode the held-out messages with the vocabulary fitted on the training set,
# then score the classifier on the held-out split.
x_test_count = v.transform(x_test)
model.score(X=x_test_count, y=y_test)

0.9806173725771715

In [12]:
from sklearn.pipeline import Pipeline

# Bundle vectorisation and classification into one estimator so raw text can
# be passed directly to fit / score / predict.
clf = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer()),
        ('nb', MultinomialNB()),
    ]
)

In [13]:
# Fit both pipeline steps on the raw training messages and their labels.
clf.fit(X=x_train, y=y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [14]:
# Score the fitted pipeline on the raw held-out messages.
clf.score(X=x_test, y=y_test)

0.9806173725771715

In [15]:
# Classify the sample messages directly from raw text via the pipeline.
clf.predict(X=emails)

array([0, 1], dtype=int64)

In [None]:
# Read one message from the user and classify it with the fitted pipeline.
# NOTE: input() blocks until text is entered, so this cell pauses "Run All".
x=input("Enter text:")
yp=clf.predict([x])
# predict() returns a one-element array for the single message; compare its
# scalar label instead of relying on the truthiness of an array comparison.
if yp[0]==1:
    print("Spam Text")
    # Terminate each logged message with a newline so successive appends do
    # not run together in Spam.txt; a fixed encoding avoids platform-dependent
    # failures on non-ASCII text.
    with open('Spam.txt','a',encoding='utf-8') as file:
        file.write(x+'\n')
else:
    # Not spam: echo the message back unchanged.
    print(x)

In [None]:
# Read another message from the user and classify it with the fitted pipeline.
# NOTE: input() blocks until text is entered, so this cell pauses "Run All".
x=input("Enter text:")
yp=clf.predict([x])
# predict() returns a one-element array for the single message; compare its
# scalar label instead of relying on the truthiness of an array comparison.
if yp[0]==1:
    print("Spam Text")
    # Terminate each logged message with a newline so successive appends do
    # not run together in Spam.txt; a fixed encoding avoids platform-dependent
    # failures on non-ASCII text.
    with open('Spam.txt','a',encoding='utf-8') as file:
        file.write(x+'\n')
else:
    # Not spam: echo the message back unchanged.
    print(x)