In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
# Load the raw dataset. The file is decoded as ISO-8859-1 instead of pandas'
# default UTF-8 (presumably because the CSV is not valid UTF-8 -- confirm).
df = pd.read_csv(
    "spam.csv",
    encoding="ISO-8859-1",
)

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
# Count missing values per column to decide which columns are worth keeping.
df.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [6]:
# Remove the three "Unnamed" columns, which hold thousands of missing values
# each (v1 and v2 have none). The frame is modified in place.
df.drop(
    columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"],
    inplace=True,
)

In [7]:
# Check the frame after dropping the unnamed columns.
df.head(n=5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
# Give the two remaining columns descriptive names: v1 holds the ham/spam
# label and v2 holds the message text. The frame is modified in place.
df.rename(
    columns={
        "v1": "Status",
        "v2": "Email",
    },
    inplace=True,
)

In [11]:
# Encode the text labels as numbers in place: ham -> 1, spam -> 0.
# Each pair is applied in turn with a boolean mask on the Status column.
for label, code in (("ham", 1), ("spam", 0)):
    df.loc[df["Status"] == label, "Status"] = code

In [12]:
# Check that Status now holds the numeric codes and the columns are renamed.
df.head(n=5)

Unnamed: 0,Status,Email
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
# Split the frame into model input (message text) and target (numeric label).
X, y = df["Email"], df["Status"]

In [14]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [15]:
# TF-IDF vectorizer that turns raw message text into numeric feature vectors.
feature_extraction = TfidfVectorizer(
    min_df=1,              # keep any term that occurs in at least one document
    stop_words="english",  # drop scikit-learn's built-in English stop words
    lowercase=True,        # lowercase text before tokenizing
)

In [16]:
# Learn the vocabulary and IDF weights from the training messages only, and
# vectorize them in the same step.
X_train_features = feature_extraction.fit_transform(X_train)
# Re-use the fitted vectorizer on the test messages (transform only, no
# re-fitting) so no information from the test set leaks into the features.
X_test_features = feature_extraction.transform(X_test)

In [17]:
# The numeric codes were written into a column that originally held strings,
# so the series presumably still has object dtype; cast both targets to an
# integer dtype before fitting and scoring.
y_train, y_test = y_train.astype("int"), y_test.astype("int")

In [18]:
# Logistic-regression classifier using scikit-learn's default hyperparameters.
model = LogisticRegression()

In [19]:
# Train the classifier on the TF-IDF features of the training messages.
model.fit(X=X_train_features, y=y_train)

In [20]:
# Predict on the same rows the model was fitted on to measure training accuracy.
prediction_on_training_data = model.predict(X_train_features)
# accuracy_score's signature is (y_true, y_pred): ground truth first, then the
# predictions. Accuracy is symmetric, so the value is unchanged, but following
# the documented order keeps the call correct if the metric is later swapped
# for an asymmetric one (e.g. precision or recall).
accuracy_on_training_data = accuracy_score(y_train, prediction_on_training_data)

In [22]:
# Report the fraction of training messages classified correctly.
print("accuracy on training data: ", accuracy_on_training_data)

accuracy on training data:  0.9659791052772569


In [23]:
# Predict on the held-out messages to estimate accuracy on unseen data.
prediction_on_test_data = model.predict(X_test_features)
# accuracy_score's signature is (y_true, y_pred): ground truth first, then the
# predictions. Accuracy is symmetric, so the value is unchanged, but following
# the documented order keeps the call correct if the metric is later swapped
# for an asymmetric one (e.g. precision or recall).
accuracy_on_test_data = accuracy_score(y_test, prediction_on_test_data)

In [24]:
# Report the fraction of held-out messages classified correctly.
print("accuracy on test data: ", accuracy_on_test_data)

accuracy on test data:  0.9521479064709081


#### Building a predictive system

In [27]:
# One hand-written message to classify, wrapped in a list because the
# vectorizer is called with a collection of documents.
input_email = ["You just won money contact me to claim it"]
# Convert the text to TF-IDF features with the vectorizer fitted on the training data.
fetured_text = feature_extraction.transform(input_email)

# Predict the label for the message (1 = ham, 0 = spam, per the Status encoding).
prediction = model.predict(fetured_text)

print(prediction)

[0]


In [28]:
# Translate the numeric prediction into a readable verdict:
# 1 was assigned to ham when the labels were encoded; anything else is reported as spam.
print("Ham mail" if prediction[0] == 1 else "Spam mail")

Spam mail
