In [79]:
# importing required libraries 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [78]:
# Load the messages dataset; the relative path is resolved against the current working directory.
mail_data = pd.read_csv('mail_data.csv')

In [61]:
# Show the loaded frame as the cell's output (the rendered table is abbreviated with "..." rows).
mail_data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [62]:
# Count the missing entries in every column (isna is the canonical spelling of isnull).
mail_data.isna().sum()

Category    0
Message     0
dtype: int64

In [63]:
# Pick the model input (message text) and the target (category) columns.
X, y = mail_data['Message'], mail_data['Category']

In [64]:
# label spam - 0, ham - 1
#
# Build the encoded column in one vectorized step and then bind `y` to it
# explicitly. `y` was taken from mail_data['Category'] in an earlier cell, and
# writing into the frame afterwards only reaches that Series while pandas
# hands out views; with Copy-on-Write enabled `y` would silently keep the
# 'ham'/'spam' strings and the later integer cast would fail.
#
# The integer keys make the cell idempotent: re-running it on an already
# encoded column leaves the 0/1 values untouched instead of turning them
# into NaN.
label_codes = {'spam': 0, 'ham': 1, 0: 0, 1: 1}
encoded_category = mail_data['Category'].map(label_codes)

# Any value outside the mapping becomes NaN; fail here, before the frame is
# modified, rather than later during model training.
assert encoded_category.notna().all(), "unexpected value in the 'Category' column"

mail_data['Category'] = encoded_category.astype('int')
y = mail_data['Category']

In [65]:
# Print the messages followed by their labels, one Series after the other.
print(X, y, sep='\n')

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object
0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


In [66]:
# Hold out 20% of the messages for testing; the fixed random_state makes
# the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [67]:
# Shapes of the full, training and test message sets, in that order.
print(*(messages.shape for messages in (X, X_train, X_test)))

(5572,) (4457,) (1115,)


In [68]:
# Turn the raw message text into TF-IDF feature vectors.
vectorizer = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Fit on the training messages only, then apply the same fitted mapping to
# the held-out messages.
X_train_vector = vectorizer.fit_transform(X_train)
X_test_vector = vectorizer.transform(X_test)

# Cast both label series to integers.
y_train, y_test = (labels.astype('int') for labels in (y_train, y_test))

In [69]:
# Inspect the raw training messages; the printed index shows the original row labels kept by the split.
print(X_train)

1642    Hi , where are you? We're at  and they're not ...
2899          If you r @ home then come down within 5 min
480     When're you guys getting back? G said you were...
3485    Tell my  bad character which u Dnt lik in me. ...
157                           I'm leaving my house now...
                              ...                        
905     We're all getting worried over here, derek and...
5192    Oh oh... Den muz change plan liao... Go back h...
3980    CERI U REBEL! SWEET DREAMZ ME LITTLE BUDDY!! C...
235     Text & meet someone sexy today. U can find a d...
5157                              K k:) sms chat with me.
Name: Message, Length: 4457, dtype: object


In [70]:
# Inspect the vectorized training data: each printed entry is a (row, column) pair followed by its weight.
print(X_train_vector)

  (0, 4346)	0.3238008504874723
  (0, 2353)	0.2545072203817634
  (0, 6695)	0.3616065178053154
  (0, 5895)	0.415102954745324
  (0, 2719)	0.299459437576315
  (0, 3822)	0.37729728742748087
  (0, 3789)	0.4750235197588447
  (0, 3321)	0.2638802854739516
  (1, 4343)	0.6555659308129219
  (1, 1858)	0.5163195438969705
  (1, 3365)	0.5510421389942982
  (2, 4267)	0.531599749449541
  (2, 6215)	0.43979370278404856
  (2, 6597)	0.40097414833733686
  (2, 5672)	0.32606636481997364
  (2, 3025)	0.3502912545366897
  (2, 3185)	0.3663054742561573
  (3, 4447)	0.18080236341909536
  (3, 3084)	0.14346439189216004
  (3, 2877)	0.30030357190007717
  (3, 5515)	0.16439483489485024
  (3, 6922)	0.13381964389308706
  (3, 7080)	0.19700844583868773
  (3, 5535)	0.30030357190007717
  (3, 7398)	0.19877707762085306
  :	:
  (4454, 397)	0.31848634658760416
  (4454, 4027)	0.2561192223695296
  (4454, 6409)	0.2511086901671169
  (4454, 7382)	0.23350338191116915
  (4454, 4578)	0.28626353932821713
  (4455, 6600)	0.3164025961524856
  (4

In [71]:
# model training - logistic regression
# No arguments are passed, so the estimator is created with its default settings.
model = LogisticRegression()

In [72]:
# Fit the classifier on the vectorized training messages and their integer labels (0 = spam, 1 = ham).
model.fit(X_train_vector, y_train)

In [73]:
# Model evaluation: share of correctly classified messages.

# Accuracy on the data the model was fitted on.
training_data_prediction = model.predict(X_train_vector)
training_data_accuracy = accuracy_score(
    y_true=y_train,
    y_pred=training_data_prediction,
)

# Accuracy on the held-out messages.
test_data_prediction = model.predict(X_test_vector)
test_data_accuracy = accuracy_score(
    y_true=y_test,
    y_pred=test_data_prediction,
)

In [74]:
# Report both accuracy figures, training first and then test.
for accuracy_label, accuracy_value in (
    ('Accuracy on the training data is: ', training_data_accuracy),
    ('Accuracy on the test data is: ', test_data_accuracy),
):
    print(accuracy_label, accuracy_value)

Accuracy on the training data is:  0.9681400044873233
Accuracy on the test data is:  0.9704035874439462


In [75]:
# Classify one unseen message as spam or ham.

input_mail = ["As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune"]

# Reuse the already-fitted vectorizer so the features match the ones the
# model was trained on.
input_data_vector = vectorizer.transform(input_mail)

prediction = model.predict(input_data_vector)
print(prediction)

# Label encoding used in this notebook: 1 means ham, anything else is spam.
print('Ham mail' if prediction[0] == 1 else 'Spam mail')

[1]
Ham mail


In [77]:
# Classify a second unseen message as spam or ham.

input_mail = ["Want 2 get laid tonight? Want real Dogging locations sent direct 2 ur mob? Join the UK's largest Dogging Network bt Txting GRAVEL to 69888! Nt. ec2a. 31p.msg@150p"]

# Vectorize with the fitted vectorizer (transform only, no refitting).
input_data_vector = vectorizer.transform(input_mail)

prediction = model.predict(input_data_vector)
print(prediction)

# 1 encodes ham; any other predicted label is reported as spam.
if prediction[0] != 1:
    print('Spam mail')
else:
    print('Ham mail')

[0]
Spam mail
