-----------

SECTION 1: Importing the modules 
-------------

----------

In [85]:
import os

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import TfidfVectorizer # feature extraction 
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score 


--------------------------------------

SECTION 2: Data collection and Pre-Processing 
-------------

--------------------------------------

In [86]:
# loading data in the form of dataframe
#
# `url` is a local file path to the Excel workbook, not a web URL.
# It defaults to the original author's location; on any other machine set the
# SPAM_DATA_PATH environment variable to the workbook's path instead of editing
# this cell.
url = os.environ.get(
    "SPAM_DATA_PATH",
    "C:/Users/Alpana/Desktop/cipherbyte/spam detector/Spam Email Detection.xlsx",
)
raw_mail_data = pd.read_excel(url)



In [87]:
# Show the frame exactly as loaded, before any cleaning is applied.
print(raw_mail_data)

        v1                                                 v2 Unnamed: 2   
0      ham  Go until jurong point, crazy.. Available only ...        NaN  \
1      ham                      Ok lar... Joking wif u oni...        NaN   
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3      ham  U dun say so early hor... U c already then say...        NaN   
4      ham  Nah I don't think he goes to usf, he lives aro...        NaN   
...    ...                                                ...        ...   
5567  spam  This is the 2nd time we have tried 2 contact u...        NaN   
5568   ham              Will �_ b going to esplanade fr home?        NaN   
5569   ham  Pity, * was in mood for that. So...any other s...        NaN   
5570   ham  The guy did some bitching but I acted like i'd...        NaN   
5571   ham                         Rofl. Its true to its name        NaN   

     Unnamed: 3 Unnamed: 4  
0           NaN        NaN  
1           NaN        NaN  


In [88]:
# Dropping the "Unnamed: N" columns: select every column whose header
# starts with 'Unnamed' and remove them from the frame.
unnamed_mask = raw_mail_data.columns.str.startswith('Unnamed')
raw_mail_data = raw_mail_data.drop(columns=raw_mail_data.columns[unnamed_mask])

In [89]:
# changing column names
# Replace the original headers (v1, v2) with descriptive ones:
# the ham/spam label first, the message text second.
column_name = ["Category(spam/ham)", "Message"]
raw_mail_data.columns = column_name

In [90]:
# Replace missing values with an empty string: cells that hold a value are
# kept as they are, every other cell becomes ''.
not_missing = raw_mail_data.notna()
mail_data = raw_mail_data.where(not_missing, other='')

In [91]:
# Preview the first six cleaned rows.
mail_data.head(6)

Unnamed: 0,Category(spam/ham),Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...


In [92]:
# number of rows and columns in the dataframe, shown as a (rows, columns) tuple
mail_data.shape 

(5572, 2)

----------------------------------


SECTION 3: Label Encoding
--------------



--------------------

In [93]:
# Encode the labels in place: spam mail as 0 ; ham mail as 1.
# The two masks select disjoint rows, so they can be computed up front.
label_col = "Category(spam/ham)"
is_spam = mail_data[label_col] == 'spam'
is_ham = mail_data[label_col] == 'ham'

mail_data.loc[is_spam, label_col] = 0
mail_data.loc[is_ham, label_col] = 1


spam - 0
ham - 1

In [94]:
mail_data.head()

Unnamed: 0,Category(spam/ham),Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [95]:
# Separating the data into text (message) and labels (category):
#   X -> message text, Y -> encoded label (0 = spam, 1 = ham)
X, Y = mail_data["Message"], mail_data["Category(spam/ham)"]


In [96]:
# Inspect the message texts that will be used as model input.
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will �_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [97]:
# Inspect the encoded labels (0 = spam, 1 = ham).
print(Y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category(spam/ham), Length: 5572, dtype: object


--------------------------------------

SECTION 4: Splitting the data into training data and test data
----------------

--------------------------------------

In [98]:
# test_size=0.2 holds out 20% of the rows as test data; the remaining 80% is
# used for training.
# random_state is fixed (3) so the data is shuffled and split the same way on
# every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)


In [99]:
# Report the size of the full data, then the training and test splits.
for part in (X, X_train, X_test):
    print(part.shape)


(5572,)
(4457,)
(1115,)


4457 rows are in training data 
1115 rows are in testing data

--------------------------------------

SECTION 5: Feature Extraction 
--------------------------------------------------------------------------------------------------------------------

--------------------------------------

In [100]:
# Convert the text data into feature vectors that can be fed into the logistic regression.
#
# The TF-IDF vectorizer used below expects every document to be a string.
# Cast BOTH splits: the test split goes through the same vectorizer, so a
# non-string cell (e.g. a numeric message read from the Excel sheet) landing in
# X_test would otherwise fail at transform time even though training succeeded.
X_train = X_train.astype("str")
X_test = X_test.astype("str")

In [101]:
# Step 1: create the TfidfVectorizer and fit it on the training data only
# (fit returns the fitted vectorizer itself).
feature_extraction = TfidfVectorizer(stop_words='english', min_df=1).fit(X_train)

# Step 2: transform the training data using the fitted vectorizer
X_train_features = feature_extraction.transform(X_train)

# Step 3: transform the test data using the same fitted vectorizer
X_test_features = feature_extraction.transform(X_test)

In [102]:
# The labels are still stored with object dtype; convert both splits to
# integers before training and scoring.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')


In [103]:
# Sparse-matrix printout: each entry is (row, feature index) followed by its weight.
print(X_train_features)

  (0, 6580)	0.20305518394534605
  (0, 4497)	0.2910887633154199
  (0, 4279)	0.3893042361045832
  (0, 3962)	0.2411608243124387
  (0, 3375)	0.32207229533730536
  (0, 3369)	0.21816477736422235
  (0, 3251)	0.258880502955985
  (0, 3126)	0.4403035234544808
  (0, 2116)	0.38519642807943744
  (0, 742)	0.32207229533730536
  (1, 7420)	0.35056971070320353
  (1, 6850)	0.4306015894277422
  (1, 6422)	0.5652509076654626
  (1, 6397)	0.4769136859540388
  (1, 4045)	0.380431198316959
  (2, 5806)	0.4917598465723273
  (2, 3899)	0.40088501350982736
  (2, 2220)	0.413484525934624
  (2, 2103)	0.42972812260098503
  (2, 934)	0.4917598465723273
  (3, 7430)	0.5202633571003087
  (3, 6121)	0.49038631686936035
  (3, 1838)	0.37086806414877077
  (3, 1595)	0.5927091854194291
  (4, 5027)	0.450781293477235
  :	:
  (4452, 2116)	0.3092200696489299
  (4453, 7250)	0.5787739591782677
  (4453, 1758)	0.45610005640082985
  (4453, 1000)	0.6760129013031282
  (4454, 7323)	0.31166263834107377
  (4454, 5351)	0.42618909997886
  (4454, 30

--------------------------------------

SECTION 6: Training the model 
-------------------------------------------------------------------------------------------------------------------------------------------------

--------------------------------------

Logistic Regression 

In [104]:
# Logistic regression classifier, created with its default settings (no arguments passed).
model = LogisticRegression()

In [105]:
# Training the model (logistic regression model) with the training data:
# the TF-IDF features are the inputs, the integer labels (0 = spam, 1 = ham) the targets.

model.fit(X_train_features, Y_train)

Evaluating the trained model 

In [106]:
# Predict on the training data: one predicted label for each training message.
prediction_on_training_data = model.predict(X_train_features)

# Share of training messages whose predicted label matches the true label.
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)


In [107]:
# Show the predicted labels for the training messages (0 = spam, 1 = ham).
print("predicting on training data: ", prediction_on_training_data)

predicting on training data:  [1 1 1 ... 1 1 1]


In [108]:
# Accuracy measured on the same data the model was trained on.
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9661207089970832


In [109]:
# Predict on the held-out test data: one predicted label for each test message.
prediction_on_test_data = model.predict(X_test_features)

# Share of test messages whose predicted label matches the true label.
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)


In [110]:
# Accuracy measured on the held-out test split.
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9623318385650225



**BUILDING A PREDICTIVE SYSTEM**

In [111]:
# A single example message, wrapped in a list because the vectorizer expects
# a collection of documents.
input_mail = [
    "I've been searching for the right words to thank you for this breather. "
    "I promise i wont take your help for granted and will fulfil my promise. "
    "You have been wonderful and a blessing at all times."
]

# Convert the text to feature vectors with the vectorizer fitted on the training data.
input_data_features = feature_extraction.transform(input_mail)

# Predict the label for the message and show it.
prediction = model.predict(input_data_features)
print(prediction)

[1]


In [112]:
# Translate the numeric label back into words: 1 means ham, anything else is
# reported as spam.
print("It is Ham mail" if prediction[0] == 1 else "It is a Spam mail")


It is Ham mail
