In [185]:

import pandas as pd  # for importing the dataset from file
from sklearn.model_selection import train_test_split  # for splitting data into training and testing sets
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # for evaluating the classification models
import numpy as np

In [193]:
df = pd.read_csv("emails.csv") # Reading the contents of the "emails.csv" and storing it in a DataFrame named 'df'
df.head()  # Displaying the first few rows of the DataFrame 'df' to check the data is right and see what kind of data we are dealing with

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [195]:
print("Rows: ", df.shape[0]) # Print the dimensions of the DataFrame
print("Columns: ", df.shape[1])

Rows:  5172
Columns:  3002


In [187]:
df.isnull().sum()  # Calculate the sum of missing/null values in each column of the DataFrame 'df'

In [194]:
df.describe()  # Generating descriptive statistics of the DataFrame 'df' like count, mean, standard deviation, etc.

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
count,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,...,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0,5172.0
mean,6.640565,6.188128,5.143852,3.075599,3.12471,2.62703,55.517401,2.466551,2.024362,10.600155,...,0.005027,0.012568,0.010634,0.098028,0.004254,0.006574,0.00406,0.914733,0.006961,0.290023
std,11.745009,9.534576,14.101142,6.04597,4.680522,6.229845,87.574172,4.314444,6.967878,19.281892,...,0.105788,0.199682,0.116693,0.569532,0.096252,0.138908,0.072145,2.780203,0.098086,0.453817
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,1.0,0.0,1.0,0.0,12.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,3.0,1.0,1.0,2.0,1.0,28.0,1.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,8.0,7.0,4.0,3.0,4.0,2.0,62.25,3.0,1.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
max,210.0,132.0,344.0,89.0,47.0,77.0,1898.0,70.0,167.0,223.0,...,4.0,7.0,2.0,12.0,3.0,4.0,3.0,114.0,4.0,1.0


In [189]:
X = df.iloc[:, 1:3001]  # Creating a new DataFrame 'X' by selecting columns 1 to 3000 both included from the DataFrame 'df'
Y = df.iloc[:, -1].values  # Creating a new variable 'Y' by extracting the values from the last column of the DataFrame 'df'
# This is because in many machine learning tasks, the last column is often the target or dependent variable that is wanted to predict or analyze.


In [190]:
train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.25)  # Splitting the data into training and testing sets
#this means we are going to use 75% of the data for training and 25% for testing

In [191]:
class perceptron:

    def __init__(self, learning_rate=0.01, n_iters=1000):
        self.lr = learning_rate  # Learning rate for weight updates
        self.n_iters = n_iters  # Number of iterations for training
        self.activation_func = unit_step_func  # Activation function (assumed to be defined elsewhere)
        self.weights = None  # Weights for each feature
        self.bias = None  # Bias term


    def fit(self, X, y):
        n_samples, n_features = X.shape

        # Initialize parameters
        self.weights = np.zeros(n_features)  # Initialize weights as zeros
        self.bias = 0  # Initialize bias as zero

        y_ = np.where(y > 0 , 1, 0)  # Transform target labels to binary values

        # Learn weights
        for _ in range(self.n_iters):
            for idx, x_i in enumerate(X):
                linear_output = np.dot(x_i, self.weights) + self.bias  # Calculate linear output
                y_predicted = self.activation_func(linear_output)  # Apply activation function

                # Perceptron update rule
                update = self.lr * (y_[idx] - y_predicted)  # Calculate update value
                self.weights += update * x_i  # Update weights
                self.bias += update  # Update bias


    def predict(self, X):
        linear_output = np.dot(X, self.weights) + self.bias  # Calculate linear output
        y_predicted = self.activation_func(linear_output)  # Apply activation function
        return y_predicted

In [192]:
perceptron = Perceptron(learning_rate=0.01, n_iters=1)  # Creating a Perceptron classifier
perceptron.fit(train_x, train_y)  # Training the classifier using the training data
y_pred_perceptron = perceptron.predict(test_x)  # Making predictions on the test data using the trained classifier
accuracy_perceptron = accuracy_score(test_y, y_pred_perceptron)  # Calculating the accuracy score
precision_perceptron = precision_score(test_y, y_pred_perceptron)  # Calculating the accuracy score
recall_perceptron = recall_score(test_y, y_pred_perceptron)  # Calculating the accuracy score
f1_perceptron = f1_score(test_y, y_pred_perceptron)  # Calculating the accuracy score
print(f"The Accuracy for Perceptron Classifier is {accuracy_perceptron}.")
print(f"The Precision for Perceptron Classifier is {precision_perceptron}.")
print(f"The Recall for Perceptron Classifier is {recall_perceptron}.")
print(f"The F1 Score for Perceptron Classifier is {f1_perceptron}.")

The Accuracy for Perceptron Classifier is 0.8669760247486465.
The Precision for Perceptron Classifier is 0.9310344827586207.
The Recall for Perceptron Classifier is 0.5806451612903226.
The F1 Score for Perceptron Classifier is 0.7152317880794703.
