In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,classification_report

In [7]:
# Read the e-mail dataset from the CSV file in the working directory.
data = pd.read_csv("emails.csv")

# Last expression of the cell: render the loaded frame.
data

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,Email 5168,2,2,2,3,0,0,32,0,0,...,0,0,0,0,0,0,0,0,0,0
5168,Email 5169,35,27,11,2,6,5,151,4,3,...,0,0,0,0,0,0,0,1,0,0
5169,Email 5170,0,0,1,1,0,0,11,0,0,...,0,0,0,0,0,0,0,0,0,1
5170,Email 5171,2,7,1,0,2,1,28,2,0,...,0,0,0,0,0,0,0,1,0,1


In [41]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
data.isna().sum()

Email No.     0
the           0
to            0
ect           0
and           0
             ..
military      0
allowing      0
ff            0
dry           0
Prediction    0
Length: 3002, dtype: int64

In [12]:
# Features: every column except the e-mail identifier and the target.
# `columns=` already selects the axis, so no separate `axis` argument is needed.
x = data.drop(columns=["Email No.", "Prediction"])

# Target: the "Prediction" column.
y = data["Prediction"]

# Show the target as the cell's last expression so that re-running the cell
# reproduces the Series output recorded for it.
y

0       0
1       0
2       0
3       0
4       0
       ..
5167    0
5168    0
5169    1
5170    1
5171    0
Name: Prediction, Length: 5172, dtype: int64

In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
'''
    KNeighborsClassifier in Scikit-Learn is a machine learning algorithm based on the K-Nearest Neighbors (KNN) method, 
    which classifies a data point by looking at its “k” nearest data points in the feature space. The classifier assigns 
    the most common class among these neighbors to the new point. It’s useful for classification tasks with non-linear decision 
    boundaries and supports both Euclidean and other distance metrics.

    Functions:

    .fit(X, y): Trains the classifier with features X and labels y.
    .predict(X): Predicts the class labels for new data X.

    Main Parameters:

    n_neighbors: Number of neighbors to consider (default: 5).
    weights: Determines neighbor influence, options are 'uniform' or 'distance'.
    metric: Distance metric for neighbors (default: 'minkowski' with p=2, which is equivalent to Euclidean distance).
    algorithm: Search algorithm for neighbors (e.g., 'auto', 'ball_tree', 'kd_tree', 'brute').
'''

In [32]:
# k-nearest-neighbours classifier that votes over the 5 closest training samples.
knn_model = KNeighborsClassifier(n_neighbors=5)

# Use plain ndarrays for BOTH fitting and prediction. Fitting on a DataFrame and then
# predicting on an ndarray makes scikit-learn warn that X does not have the feature
# names seen during fit; keeping the two inputs the same kind avoids that warning
# without changing the predictions.
knn_model.fit(np.asarray(xtrain), ytrain)
knn_predictions = knn_model.predict(np.asarray(xtest))

# Overall fraction of correct predictions on the held-out split.
knn_accuracy = accuracy_score(ytest, knn_predictions)

# output_dict=True returns a nested dict, which can be turned into a DataFrame for display.
knn_report = classification_report(ytest, knn_predictions, output_dict=True)



In [33]:
# Tabulate the KNN classification report (metrics as rows, classes/averages as columns).
pd.DataFrame(data=knn_report)

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.930736,0.725146,0.862802,0.827941,0.871939
recall,0.872801,0.837838,0.862802,0.855319,0.862802
f1-score,0.900838,0.777429,0.862802,0.839134,0.865544
support,739.0,296.0,0.862802,1035.0,1035.0


In [None]:
'''
    SVC (Support Vector Classifier) in Scikit-Learn is a popular implementation of Support Vector Machine (SVM) for 
    classification tasks. SVM finds a hyperplane in the feature space that best separates classes, maximizing the margin between
    support vectors (the closest points to the hyperplane from each class). SVC is effective for binary and multiclass 
    classification and is known for its robustness, especially with high-dimensional data.

    Key Functions:

    .fit(X, y): Trains the model with input X and labels y.
    .predict(X): Predicts class labels for new data X.
    Main Parameters:

    C: Regularization parameter that controls the trade-off between achieving a low training error and a low testing error.
    Higher values penalize misclassification more strongly, resulting in fewer misclassified training examples (at the risk
    of overfitting).
    kernel: Specifies the kernel type ('linear', 'poly', 'rbf', 'sigmoid') to transform data. 'rbf' is the default and the most
    commonly used for non-linear problems.
    gamma: Kernel coefficient for ‘rbf’, ‘poly’, and ‘sigmoid’. Controls the influence of individual training samples; a high 
    gamma makes the decision boundary closer to the data points.
    degree: Degree of the polynomial kernel function (‘poly’).
'''

In [37]:
# Support-vector classifier with scikit-learn's default hyperparameters;
# fit() returns the estimator itself, so construction and training can be chained.
svm_model = SVC().fit(xtrain, ytrain)

# Evaluate on the held-out split.
svm_predictions = svm_model.predict(xtest)
svm_accuracy = accuracy_score(ytest, svm_predictions)
svm_report = classification_report(ytest, svm_predictions, output_dict=True)

# Last expression of the cell: show the report as a table.
pd.DataFrame(data=svm_report)

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.804204,0.908397,0.817391,0.8563,0.834002
recall,0.983762,0.402027,0.817391,0.692894,0.817391
f1-score,0.884967,0.557377,0.817391,0.721172,0.791279
support,739.0,296.0,0.817391,1035.0,1035.0


In [None]:
'''
accuracy_score: Measures the fraction of correctly predicted labels out of the total labels.
Formula: (Number of Correct Predictions) / (Total Predictions)

classification_report: Provides a detailed report of multiple classification metrics, including precision, recall, F1-score, and
support (the number of true instances for each label). Useful for understanding model performance on a per-class basis, especially
with imbalanced datasets.
Without output_dict=True: A formatted string report.
With output_dict=True: A dictionary that can be converted to a DataFrame.

'''