### Import libraries

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.utils import resample
from tqdm.notebook import tqdm
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and data-conversion warnings
# emitted by the libraries imported above.
warnings.filterwarnings("ignore")

In [None]:
# Read the e-mail data set from the Kaggle input directory into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/email-spam-classification-dataset-csv/emails.csv",
)

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# (rows, columns) of the raw data.
df.shape

In [None]:
# Summary statistics of the numeric columns, transposed so each column of the
# data becomes one row of the summary.
df.describe().transpose()

### Without upsampling

In [None]:
# Remove the "Email No." column so it is not part of the data used below.
df = df.drop(columns=["Email No."])

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Show how many rows carry each "Prediction" value.
# `sns.distplot` is deprecated and draws a histogram/KDE, which is a poor fit
# for a discrete label; a count plot draws one bar per label value instead.
sns.countplot(x=df["Prediction"])
plt.show()

In [None]:
# Features: every column except the label.
x = df.drop(columns="Prediction")
# Target as a 1-D Series. A single-column DataFrame (df[["Prediction"]]) makes
# scikit-learn estimators emit a DataConversionWarning about a column-vector y.
y = df["Prediction"]

In [None]:
# Hold out 20% of the rows for evaluation.
# random_state makes the split (and every score derived from it) reproducible;
# stratify keeps the label proportions the same in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

### KNN with elbow plot

In [None]:
# Candidate neighbour counts: the odd numbers from 1 through 29.
k_values = list(range(1, 30, 2))
# One accuracy score per candidate will be collected here.
accuracy_values = list()

In [None]:
# Fit one KNN classifier per candidate k and record its accuracy on the test split.
# The list is rebuilt here so that re-running this cell cannot append a second
# set of scores and break the one-to-one pairing with k_values.
# NOTE(review): k is scored on the same x_test/y_test used for the final report,
# so the reported metrics are optimistic; consider a separate validation split.
accuracy_values = []
for k in tqdm(k_values):
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    accuracy_values.append(accuracy)

In [None]:
# Display the recorded accuracy scores (one per entry of k_values).
accuracy_values

In [None]:
# Elbow plot: accuracy (y axis) against the number of neighbours k (x axis).
px.line(x=k_values, y=accuracy_values)

In [None]:
# Pick the k with the highest recorded accuracy.
# max() returns the first winner on ties, and the (-1, -1) default covers the
# case where no score has been recorded yet.
optimal_k, optimal_accuracy = max(
    zip(k_values, accuracy_values),
    key=lambda pair: pair[1],
    default=(-1, -1),
)

In [None]:
# KNN classifier configured with the best-scoring neighbour count.
knn_model = KNeighborsClassifier(n_neighbors=optimal_k)

In [None]:
# Train the KNN classifier on the training split.
knn_model.fit(x_train, y_train)

In [None]:
# Predict labels for the held-out rows.
y_pred = knn_model.predict(x_test)

In [None]:
# Per-class precision, recall and F1 of the KNN predictions on the test split.
print(metrics.classification_report(y_test, y_pred))

### SVM

In [None]:
# Support-vector classifier with scikit-learn's default settings.
svm_model = SVC()

In [None]:
# Train the SVM on the training split.
svm_model.fit(x_train, y_train)

In [None]:
# Predict labels for the held-out rows (overwrites the earlier y_pred).
y_pred = svm_model.predict(x_test)

In [None]:
# Per-class precision, recall and F1 of the SVM predictions on the test split.
print(metrics.classification_report(y_test, y_pred))

### With upsampling

In [None]:
# Partition the rows by label: Prediction == 1 goes to spam_data,
# Prediction == 0 goes to ham_data.
spam_data = df.loc[df["Prediction"].eq(1)]
ham_data = df.loc[df["Prediction"].eq(0)]

In [None]:
# Draw spam rows with replacement until there are 80% as many as ham rows.
# `resample` is sklearn.utils.resample; it must be imported at the top of the
# notebook, otherwise this cell fails with a NameError.
spam_upsample = resample(
    spam_data,
    replace=True,
    n_samples=int(0.8 * len(ham_data)),
    random_state=42,
)

In [None]:
# Stack the ham rows and the upsampled spam rows into one frame.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement and, like append, keeps the original row labels.
new_df = pd.concat([ham_data, spam_upsample])

In [None]:
# Preview the first rows of the combined frame.
new_df.head()

In [None]:
# (rows, columns) of the combined frame.
new_df.shape

In [None]:
# Shuffle all rows. The seed matches the one used for resampling so the
# notebook gives the same row order on every run.
new_df = new_df.sample(frac=1, random_state=42)

In [None]:
# Show how many rows carry each "Prediction" value after upsampling.
# `sns.distplot` is deprecated and draws a histogram/KDE, which is a poor fit
# for a discrete label; a count plot draws one bar per label value instead.
sns.countplot(x=new_df["Prediction"])
plt.show()

In [None]:
# Features: every column except the label.
x = new_df.drop(columns="Prediction")
# Target as a 1-D Series. A single-column DataFrame (new_df[["Prediction"]])
# makes scikit-learn estimators emit a DataConversionWarning about a
# column-vector y.
y = new_df["Prediction"]

In [None]:
# Hold out roughly 20% of the data without leaking upsampled duplicates.
# The spam rows were sampled with replacement before this split, so a plain
# random split can put one copy of an e-mail in train and another in test,
# which inflates the scores. Grouping by row label keeps every copy of the
# same original e-mail on one side.
# NOTE(review): relies on the row labels still being the original ones (no
# reset_index / ignore_index upstream); test_size applies to distinct e-mails.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_pos, test_pos = next(splitter.split(x, y, groups=x.index))
x_train, x_test = x.iloc[train_pos], x.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

### KNN with elbow plot

In [None]:
# Candidate neighbour counts: the odd numbers from 1 through 29.
k_values = list(range(1, 30, 2))
# One accuracy score per candidate will be collected here.
accuracy_values = list()

In [None]:
# Fit one KNN classifier per candidate k and record its accuracy on the test split.
# The list is rebuilt here so that re-running this cell cannot append a second
# set of scores and break the one-to-one pairing with k_values.
# NOTE(review): k is scored on the same x_test/y_test used for the final report,
# so the reported metrics are optimistic; consider a separate validation split.
accuracy_values = []
for k in tqdm(k_values):
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    accuracy_values.append(accuracy)

In [None]:
# Elbow plot: accuracy (y axis) against the number of neighbours k (x axis).
px.line(x=k_values, y=accuracy_values)

In [None]:
# Pick the k with the highest recorded accuracy.
# max() returns the first winner on ties, and the (-1, -1) default covers the
# case where no score has been recorded yet.
optimal_k, optimal_accuracy = max(
    zip(k_values, accuracy_values),
    key=lambda pair: pair[1],
    default=(-1, -1),
)

In [None]:
# KNN classifier configured with the best-scoring neighbour count.
knn_model = KNeighborsClassifier(n_neighbors=optimal_k)

In [None]:
# Train the KNN classifier on the training split.
knn_model.fit(x_train, y_train)

In [None]:
# Predict labels for the held-out rows.
y_pred = knn_model.predict(x_test)

In [None]:
# Per-class precision, recall and F1 of the KNN predictions on the test split.
print(metrics.classification_report(y_test, y_pred))

### SVM

In [None]:
# Support-vector classifier with scikit-learn's default settings.
svm_model = SVC()

In [None]:
# Train the SVM on the training split.
svm_model.fit(x_train, y_train)

In [None]:
# Predict labels for the held-out rows (overwrites the earlier y_pred).
y_pred = svm_model.predict(x_test)

In [None]:
# Per-class precision, recall and F1 of the SVM predictions on the test split.
print(metrics.classification_report(y_test, y_pred))

# Functions