# Scaling & Normalization

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Spam Email

### Load Dataset

In [21]:
df_spam = pd.read_csv("spam.csv", sep="\t", names=["label", "text"])
df_spam["label"] = df_spam["label"].map({"ham": 0, "spam": 1})
df_spam.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


### Converting Text to Features

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words="english")
X = vectorizer.fit_transform(df_spam["text"])
y = df_spam["label"]

### Train-Test Split

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

### Training KNN WITHOUT Scaling

In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
pred_no_scale = knn.predict(X_test)
acc_no_scale = accuracy_score(y_test, pred_no_scale)
acc_no_scale

0.9167264895908112

### Scaling

In [25]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Retraining KNN WITH Scaling

In [26]:
knn_scaled = KNeighborsClassifier(n_neighbors=5)
knn_scaled.fit(X_train_scaled, y_train)
pred_scaled = knn_scaled.predict(X_test_scaled)
acc_scaled = accuracy_score(y_test, pred_scaled)
acc_scaled

0.8851399856424982

### Compare both results

In [27]:
print("Accuracy without scaling:", acc_no_scale)
print("Accuracy with scaling:", acc_scaled)

Accuracy without scaling: 0.9167264895908112
Accuracy with scaling: 0.8851399856424982
