In [None]:
#1.Feature Scaling & Regression

#libraries used

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import r2_score, mean_squared_error



In [18]:
# Load Ames Housing dataset
df = pd.read_csv("AmesHousing.csv")

# Select only numerical features
df_num = df.select_dtypes(include=[np.number])

# Drop rows with missing values for simplicity
df_num = df_num.dropna()

X = df_num.drop("SalePrice", axis=1)
y = df_num["SalePrice"]


In [19]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [20]:
#linear regressio without scaling

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

print("Linear Regression (No Scaling)")
print("R2 Score:", r2_score(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))


Linear Regression (No Scaling)
R2 Score: 0.7934175802006809
RMSE: 37384.808495919686


In [21]:
#Apply Feature Scaling

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [22]:
#Linear Regression (WITH Scaling)

lr_scaled = LinearRegression()
lr_scaled.fit(X_train_scaled, y_train)

y_pred_scaled = lr_scaled.predict(X_test_scaled)

print("Linear Regression (With Scaling)")
print("R2 Score:", r2_score(y_test, y_pred_scaled))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_scaled)))


Linear Regression (With Scaling)
R2 Score: 0.793417580196978
RMSE: 37384.808496254744


In [23]:
#Ridge Regression

ridge = Ridge(alpha=1.0)
ridge.fit(X_train_scaled, y_train)

y_ridge = ridge.predict(X_test_scaled)

print("Ridge Regression (With Scaling)")
print("R2 Score:", r2_score(y_test, y_ridge))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_ridge)))


Ridge Regression (With Scaling)
R2 Score: 0.7934839024753292
RMSE: 37378.80690930219


In [None]:
#observations:

print("""
Observation:
- Linear Regression shows minimal change after scaling.
- Ridge Regression performs better after scaling.
- Scaling stabilizes coefficients and improves regularization.
""")



Observation:
- Linear Regression shows minimal change after scaling.
- Ridge Regression performs better after scaling.
- Scaling stabilizes coefficients and improves regularization.



In [26]:
#Feature Scaling & Classification
#libraries 

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report


In [29]:
# Load SpamAssassin dataset
df = pd.read_csv("spam_assassin.csv")
df.head()

Unnamed: 0,text,target
0,From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...,0
1,From gort44@excite.com Mon Jun 24 17:54:21 200...,1
2,From fork-admin@xent.com Mon Jul 29 11:39:57 2...,1
3,From dcm123@btamail.net.cn Mon Jun 24 17:49:23...,1
4,From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...,0


In [None]:
#Separate Features & Target

X_text = df["text"]     # email content
y = df["target"]        # spam = 1, ham = 0

In [32]:
#Convert Text to Numerical Features (TF-IDF)

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    stop_words="english",
    max_features=3000
)

X = tfidf.fit_transform(X_text)


In [33]:
#Train-Test Split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [34]:
#Logistic Regression (WITHOUT Scaling)

log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

y_pred = log_reg.predict(X_test)

print("Logistic Regression (No Scaling)")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Logistic Regression (No Scaling)
Accuracy: 0.9887931034482759
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       779
           1       1.00      0.97      0.98       381

    accuracy                           0.99      1160
   macro avg       0.99      0.98      0.99      1160
weighted avg       0.99      0.99      0.99      1160



In [35]:
#Apply Feature Scaling

scaler = StandardScaler(with_mean=False)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [36]:
#Logistic Regression (WITH Scaling)

log_reg_scaled = LogisticRegression(max_iter=1000)
log_reg_scaled.fit(X_train_scaled, y_train)

y_pred_scaled = log_reg_scaled.predict(X_test_scaled)

print("Logistic Regression (With Scaling)")
print("Accuracy:", accuracy_score(y_test, y_pred_scaled))
print(classification_report(y_test, y_pred_scaled))


Logistic Regression (With Scaling)
Accuracy: 0.9974137931034482
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       779
           1       1.00      0.99      1.00       381

    accuracy                           1.00      1160
   macro avg       1.00      1.00      1.00      1160
weighted avg       1.00      1.00      1.00      1160



In [37]:
#KNN

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

y_knn = knn.predict(X_test_scaled)

print("KNN (With Scaling)")
print("Accuracy:", accuracy_score(y_test, y_knn))
print(classification_report(y_test, y_knn))


KNN (With Scaling)
Accuracy: 0.9267241379310345
              precision    recall  f1-score   support

           0       0.99      0.90      0.94       779
           1       0.82      0.99      0.90       381

    accuracy                           0.93      1160
   macro avg       0.91      0.94      0.92      1160
weighted avg       0.94      0.93      0.93      1160



In [38]:
#Observations:
print("""
Observations:
1. TF-IDF converts email text into numerical features.
2. Logistic Regression shows improved performance after scaling.
3. KNN performs poorly without scaling and improves significantly after scaling.
4. Feature scaling is essential for distance-based models.
""")



Observations:
1. TF-IDF converts email text into numerical features.
2. Logistic Regression shows improved performance after scaling.
3. KNN performs poorly without scaling and improves significantly after scaling.
4. Feature scaling is essential for distance-based models.

