## TASK 1 — Housing Dataset: Feature Scaling and Regression Model
## TASK 2 — Email Dataset: Feature Processing and Classification Model


In [None]:
import pandas as pd
df = pd.read_csv("Housing.csv")
df = df.replace({"yes":1, "no":0})
df["furnishingstatus"] = df["furnishingstatus"].map({
    "unfurnished":0,
    "semi-furnished":1,
    "furnished":2
})

In [None]:
#Split data
from sklearn.model_selection import train_test_split
X = df.drop("price", axis=1)
y = df["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
#Train Regression (Before Scaling)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Before Scaling R2:", r2_score(y_test, y_pred))
print("Before Scaling RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
#Scale Numerical Features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
#Retrain After Scaling
model_scaled = LinearRegression()
model_scaled.fit(X_train_scaled, y_train)
y_pred_scaled = model_scaled.predict(X_test_scaled)
print("After Scaling R2:", r2_score(y_test, y_pred_scaled))
print("After Scaling RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_scaled)))

Before Scaling R2: 0.6494754192267804
Before Scaling RMSE: 1331071.4167895103
After Scaling R2: 0.6494754192267792
After Scaling RMSE: 1331071.4167895128


In [41]:
#TASK 2

In [42]:
import pandas as pd

# Reload your email dataset (use your actual filename)
email = pd.read_csv("email_spam.csv")
# ---- Build target CLEANLY ----
y = email["type"].astype(str).str.lower()

y = y.map({
    "spam": 1,
    "not spam": 0
})
# Remove any rows where y is still NaN
clean_df = email.loc[~y.isna()].copy()
y = y.loc[~y.isna()]
# Rebuild X from the SAME cleaned dataframe
X_text = clean_df["text"]
print("NaNs in y:", y.isna().sum())
print("Unique y values:", y.unique())
print("Shape after cleaning:", clean_df.shape)

NaNs in y: 0
Unique y values: [1 0]
Shape after cleaning: (84, 3)


In [43]:
#Convert text to numbers (TF-IDF)
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words="english")
X_vectorized = vectorizer.fit_transform(X_text)
print("X shape:", X_vectorized.shape)
print("y length:", len(y))

X shape: (84, 2641)
y length: 84


In [44]:
#Train–test split (now safe)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized, y, test_size=0.2, random_state=42, stratify=y
)

In [45]:
#Train Logistic Regression 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
print("Accuracy before scaling:",
      accuracy_score(y_test, clf.predict(X_test)))

Accuracy before scaling: 0.7058823529411765
