# 🏠📧 Housing Price Prediction & Spam Email Classification

**Author:** Rawal Vipul  
**ML Tasks:** Regression & Binary Classification

Customized notebook for AI/ML internship submission.

---


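# Dataset: Housing prices
### Predict prices using features like square footage, bedrooms, location
*In this notebook the housing data is the Boston Housing table (`house.csv`): the target is the `MEDV` column and the predictors are taken from the other columns (`CRIM`, `RM`, `LSTAT`, …).*

# Dataset: Email dataset
### Classify emails as spam/not spam using logistic regression
*In this notebook the data is the SMS Spam collection (`mail.csv`): column `v1` holds the ham/spam label and column `v2` holds the message text.*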

In [1]:
# Analysis customized and executed by Rawal Vipul
import os
import glob
import pandas as pd
import numpy as np
from IPython.display import display

In [6]:
# Simple Linear Regression (minimal code)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

# Improved R2 (Polynomial Regression)
from sklearn.preprocessing import PolynomialFeatures


In [9]:
# --- Housing data: read the local copy (house.csv) ---
# header=None makes pandas treat the first line as data, so the frame
# starts out with positional (integer) column labels.

housing_path = "house.csv"
housing_df = pd.read_csv(filepath_or_buffer=housing_path, header=None)

In [3]:
# Label the Boston Housing columns. Two layouts are recognised: the plain
# 14-column table, and a 15-column variant with one extra trailing column.
# Any other width is left untouched (positional labels stay in place).
boston_columns = [
    "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
    "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT", "MEDV",
]
n_housing_cols = housing_df.shape[1]

if n_housing_cols == len(boston_columns) + 1:
    housing_df.columns = boston_columns + ["MEDV01"]
elif n_housing_cols == len(boston_columns):
    housing_df.columns = list(boston_columns)

# Sanity check: dimensions, final column labels, and the first few rows.
print("Housing dataset loaded:", housing_df.shape)
print("Columns:", list(housing_df.columns))
display(housing_df.head())

Housing dataset loaded: (506, 15)
Columns: ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV', 'MEDV01']


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,MEDV01
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0,1
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6,1
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7,1
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4,1
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2,1


In [5]:
# Baseline model: ordinary least-squares regression on the housing features.
# Requires housing_df from the loading / column-naming cells above.

# Use the named MEDV column as the target; fall back to the last column
# when the frame still carries positional labels.
target_col = "MEDV" if "MEDV" in housing_df.columns else housing_df.columns[-1]

# MEDV01 looks like a 0/1 flag derived from MEDV
# (NOTE(review): confirm against the data source). Keeping it among the
# predictors would leak the target into the features, so it is excluded
# whenever it is present and is not itself the target.
leaky_cols = [c for c in ["MEDV01"] if c in housing_df.columns and c != target_col]

X = housing_df.drop(columns=[target_col, *leaky_cols])
y = housing_df[target_col]

# Fixed seed so the 80/20 split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LinearRegression()
model.fit(X_train, y_train)
preds = model.predict(X_test)

print("Target column:", target_col)
print("MAE:", mean_absolute_error(y_test, preds))
print("R2:", r2_score(y_test, preds))

Target column: MEDV
MAE: 3.189091965887881
R2: 0.6687594935356285


In [7]:
# Degree-2 polynomial regression on the housing features.
# Requires housing_df from the loading / column-naming cells above.

# Use the named MEDV column as the target; fall back to the last column
# when the frame still carries positional labels.
target_col = "MEDV" if "MEDV" in housing_df.columns else housing_df.columns[-1]

# MEDV01 looks like a 0/1 flag derived from MEDV
# (NOTE(review): confirm against the data source). Keeping it among the
# predictors would leak the target into the features (and into every
# interaction term built from it), so it is excluded when present.
leaky_cols = [c for c in ["MEDV01"] if c in housing_df.columns and c != target_col]

X = housing_df.drop(columns=[target_col, *leaky_cols])
y = housing_df[target_col]

# Fixed seed so the 80/20 split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the feature expansion on the training split only, then reuse the
# fitted transformer on the test split.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

model = LinearRegression()
model.fit(X_train_poly, y_train)
preds = model.predict(X_test_poly)

print("Polynomial (degree=2) R2:", r2_score(y_test, preds))
print("Polynomial (degree=2) MAE:", mean_absolute_error(y_test, preds))

Polynomial (degree=2) R2: 0.8055829448025917
Polynomial (degree=2) MAE: 2.574835626681786


In [8]:
# --- SMS spam data: read the local copy (mail.csv) ---
# The file is decoded as latin-1, passed explicitly to the reader.

spam_path = "mail.csv"
spam_df = pd.read_csv(filepath_or_buffer=spam_path, encoding="latin-1")

# Quick look at what was read: dimensions, column labels, first rows.
print("Spam dataset loaded:", spam_df.shape)
print("Columns:", list(spam_df.columns))
display(spam_df.head())

Spam dataset loaded: (5572, 5)
Columns: ['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [10]:
# Logistic Regression for spam classification
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [11]:
# Spam classifier: TF-IDF text features fed into logistic regression.
# Requires spam_df from the loading cell above.

# Detect label and text columns by name (case-insensitive).
# Kaggle SMS spam dataset typically uses columns: v1 (label), v2 (text)
label_col = None
text_col = None
for c in spam_df.columns:
    # str(...) keeps the lookup safe when a column label is not a string.
    if str(c).lower() in {"v1", "label", "class", "spam"}:
        label_col = c
    if str(c).lower() in {"v2", "text", "message"}:
        text_col = c

if label_col is None or text_col is None:
    # fallback to first two columns
    label_col = spam_df.columns[0]
    text_col = spam_df.columns[1]

# Drop rows with a missing label or message before casting: astype(str)
# would otherwise turn a missing value into the literal string "nan",
# creating a bogus class / token (and a one-member class makes the
# stratified split below fail).
spam_rows = spam_df.dropna(subset=[label_col, text_col])

X = spam_rows[text_col].astype(str)
y = spam_rows[label_col].astype(str)

# Stratify so both splits keep the same ham/spam proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the vocabulary on the training messages only, then reuse it for
# the held-out messages.
vectorizer = TfidfVectorizer(stop_words="english")
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_vec, y_train)
preds = clf.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds))

Accuracy: 0.967713004484305
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       966
        spam       1.00      0.76      0.86       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115

