# Diamonds Price Prediction - Full Analysis
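This notebook cleans the diamonds dataset, runs exploratory data analysis (EDA), draws a 12,500-row sample, and then compares a Linear Regression on all features, a Linear Regression on two PCA components, and Lasso/Ridge regularised regressions for predicting price.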

In [None]:
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt

## Load Dataset

In [None]:
# Location of the raw diamonds CSV; update the path if the file lives elsewhere.
csv_path = 'diamonds.csv'
df = pd.read_csv(csv_path)

# Preview the first rows to confirm the file loaded as expected.
df.head()

## Data Cleaning

In [None]:
# Remove exact duplicate rows, then any row with a missing value.
df = df.drop_duplicates()
df = df.dropna()

# Force price to a numeric dtype; values that cannot be parsed become NaN
# and are dropped in the next step.
numeric_price = pd.to_numeric(df['price'], errors='coerce')
df = df.assign(price=numeric_price)
df = df.dropna(subset=['price'])

df.head()

## Exploratory Data Analysis

In [None]:
# Summary statistics of the cleaned data (pandas' default describe() output).
summary_stats = df.describe()
summary_stats

In [None]:
# Pairwise correlations between the numeric columns only.
corr = df.corr(numeric_only=True)

# Rank every numeric column by its correlation with price, strongest first.
price_corr = corr['price']
price_corr.sort_values(ascending=False)

## Create a 12,500-Row Sample

In [None]:
# Draw a reproducible random subset of 12,500 rows (fixed seed) for modelling.
sampled_rows = df.sample(n=12500, random_state=42)
diamonds_model = sampled_rows.reset_index(drop=True)

# Save the subset to CSV without writing the index column.
diamonds_model.to_csv('diamonds_model_12500.csv', index=False)

diamonds_model.head()

## Linear Regression with All Features

In [None]:
# Baseline model: ordinary least squares on every available feature.

# Separate the predictors from the target column.
X = diamonds_model.drop('price', axis=1)
y = diamonds_model['price']

# Split the predictors by dtype so each group gets the right preprocessing.
categorical = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric = X.select_dtypes(include=[np.number]).columns.tolist()

# One-hot encode the categorical columns (dropping the first level of each to
# avoid perfectly collinear dummies) and standardise the numeric columns.
# sparse_threshold=0 makes the ColumnTransformer always return a dense array;
# this replaces OneHotEncoder(sparse=False), whose `sparse` argument was
# removed in scikit-learn 1.4, and works on older and newer versions alike.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), categorical),
        ('num', StandardScaler(), numeric),
    ],
    sparse_threshold=0,
)

pipe_lr = Pipeline([('pre', preprocessor), ('lr', LinearRegression())])

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipe_lr.fit(X_train, y_train)
y_pred_lr = pipe_lr.predict(X_test)

r2_lr = r2_score(y_test, y_pred_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
# RMSE is taken as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed.
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))

r2_lr, mae_lr, rmse_lr

## PCA with 2 Components

In [None]:
# Linear regression on the first two principal components of the numeric
# features that correlate with price.
#
# The rows are split first, and the feature selection, scaler and PCA are all
# fitted on the training rows only, so nothing about the held-out rows leaks
# into the model that is evaluated on them.
row_positions = np.arange(len(diamonds_model))
train_pos, test_pos = train_test_split(row_positions, test_size=0.2, random_state=42)
train_rows = diamonds_model.iloc[train_pos]

# Keep the numeric features whose absolute correlation with price exceeds 0.3,
# measured on the training rows.
num_cols = diamonds_model.select_dtypes(include=[np.number]).columns.tolist()
num_corr = train_rows[num_cols].corr()['price'].abs().sort_values(ascending=False).drop('price')
strong_nums = num_corr[num_corr > 0.3].index.tolist()
if len(strong_nums) < 2:
    raise ValueError(
        'PCA with 2 components needs at least 2 numeric features with '
        f'|corr(price)| > 0.3, but only {len(strong_nums)} qualified: {strong_nums}'
    )

# Learn the scaling from the training rows, then apply it to every row.
scaler = StandardScaler()
scaler.fit(train_rows[strong_nums])
X_pca_scaled = scaler.transform(diamonds_model[strong_nums])

# Learn the principal axes from the training rows, then project every row.
pca = PCA(n_components=2, random_state=42)
pca.fit(X_pca_scaled[train_pos])
X_pca = pca.transform(X_pca_scaled)

df_pca = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
df_pca['price'] = diamonds_model['price'].values

pipe_pca_lr = Pipeline([('sc', StandardScaler()), ('lr', LinearRegression())])

# Slice the projected data with the positions chosen above.
X_train_p = df_pca.iloc[train_pos][['PC1', 'PC2']]
X_test_p = df_pca.iloc[test_pos][['PC1', 'PC2']]
y_train_p = df_pca.iloc[train_pos]['price']
y_test_p = df_pca.iloc[test_pos]['price']

pipe_pca_lr.fit(X_train_p, y_train_p)
y_pred_pca = pipe_pca_lr.predict(X_test_p)

r2_pca = r2_score(y_test_p, y_pred_pca)
mae_pca = mean_absolute_error(y_test_p, y_pred_pca)
# RMSE is taken as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed.
rmse_pca = np.sqrt(mean_squared_error(y_test_p, y_pred_pca))

r2_pca, mae_pca, rmse_pca

## Lasso and Ridge Models

In [None]:
# Lasso (L1) and Ridge (L2) regressions, both with alpha=1.0, using the same
# preprocessor and train/test split as the plain linear regression.
# NOTE: the pipelines share one `preprocessor` object, so each fit() refits it
# on X_train.
pipe_lasso = Pipeline([
    ('pre', preprocessor),
    ('lasso', Lasso(alpha=1.0, random_state=42, max_iter=5000)),
])
pipe_ridge = Pipeline([
    ('pre', preprocessor),
    ('ridge', Ridge(alpha=1.0, random_state=42)),
])

pipe_lasso.fit(X_train, y_train)
pipe_ridge.fit(X_train, y_train)

y_pred_lasso = pipe_lasso.predict(X_test)
y_pred_ridge = pipe_ridge.predict(X_test)

r2_lasso = r2_score(y_test, y_pred_lasso)
r2_ridge = r2_score(y_test, y_pred_ridge)

# RMSE is taken as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed.
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))

mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))

r2_lasso, mae_lasso, rmse_lasso, r2_ridge, mae_ridge, rmse_ridge