# Extract and Clean Data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import csr_matrix

In [2]:
df = pd.read_csv("../diamonds.csv")   
df.drop('Unnamed: 0', axis=1, inplace=True)
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [3]:
df.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [4]:
df.shape

(53940, 10)

In [5]:
df.dtypes

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
price        int64
x          float64
y          float64
z          float64
dtype: object

In [6]:
categorical_df = df.select_dtypes(include=['object'])
categorical_columns = categorical_df.columns
categorical_df

Unnamed: 0,cut,color,clarity
0,Ideal,E,SI2
1,Premium,E,SI1
2,Good,E,VS1
3,Premium,I,VS2
4,Good,J,SI2
...,...,...,...
53935,Ideal,D,SI1
53936,Good,D,SI1
53937,Very Good,D,SI1
53938,Premium,H,SI2


In [7]:
numerical_df = df.select_dtypes(include=['float64', 'int64'])
numerical_columns = numerical_df.columns
numerical_df

Unnamed: 0,carat,depth,table,price,x,y,z
0,0.23,61.5,55.0,326,3.95,3.98,2.43
1,0.21,59.8,61.0,326,3.89,3.84,2.31
2,0.23,56.9,65.0,327,4.05,4.07,2.31
3,0.29,62.4,58.0,334,4.20,4.23,2.63
4,0.31,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...
53935,0.72,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,61.0,58.0,2757,6.15,6.12,3.74


In [8]:
for column in categorical_columns:
    categories = df[column].unique()
    print(f"Categories for column '{column}': {categories.tolist()}")

Categories for column 'cut': ['Ideal', 'Premium', 'Good', 'Very Good', 'Fair']
Categories for column 'color': ['E', 'I', 'J', 'H', 'F', 'G', 'D']
Categories for column 'clarity': ['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF']


In [9]:
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(df[categorical_columns])
categorical_df = encoder.transform(df[categorical_columns])
# encoder.categories_
# [array(['Fair', 'Good', 'Ideal', 'Premium', 'Very Good'], dtype=object),
#  array(['D', 'E', 'F', 'G', 'H', 'I', 'J'], dtype=object),
#  array(['I1', 'IF', 'SI1', 'SI2', 'VS1', 'VS2', 'VVS1', 'VVS2'],
#        dtype=object)]
categories = [item for sublist in encoder.categories_ for item in sublist]
categories
#categorical_df = pd.DataFrame(categorical_df, columns=categories)
categorical_df = pd.DataFrame.sparse.from_spmatrix(categorical_df, columns=categories)
categorical_df

Unnamed: 0,Fair,Good,Ideal,Premium,Very Good,D,E,F,G,H,I,J,I1,IF,SI1,SI2,VS1,VS2,VVS1,VVS2
0,0,0,1.0,0,0,0,1.0,0,0,0,0,0,0,0,0,1.0,0,0,0,0
1,0,0,0,1.0,0,0,1.0,0,0,0,0,0,0,0,1.0,0,0,0,0,0
2,0,1.0,0,0,0,0,1.0,0,0,0,0,0,0,0,0,0,1.0,0,0,0
3,0,0,0,1.0,0,0,0,0,0,0,1.0,0,0,0,0,0,0,1.0,0,0
4,0,1.0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,1.0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53935,0,0,1.0,0,0,1.0,0,0,0,0,0,0,0,0,1.0,0,0,0,0,0
53936,0,1.0,0,0,0,1.0,0,0,0,0,0,0,0,0,1.0,0,0,0,0,0
53937,0,0,0,0,1.0,1.0,0,0,0,0,0,0,0,0,1.0,0,0,0,0,0
53938,0,0,0,1.0,0,0,0,0,0,1.0,0,0,0,0,0,1.0,0,0,0,0


In [10]:
numerical_df

Unnamed: 0,carat,depth,table,price,x,y,z
0,0.23,61.5,55.0,326,3.95,3.98,2.43
1,0.21,59.8,61.0,326,3.89,3.84,2.31
2,0.23,56.9,65.0,327,4.05,4.07,2.31
3,0.29,62.4,58.0,334,4.20,4.23,2.63
4,0.31,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...
53935,0.72,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,61.0,58.0,2757,6.15,6.12,3.74


In [11]:
combined = pd.concat([numerical_df, categorical_df], axis=1)
combined

Unnamed: 0,carat,depth,table,price,x,y,z,Fair,Good,Ideal,...,I,J,I1,IF,SI1,SI2,VS1,VS2,VVS1,VVS2
0,0.23,61.5,55.0,326,3.95,3.98,2.43,0,0,1.0,...,0,0,0,0,0,1.0,0,0,0,0
1,0.21,59.8,61.0,326,3.89,3.84,2.31,0,0,0,...,0,0,0,0,1.0,0,0,0,0,0
2,0.23,56.9,65.0,327,4.05,4.07,2.31,0,1.0,0,...,0,0,0,0,0,0,1.0,0,0,0
3,0.29,62.4,58.0,334,4.20,4.23,2.63,0,0,0,...,1.0,0,0,0,0,0,0,1.0,0,0
4,0.31,63.3,58.0,335,4.34,4.35,2.75,0,1.0,0,...,0,1.0,0,0,0,1.0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,60.8,57.0,2757,5.75,5.76,3.50,0,0,1.0,...,0,0,0,0,1.0,0,0,0,0,0
53936,0.72,63.1,55.0,2757,5.69,5.75,3.61,0,1.0,0,...,0,0,0,0,1.0,0,0,0,0,0
53937,0.70,62.8,60.0,2757,5.66,5.68,3.56,0,0,0,...,0,0,0,0,1.0,0,0,0,0,0
53938,0.86,61.0,58.0,2757,6.15,6.12,3.74,0,0,0,...,0,0,0,0,0,1.0,0,0,0,0


In [12]:
X = combined.drop('price', axis=1)
y = combined['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score

model = LinearRegression()
model.fit(X_train_scaled, y_train)
accuracy = model.score(X_test_scaled, y_test)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9212308466087659


# Multi Layer Perceptron using PyTorch

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [15]:
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)
X_train_tensor.shape

torch.Size([43152, 26])

In [30]:
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(26, 128)  # Input layer
        self.fc2 = nn.Linear(128, 128)  # Hidden layer
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 1)   # Output layer

    def forward(self, x):
        x = F.relu(self.fc1(x))  # Activation function for input layer
        x = F.relu(self.fc2(x))  # Activation function for hidden layer
        x = F.relu(self.fc3(x))
        x = self.fc4(x)  # No activation function for output layer
        return x    

In [42]:
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

dataset = TensorDataset(X_train_tensor, y_train_tensor)
dataloader = DataLoader(dataset, batch_size=512, shuffle=True)

# Create an instance of your model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MLP().to(device)

# Define a loss function and an optimizer
criterion = nn.MSELoss()  # Mean Squared Error Loss for regression
optimizer = optim.SGD(model.parameters(), lr=1e-7)  # Stochastic Gradient Descent

# Number of epochs (iterations over the entire dataset)
epochs = 100

for epoch in range(epochs):
    for inputs, price in dataloader:
        inputs, price = inputs.to(device), price.to(device)
        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)

        # Compute loss
        loss = criterion(outputs, price)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
    if(epoch % 10 == 0 and epoch != 0):
        print(f'Epoch {epoch}/{epochs}, Loss: {loss.item()}')
    
    # Check if loss is NaN
    if torch.isnan(loss):
        print("Loss is NaN. Adjusting learning rate or batch size may help.")
        break
    
# Move test data to the device
X_test_tensor, y_test_tensor = X_test_tensor.to(device), y_test_tensor.to(device)

# Set the model to evaluation mode
model.eval()

# Make predictions on the test data
with torch.no_grad():
    predictions = model(X_test_tensor)

# Compute the loss on the test data
test_loss = criterion(predictions, y_test_tensor)

print(f'Test Loss: {test_loss.item()}')

torch.save(model.state_dict(), 'baseline.pth')

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 10/100, Loss: 29744520.0
Epoch 20/100, Loss: 14997852.0
Epoch 30/100, Loss: 15940638.0
Epoch 40/100, Loss: 21568464.0
Epoch 50/100, Loss: 16749720.0
Epoch 60/100, Loss: 17119342.0
Epoch 70/100, Loss: 14149149.0
Epoch 80/100, Loss: 16685638.0
Epoch 90/100, Loss: 17035852.0
Test Loss: 15855494.0
