Steps:
1. Import, read, head and dtype
2. Check for NA values
3. Graphs
    - Univariate Analysis
    - Bivariate Analysis
    - Multivariate Analysis
4. Convert, Encode and Normalization
5. Factor Analysis
    1. Check Data Adequacy[KMO, Bartlett]
    2. *Feature Extraction[Scree Plot, Kaiser criterion, PCA, Maximum Likelihood]
    3. Factor Rotation[Promax, Varimax]
6. Model Training and Prediction[KAN]
7. Model Performance Measure

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.decomposition import PCA
import scipy as sp
import random
from factor_analyzer import FactorAnalyzer,calculate_bartlett_sphericity,calculate_kmo
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
import os
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Read the car data CSV (path is relative to the working directory) into a DataFrame.
data = pd.read_csv('./data/car_data.csv')

In [None]:
# Preview the first rows to sanity-check the parsed columns.
data.head()

In [None]:
# Print each column's dtype and non-null count.
data.info()

In [None]:
# Number of missing entries in each column.
data.isna().sum()

In [None]:
# Fill gaps with a zero that matches each column's dtype. A blanket
# fillna('0') would inject a string into any numeric column that has a gap,
# turning it into an object column that the encoding step would then
# label-encode instead of treating as a number.
fill_values = {
    column: 0 if pd.api.types.is_numeric_dtype(dtype) else '0'
    for column, dtype in data.dtypes.items()
}
data = data.fillna(value=fill_values)

# Graph

## Univariate Analysis

In [None]:
# Plot from a 1% random sample with a fixed seed so figures are reproducible.
sampled_data = data.sample(frac=0.01, random_state=42)

# Object-typed columns get a count plot each; Car_id, Customer Name and Date
# are left out of that loop (Date is charted separately below).
object_columns = sampled_data.select_dtypes(include=['object']).columns
object_columns = object_columns.drop(['Car_id','Customer Name','Date'])

# exist_ok=True creates the folder if needed without a separate
# check-then-create step.
os.makedirs('./graph/univariate', exist_ok=True)

# For Date Count
sales_counts = sampled_data['Date'].value_counts().sort_index().reset_index()
sales_counts.columns = ['Date', 'Sales']
plt.figure(figsize=(15, 5))
sns.barplot(x='Date', y='Sales', data=sales_counts, hue='Date')
plt.title('Sales Count by Date')
plt.xticks([])  # hide the date tick labels
plt.savefig('./graph/univariate/sales_count_by_date.png')
plt.show()

# One count plot per remaining object column, each saved under its column name.
for column in object_columns:
    plt.figure(figsize=(15, 5))
    sns.countplot(x=column, data=sampled_data, hue=column)
    plt.title(f'Count Plot for {column}')
    plt.xlabel(column)
    plt.ylabel('Count')
    plt.xticks(rotation=90)
    plt.savefig(f'./graph/univariate/{column}_countplot.png')

plt.show()

In [None]:
# Kernel density estimate of Annual Income on the sampled rows.
plt.figure(figsize=(15, 5))
sns.kdeplot(data=sampled_data, x='Annual Income')
plt.xticks(rotation=90)
plt.xlabel('Annual Income')
plt.ylabel('KDE')
plt.title('KDE Plot for Annual Income')
plt.savefig('./graph/univariate/Annual_Income_KDEplot.png')
plt.show()

## Bivariate Analysis

In [None]:
# exist_ok=True creates the folder if needed without a separate
# check-then-create step.
os.makedirs('./graph/bivariate', exist_ok=True)

# Distribution of price within each category of every selected object column.
for column in object_columns:
    plt.figure(figsize=(15, 5))
    sns.violinplot(x=column, y='Price ($)', data=sampled_data, hue=column)
    plt.title(f'Violin Plot for {column} vs Price')
    plt.xlabel(column)
    plt.ylabel('Price')
    plt.xticks(rotation=90)
    plt.savefig(f'./graph/bivariate/{column}_vs_price_violinplot.png')

plt.show()


In [None]:
# Box plot of Annual Income on the sampled rows.
plt.figure(figsize=(15, 5))
sns.boxplot(x=sampled_data['Annual Income'], data=sampled_data)
# The title previously said "KDE Plot" and the y-axis was labelled "KDE";
# this figure is a box plot, so the title is corrected and the wrong y label
# is no longer set.
plt.title('Box Plot for Annual Income')
plt.xlabel('Annual Income')
plt.xticks(rotation=90)
# NOTE(review): the output file name still says "KDE"; it is kept unchanged so
# anything that already references this path keeps working.
plt.savefig('./graph/bivariate/Annual_Income_KDE_plot.png')

plt.show()

## Multivariate Analysis

In [None]:
# exist_ok=True creates the folder if needed without a separate
# check-then-create step.
os.makedirs('./graph/multivariate', exist_ok=True)

# Pairwise relationships across the sampled data, saved as a single figure.
sns.pairplot(sampled_data)
plt.savefig('./graph/multivariate/pairplot.png')
plt.show()

# Convert, Encode, Normalization

## Convert `Date` to independent variable

In [None]:
# Parse the date strings, expand them into Year / Month / Day columns,
# then discard the original Date column.
data['Date'] = pd.to_datetime(data['Date'])
data = data.assign(
    Year=data['Date'].dt.year,
    Month=data['Date'].dt.month,
    Day=data['Date'].dt.day,
).drop(['Date'], axis=1)

## Encoding

In [None]:
# Every remaining object-typed column is replaced by integer codes.
# NOTE(review): this also encodes any identifier-like text columns still in
# `data` — confirm they are meant to be used as features.
object_columns = data.select_dtypes(include=['object']).columns

#Label Encoding
le = LabelEncoder()
for column in object_columns:
    # The encoder is re-fitted per column, so `le` ends up fitted on the last one.
    data[column] = le.fit_transform(data[column])

## Normalizing

In [None]:
# All int64 / float64 / int32 columns are standardized one at a time.
# NOTE(review): if 'Price ($)' is numeric it is scaled here too, so the later
# error metrics would be in standardized units — confirm that is intended.
num_columns = data.select_dtypes(include=['int64', 'float64', 'int32']).columns

#Standard Scaling
scaler = StandardScaler()
for column in num_columns:
    # The scaler expects a 2-D array, hence the single-column reshape.
    column_values = data[column].to_numpy().reshape(-1, 1)
    data[column] = scaler.fit_transform(column_values)

# Features Analysis

In [None]:
# Descriptive statistics for the processed columns.
print("Summary of Statistics:")

summary_stats = data.describe()
summary_stats

In [None]:
# Per-column skewness and kurtosis.
skewness = data.skew()
kurtosis = data.kurt()

# Print each heading on its own line, followed by the values.
print("\nSkewness:", skewness, sep="\n")
print("\nKurtosis:", kurtosis, sep="\n")

In [None]:
# Pairwise correlation between all columns.
correlation_matrix = data.corr()

# Annotated heatmap of the correlation matrix, values shown to 3 decimals.
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap="coolwarm",
    linewidths=0.5,
    fmt=".3f",
    ax=ax,
)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Multicollinearity check.
# NOTE(review): despite its name, `y` holds the predictor columns here
# (everything except the price); `X` is the same frame plus a constant column.
y = data.drop(columns=["Price ($)"])
X = sm.add_constant(y)

# Variance inflation factor for every column of X, the constant included.
design_matrix = X.values
vif = pd.DataFrame(
    {
        "variable": X.columns,
        "VIF": [
            variance_inflation_factor(design_matrix, column_index)
            for column_index in range(X.shape[1])
        ],
    }
)

print(vif)

In [None]:
# Select predictors whose VIF is at least 5 for removal. The intercept column
# is deliberately never selected: later cells call
# regression_data.drop(['const'], axis=1), which would raise a KeyError if
# 'const' had already been removed here.
is_high_vif = (vif["VIF"] >= 5) & (vif["variable"] != "const")
high_vif_variables = vif[is_high_vif]["variable"]
regression_data = X.drop(high_vif_variables, axis=1)

# Show which columns remain.
regression_data.info()

# Regression

## KAN

In [None]:
from effKAN import KAN
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Features are the VIF-filtered columns without the intercept; target is the price.
X = regression_data.drop(['const'], axis=1)
y = data['Price ($)']

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

# Targets are reshaped to (n, 1) so they match the model's single output column.
train_X = torch.tensor(train_X.values, dtype=torch.float32).to(device)
train_y = torch.tensor(train_y.values, dtype=torch.float32).view(-1, 1).to(device)
test_X = torch.tensor(test_X.values, dtype=torch.float32).to(device)
test_y = torch.tensor(test_y.values, dtype=torch.float32).view(-1, 1).to(device)

trainset = TensorDataset(train_X, train_y)
trainloader = DataLoader(trainset, batch_size=512, shuffle=True)

# The input width comes from the data rather than a hard-coded 12, so the
# model still fits if the VIF filter keeps a different number of columns.
model = KAN([train_X.shape[1], 64, 32, 1])
model.to(torch.float32).to(device)

optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4, foreach=False)
# NOTE(review): gamma=0.8 per epoch shrinks the learning rate to roughly
# 2e-13 by epoch 100 — confirm this aggressive decay is intended.
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.8)
criterion = nn.MSELoss()

epochs = 100
train_losses = []  # mean batch loss per epoch

for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    with tqdm(trainloader, desc=f"Epoch {epoch+1}/{epochs}") as pbar:
        # Batch variables get their own names so the X / y defined above are
        # not overwritten by the last mini-batch.
        for batch_X, batch_y in pbar:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            optimizer.zero_grad()
            output = model(batch_X)
            loss = criterion(output, batch_y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            pbar.set_postfix(loss=loss.item(), lr=optimizer.param_groups[0]['lr'])
    # Record the epoch average; previously only the final batch's loss was
    # stored and the accumulated total was never used.
    train_losses.append(train_loss / len(trainloader))
    scheduler.step()

In [None]:
# Loss recorded for each epoch of the training loop.
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(train_losses)
ax.set_title("Training Loss")

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Put the model in evaluation mode and predict on the held-out split
# without tracking gradients.
model.eval()
with torch.no_grad():
    y_pred = model(test_X).cpu().numpy()

# Move the targets to the CPU once and reuse them for both metrics.
targets = test_y.cpu()
mse = mean_squared_error(targets, y_pred)
r2 = r2_score(targets, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")

In [None]:
# Resnet model
import torchvision.models as models

# Rebuild features and target from the shared frames, then split 80/20 with
# a fixed seed.
X = regression_data.drop(columns=['const'])
y = data['Price ($)']

train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Float32 tensors on the selected device; targets become a single column.
train_X = torch.tensor(train_X.values, dtype=torch.float32, device=device)
test_X = torch.tensor(test_X.values, dtype=torch.float32, device=device)
train_y = torch.tensor(train_y.values, dtype=torch.float32, device=device).reshape(-1, 1)
test_y = torch.tensor(test_y.values, dtype=torch.float32, device=device).reshape(-1, 1)

trainset = TensorDataset(train_X, train_y)
trainloader = DataLoader(trainset, batch_size=512, shuffle=True)

class ResNet(nn.Module):
    """ResNet-101 backbone adapted to output one value per input row.

    Each input row is reshaped to a ``num_classes``-channel 1x1 "image"
    before being passed through the network. Despite its name,
    ``num_classes`` is used as the number of input channels (one per
    feature); the output layer always has a single unit.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.num_classes = num_classes

        backbone = models.resnet101()
        # The first convolution takes one channel per feature.
        backbone.conv1 = nn.Conv2d(
            self.num_classes, 64, kernel_size=7, stride=2, padding=3, bias=False
        )
        # The final layer maps the 2048 features to a single output.
        backbone.fc = nn.Linear(2048, 1)
        self.resnet = backbone

        # Not applied in forward().
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Reshape ``x`` to (-1, num_classes, 1, 1) and run the backbone."""
        as_image = x.view(-1, self.num_classes, 1, 1)
        return self.resnet(as_image)

# The number of input channels comes from the data rather than a hard-coded
# 12, so the model still fits if the feature count changes.
model = ResNet(train_X.shape[1])
model.to(device)
model.float()

optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4, foreach=False)
# NOTE(review): gamma=0.8 per epoch shrinks the learning rate to roughly
# 2e-13 by epoch 100 — confirm this aggressive decay is intended.
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.8)
criterion = nn.MSELoss()

epochs = 100
train_losses = []  # mean batch loss per epoch

for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    with tqdm(trainloader, desc=f"Epoch {epoch+1}/{epochs}") as pbar:
        # Batch variables get their own names so the X / y feature and target
        # frames are not overwritten by the last mini-batch.
        for batch_X, batch_y in pbar:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            optimizer.zero_grad()
            output = model(batch_X)
            loss = criterion(output, batch_y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            pbar.set_postfix(loss=loss.item(), lr=optimizer.param_groups[0]['lr'])
    # Record the epoch average; previously only the final batch's loss was
    # stored and the accumulated total was never used.
    train_losses.append(train_loss / len(trainloader))
    scheduler.step()

In [None]:
# Loss recorded for each epoch of the training loop.
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(train_losses)
ax.set_title("Training Loss")

In [None]:
#torch.save(model.state_dict(), 'model.pth')

from sklearn.metrics import mean_squared_error, r2_score

# Put the model in evaluation mode and predict on the held-out split
# without tracking gradients.
model.eval()
with torch.no_grad():
    y_pred = model(test_X).cpu().numpy()

# Move the targets to the CPU once and reuse them for both metrics.
targets = test_y.cpu()
mse = mean_squared_error(targets, y_pred)
r2 = r2_score(targets, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")