## Importing the relevant libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.stattools import durbin_watson

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


sns.set_theme()

## Loading the raw data

In [None]:
df_main = pd.read_csv('cars_dataset.csv')
df = df_main.copy()
df.head()

In [None]:
df.shape

## Preprocessing

### Exploring the Descriptive Statistics of the Variables

In [None]:
# Summary statistics for every column, rounded to two decimals.
df.describe(include='all').round(decimals=2)

### Determining the Variables of Interest

In [None]:
# Leave 'Model' and 'Registration' out of the analysis.
df = df.drop(columns=['Model', 'Registration'])

### Dealing with Missing Values

In [None]:
df.isnull().sum()

In [None]:
df[:20]

In [None]:
df.dropna(axis=0, ignore_index=True, inplace=True)

In [None]:
df.isnull().sum()

In [None]:
df.shape

### Exploring the Distribution of Price Variable

In [None]:
# Histogram of the Price column before any outlier handling.
sns.displot(data=df['Price'])
plt.show()

### Dealing with Outliers

In [None]:
q = df['Price'].quantile(0.99)
df = df[df['Price']<q]

In [None]:
sns.displot(df['Price'])
plt.show()

In [None]:
sns.displot(df['Mileage'])
plt.show()

In [None]:
q = df['Mileage'].quantile(0.99)
df = df[df['Mileage']<q]

In [None]:
sns.displot(df['Mileage'])
plt.show()

In [None]:
sns.displot(df['EngineV'])
plt.show()

In [None]:
df = df[df['EngineV']<6.5]

In [None]:
sns.displot(df['EngineV'])
plt.show()

In [None]:
sns.displot(df['Year'])
plt.show()

In [None]:
q = df['Year'].quantile(0.01)
df = df[df['Year']>q]

In [None]:
sns.displot(df['Year'])
plt.show()

In [None]:
df.reset_index(drop=True, inplace=True)

### Categorizing Variables

In [None]:
df.head()

In [None]:
df.nunique()

In [None]:
# Initialize BinaryEncoder
binary_encoder = ce.BinaryEncoder()

# Fit and transform the data
df_encoded = binary_encoder.fit_transform(df)

In [None]:
df_encoded.head()

In [None]:
df_encoded.dtypes

In [None]:
df_encoded.nunique()

## Checking the OLS assumptions

In [None]:
# Predictors are every encoded column except the raw price; the raw price is the target.
X = df_encoded.drop(columns="Price")
y = df_encoded["Price"]

### Linearity

In [None]:
# Columns whose distinct values are a subset of {0, 1} are treated as boolean-like.
X_boolean = [name for name in X.columns if set(X[name].unique()) <= {0, 1}]

# Everything else is kept as a non-binary predictor.
X_nonboolean = X.drop(columns=X_boolean)

# Show the resulting frame.
print(X_nonboolean)

In [None]:
def fn(x, y):
    """Scatter-plot ``y`` against ``x`` with a least-squares straight line on top.

    Args:
        x: Named 1-D series of predictor values; ``x.name`` appears in the title.
        y: Named 1-D series of response values, the same length as ``x``.

    Returns:
        None. The figure is displayed through ``plt.show()``.
    """
    # np.polyfit returns coefficients highest degree first: (slope, intercept).
    b1_slope, b0_intercept = np.polyfit(x, y, 1)
    y_pred = b0_intercept + x * b1_slope

    plt.figure(figsize=(6, 3))
    # f-string formatting also copes with names that are not plain strings.
    plt.title(f"{x.name} and {y.name}")
    plt.scatter(x, y)
    plt.plot(x, y_pred, color='red', label='Regression Line')
    # Without a legend the label passed to plt.plot is never shown.
    plt.legend()
    plt.show()

In [None]:
# One scatter/fit figure per non-binary predictor, against the raw price.
for col in X_nonboolean:
    fn(X[col], y)

In [None]:
# Natural log of the target, carried as a Series named 'Price_log'.
price_log = np.log(y).rename('Price_log')

In [None]:
# Side-by-side view of the raw and log-transformed price. It is stored under its
# own name so that `y` stays the 1-D price Series: overwriting `y` with this
# two-column frame makes the following `np.log(y)` return a DataFrame, which
# cannot be assigned to the single 'log_price' column.
y_with_log = pd.concat([y, price_log], axis=1)

In [None]:
# Take the log straight from the encoded frame's own Price column so the result
# is always a single Series, whatever `y` currently holds.
log_price = np.log(df_encoded["Price"])
df_encoded["log_price"] = log_price
df_encoded.head()

In [None]:
# Repeat the linearity plots against the log-transformed price.
# The non-binary predictor frame is named X_nonboolean (non_boolean_X is never defined).
for col in X_nonboolean.columns:
    fn(df_encoded[col], df_encoded["log_price"])

### No Endogeneity / Exogeneity

In [None]:
# Regress log price on all predictors plus an intercept term.
y = df_encoded["log_price"]
x = sm.add_constant(X)
model = sm.OLS(endog=y, exog=x).fit()

# Keep the residuals, and attach them to X temporarily so they can be
# correlated with the predictors in the next cells.
residuals = model.resid
X["residuals"] = residuals

In [None]:
# Correlation of the residuals with each predictor, as a single-row frame.
# The 'residuals' column is dropped by name rather than slicing up to "Year",
# so every predictor is kept regardless of column order.
# NOTE(review): OLS residuals are orthogonal to the fitted regressors by
# construction, so these correlations are expected to be ~0 -- confirm this is
# the intended exogeneity check.
resid_corr = X.corr().loc[["residuals"]].drop(columns="residuals")
resid_corr

In [None]:
# Single-row heatmap of the residual/predictor correlations on a fixed [-1, 1] scale.
plt.figure(figsize=(10, 3))
sns.heatmap(
    resid_corr,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    annot_kws={"rotation": 35},
    fmt=".1f",
)

plt.title('Correlation')
plt.ylabel('Error Term')
plt.yticks([])  # the single row needs no tick label
plt.xlabel('Predictors')
plt.xticks(rotation=55)
# No plt.clf() afterwards: once the figure has been shown there is nothing left
# to clear, and the call would only create a new empty figure.
plt.show()

In [None]:
# Remove the temporary residuals column so X holds predictors only again.
X.drop(columns="residuals", inplace=True)

### Constant Error Variance / Homoscedasticity / No Heteroscedasticity

In [None]:
# Breusch-Pagan test on the fitted model's residuals.
# Null hypothesis: the error variance is constant (homoscedasticity).
bp_test = het_breuschpagan(residuals, model.model.exog)

# The second element of the result is the p-value of the Lagrange-multiplier statistic.
p_value = bp_test[1]

# Print the results
print(f"Breusch-Pagan Test p-value for the Model: {p_value}")

# Interpret the results at the 5% level. The test speaks only about the error
# variance, not about the significance of the regression model itself.
if p_value < 0.05:
    print("Reject the null hypothesis of homoscedasticity.")
    print("There is evidence of heteroscedasticity.")
    print("The test result is statistically significant at the 5% level.\n")
else:
    print("Fail to reject the null hypothesis of homoscedasticity.")
    print("There is no evidence of heteroscedasticity.")
    print("The test result is not statistically significant at the 5% level.\n")

### No Autocorrelation / Independent Error Terms / No Serial Correlation

In [None]:
plt.scatter(residuals.index, residuals)
plt.axhline(y=0, color='r', linestyle='--')
plt.title("Residuals Plot")
plt.xlabel("Observation Order")
plt.ylabel("Residuals")
plt.show()

In [None]:
dw_statistic = durbin_watson(residuals)
print(f"Durbin-Watson Statistic: {dw_statistic}")

### No Multicollinearity

In [None]:
# Pairwise correlations between all predictors.
corr_matrix = X.corr()

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(10, 5))
sns.heatmap(
    data=corr_matrix,
    cmap='coolwarm',
    annot=True,
    fmt=".1f",
    annot_kws={"rotation": 35},
)
plt.xticks(rotation=55)
plt.show()

In [None]:
# variance_inflation_factor regresses column i on the remaining columns of the
# matrix it is given and does not add an intercept itself. Without a constant
# column the auxiliary regressions run through the origin and the VIFs come out
# inflated, so a constant is prepended and then skipped when reporting.
vif_exog = sm.add_constant(X, has_constant="add").astype(float)

vif_data = pd.DataFrame(
    {
        "Variable": X.columns,
        # Index 0 is the constant; the predictors occupy positions 1..k.
        "VIF": [
            variance_inflation_factor(vif_exog.values, i)
            for i in range(1, vif_exog.shape[1])
        ],
    }
)
print(vif_data)

# Feature Engineering

In [None]:
# Engine Type_1 
# Body_2
# Year

## Create dummy variables

In [None]:
# `data_no_multicollinearity` is not defined by any earlier cell, so build it
# here from the cleaned frame: the original (un-encoded) columns with 'Price'
# replaced by its natural log and 'Year' left out.
# NOTE(review): dropping 'Year' is inferred from the column list selected
# further down, which has no Year entry -- confirm the intended feature set.
data_no_multicollinearity = (
    df.assign(log_price=np.log(df['Price']))
      .drop(columns=['Price', 'Year'])
)

# One-hot encode the categorical columns, dropping the first level of each.
data_with_dummies = pd.get_dummies(data_no_multicollinearity, drop_first=True)

In [None]:
# First rows of the one-hot encoded frame.
data_with_dummies.head()

### Rearrange a bit

In [None]:
# Column labels as a plain array, handy for copying into the ordering list.
data_with_dummies.columns.to_numpy()

In [None]:
# Column order for the modelling frame: target first, then the numeric
# predictors, then the dummy columns grouped by their source variable.
cols = [
    # target
    'log_price',
    # numeric predictors
    'Mileage',
    'EngineV',
    # Brand dummies
    'Brand_BMW',
    'Brand_Mercedes-Benz',
    'Brand_Mitsubishi',
    'Brand_Renault',
    'Brand_Toyota',
    'Brand_Volkswagen',
    # Body dummies
    'Body_hatch',
    'Body_other',
    'Body_sedan',
    'Body_vagon',
    'Body_van',
    # Engine Type dummies
    'Engine Type_Gas',
    'Engine Type_Other',
    'Engine Type_Petrol',
]

In [None]:
# Select the columns in the order given by `cols`.
data_preprocessed = data_with_dummies.loc[:, cols]
data_preprocessed.head(n=5)

## Linear regression model

### Declare the inputs and the targets

In [None]:
# Everything except the log price is a model input; the log price is the target.
inputs = data_preprocessed.drop(columns='log_price')
targets = data_preprocessed['log_price']

### Scale the data

In [None]:
scaler = StandardScaler()
scaler.fit(inputs)

In [None]:
inputs_scaled = scaler.transform(inputs)

### Train Test Split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): this splits the unscaled `inputs` although `inputs_scaled` was
# computed above -- confirm which one the model is meant to be trained on.
x_train, x_test, y_train, y_test = train_test_split(
    inputs,
    targets,
    test_size=0.2,
    random_state=365,
)

### Create the regression

In [None]:
reg = LinearRegression()
reg.fit(x_train,y_train)

In [None]:
y_hat = reg.predict(x_train)

In [None]:
# Predicted vs. actual log price on the training data, with matching axis limits.
plt.scatter(x=y_train, y=y_hat)
plt.xlabel('Targets (y_train)', fontsize=18)
plt.ylabel('Predictions (y_hat)', fontsize=18)
plt.xlim(6, 13)
plt.ylim(6, 13)
plt.show()

In [None]:
# Residual = target minus prediction; plot the distribution of the training residuals.
sns.displot(y_train - y_hat)
plt.title("Residuals PDF", size=18)
# Render explicitly, as the other plotting cells do, instead of leaving the
# Text object returned by plt.title as the cell output.
plt.show()

In [None]:
# Score of the fitted model on the training split (for scikit-learn's
# LinearRegression this is the coefficient of determination, R^2).
reg.score(x_train,y_train)

### Finding the weights and bias

In [None]:
# Fitted intercept (bias) of the regression.
reg.intercept_

In [None]:
# Fitted coefficients (weights), one per input column in training order.
reg.coef_

In [None]:
# Pair each input column name with its fitted coefficient.
reg_summary = pd.DataFrame(
    {
        'Features': inputs.columns.values,
        'Weights': reg.coef_,
    }
)
reg_summary

## Testing

In [None]:
y_hat_test = reg.predict(x_test)

In [None]:
x_test

In [None]:
# Predicted vs. actual log price on the test data; alpha shows where points pile up.
plt.scatter(x=y_test, y=y_hat_test, alpha=0.2)
plt.xlabel('Targets (y_test)', fontsize=18)
plt.ylabel('Predictions (y_hat_test)', fontsize=18)
plt.xlim(6, 13)
plt.ylim(6, 13)
plt.show()

In [None]:
# Undo the log transform so the predictions are back on the price scale.
df_pf = pd.DataFrame({'Prediction': np.exp(y_hat_test)})
df_pf.head(n=5)

In [None]:
# y_test still carries the shuffled row labels from the split, while df_pf has
# a fresh 0..n-1 index. Assigning the Series directly would align on those
# labels and leave mismatched/NaN targets, so assign the values positionally.
df_pf['Target'] = np.exp(y_test).to_numpy()
df_pf

In [None]:
y_test = y_test.reset_index(drop=True)
y_test.head()

In [None]:
df_pf['Target'] = np.exp(y_test)
df_pf

In [None]:
df_pf['Residual'] = df_pf['Target'] - df_pf['Prediction']

In [None]:
df_pf['Difference%'] = np.absolute(df_pf['Residual']/df_pf['Target']*100)
df_pf

In [None]:
df_pf.describe()

In [None]:
pd.set_option('display.float_format', lambda x: '%.2f' % x)
df_pf.sort_values(by=['Difference%'])

In [None]:
# Names of the input columns the model was fitted on, in training order.
reg.feature_names_in_

In [None]:
# Predict the log price for one hand-built observation. The values are wrapped
# in a one-row DataFrame labelled with the model's own feature names, so each
# value is tied to a column and scikit-learn does not warn about missing names.
single_observation = pd.DataFrame(
    [[0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0]],
    columns=reg.feature_names_in_,
)
reg.predict(single_observation)