In [None]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from scipy import stats
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler

In [None]:
# Initial load with the default integer index, so car_ID stays an ordinary column.
df = pd.read_csv('data')
df

In [None]:
# Count distinct car_ID values; the column is used as the index on the next load.
df['car_ID'].nunique()

In [None]:
# Reload the same file, this time using car_ID as the row index.
df = pd.read_csv('data', index_col='car_ID')
df

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Summary statistics, transposed so each column of df is shown as a row.
df.describe().transpose()

In [None]:
# Missing-value count per column, largest first.
df.isna().sum().sort_values(ascending=False)

In [None]:
# Number of distinct values per column, largest first.
df.nunique().sort_values(ascending=False)

In [None]:
# Histogram of each column that df.hist can plot; the trailing ';' hides the axes-array repr.
df.hist(grid=False, bins=8, figsize=(16, 10));

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
fig, ax = plt.subplots(figsize=(16, 10))
corr_matrix = df.corr(numeric_only=True)
sns.heatmap(corr_matrix, annot=True, ax=ax)

In [None]:
# Work on an independent copy so the loaded frame `df` stays untouched.
df_copy = df.copy(deep=True)

In [None]:
# Partition the feature columns by dtype; the target 'price' goes in neither list.
num_cols = [
    name
    for name, dtype in df_copy.dtypes.items()
    if name != 'price' and pd.api.types.is_numeric_dtype(dtype)
]
cat_cols = [name for name in df_copy.columns if name != 'price' and name not in num_cols]

In [None]:
# Sanity check: numeric count, categorical count, and total column count.
len(num_cols), len(cat_cols), len(df_copy.columns)

In [None]:
# Keep only rows whose numeric features all lie within 3 standard deviations
# of their column mean.  The mask is built from the untouched `df` (identical
# to `df_copy` at this point on a fresh run) so re-running this cell does not
# keep trimming an already-filtered frame.
# NOTE: zscore propagates NaN by default and a NaN z-score fails `< 3`, so a
# column with missing values or zero variance would drop every row.
z_scores = np.abs(stats.zscore(df[num_cols]))
inlier_mask = np.asarray(z_scores < 3).all(axis=1)
# .copy() yields an independent frame, so adding columns to it later does not
# trigger SettingWithCopyWarning.
df_copy = df.loc[inlier_mask].copy()

In [None]:
# Number of rows removed by the outlier filter.
len(df) - len(df_copy)

In [None]:
# sns.pairplot(df_copy)

In [None]:
# Distinct-value count for each categorical column.
df_copy[cat_cols].nunique()

In [None]:
# Split CarName at the first space: the leading token becomes 'brand', the remainder 'model'.
name_parts = df_copy['CarName'].str.split(' ', n=1, expand=True)
df_copy[['brand', 'model']] = name_parts

In [None]:
# Normalise the model string: lower-case it, then strip spaces and parentheses.
# .str.replace is used for every step because plain Series.replace only swaps
# cells whose *entire* value equals ' ' and so never removes spaces inside a
# name.  regex=False makes '(' and ')' literal characters rather than regex
# syntax.
df_copy['model'] = (
    df_copy['model']
    .str.lower()
    .str.replace(' ', '', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)
# NOTE(review): 'model' is dropped right here, so the cleaned values are not
# used downstream — confirm whether it was meant to be kept as a feature.
df_copy = df_copy.drop(['CarName', 'model'], axis=1)

In [None]:
# Frequency of each brand string extracted from CarName.
df_copy['brand'].value_counts()

In [None]:
# Display the frame after CarName and model were dropped and brand was added.
df_copy

In [None]:
# Re-derive the categorical columns (every column that is neither a numeric
# feature, the target, nor CarName) and one-hot encode them, dropping the
# first level of each.
non_categorical = set(num_cols) | {'price', 'CarName'}
cat_cols = [name for name in df_copy.columns if name not in non_categorical]
df_copy = pd.get_dummies(df_copy, columns=cat_cols, drop_first=True)
df_copy

In [None]:
# Separate the target from the features, then hold out 20% of the rows for testing.
y = df_copy['price']
X = df_copy.drop(columns='price')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

In [None]:
# Standardise the numeric feature columns and pass all other columns through
# unchanged.  The transformer is fitted on the training split only and the
# fitted transform is then applied to the test split.
# NOTE: X_train / X_test are rebound to the transformed output, so re-run the
# split cell before re-running this one.
scaler = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[('numeric', scaler, num_cols)],
    remainder='passthrough',
)

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [None]:
# Fit a random forest with a fixed seed on the transformed training split.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

In [None]:
# Predictions for the held-out test split.
y_pred = model.predict(X_test)

In [None]:
# Coefficient of determination on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Mean absolute error on the test split, in the units of 'price'.
mean_absolute_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Mean squared error on the test split.
mean_squared_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Persist the fitted model.  The context manager closes the file handle even
# if pickling raises, instead of leaving it open until garbage collection.
# NOTE(review): "data" is the same path the CSV is read from at the top of the
# notebook; writing here would overwrite it — confirm the intended output path.
with open("data", "wb") as model_file:
    pickle.dump(model, model_file)