In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
import seaborn as sns
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import pickle

In [None]:
# Load the cleaned cars dataset; the path is relative, so the CSV must sit in the working directory.
data=pd.read_csv(r"Cars_Dataset_Cleaned.csv")

In [None]:
# Summary statistics of the loaded frame (describe() with its default column selection).
data.describe()

In [None]:
# Column names, non-null counts and dtypes in one overview.
data.info()

In [None]:
# Dtype of every column.
data.dtypes

In [None]:
# Peek at 10 randomly chosen rows (no seed is passed, so the rows differ between runs).
data.sample(10)

# Cleaning CC/Battery Capacity for ML models

In [None]:
# Remove the raw "CC/Battery Capacity" column from the working frame.
data = data.drop(columns=["CC/Battery Capacity"])


In [None]:
# Count missing values in each column.
data.isnull().sum()

In [None]:
# Replace missing values in these columns with the column mean.
#
# The result is assigned back to the frame instead of calling
# `data[col].fillna(..., inplace=True)`: an in-place update on a column
# selection is a chained assignment, which is deprecated in pandas 2.x and
# leaves `data` unchanged once Copy-on-Write is active.
mean_imputed_columns = [
    "Cleaned Capacity",
    "Torque",
    "Cars Prices",
    "Performance(0 - 100 )KM/H",
]
for col in mean_imputed_columns:
    data[col] = data[col].fillna(data[col].mean())

# NOTE(review): "Cars Prices" is used as the prediction target further down;
# mean-imputing a target fabricates labels - consider dropping those rows instead.

In [None]:
# Bar chart of the summed "Cars Prices" within each "Company Names" group.
category_sales = data.groupby('Company Names')['Cars Prices'].sum()

# Explicit figure/axes so the labels are attached to this specific plot.
fig, ax = plt.subplots()
category_sales.plot(kind='bar', ax=ax)
ax.set_title('Cars prices')
ax.set_xlabel('Company Names')  # was misspelled 'Campany Names'
ax.set_ylabel('Cars Prices($)')
plt.show()

## Data Processing for ML

In [None]:

# One-hot encode the listed columns into integer indicator columns.
categorical_columns = ["Company Names", "Cars Names", "Engines", "Fuel Types"]
data = pd.get_dummies(data, columns=categorical_columns, dtype=int)




In [None]:
# Missing-value count per column after encoding.
# `data.isna` without parentheses only displayed the bound method, not a result.
data.isna().sum()

In [None]:
def extract_number(value):
    """Pull a float out of ``value`` by keeping only its digits and dots.

    ``value`` is converted with ``str()``; every character that is neither a
    digit nor ``'.'`` is discarded and the remainder is parsed with ``float``.
    Because signs and exponent letters are discarded too, ``"-5"`` yields
    ``5.0``.

    Returns:
        The parsed float, or ``None`` when the remaining characters do not
        form a valid number (e.g. no digits at all, or more than one dot).
    """
    kept = ''.join(c for c in str(value) if c.isdigit() or c == '.')
    try:
        return float(kept)
    except ValueError:
        # Only a failed parse means "no number"; anything else (including
        # KeyboardInterrupt) must propagate instead of being silently swallowed.
        return None

# Run the two measurement columns through extract_number and store the
# results under new column names.
data["Total_Speed"] = data["Total Speed"].apply(extract_number)
data["Performance"] = data["Performance(0 - 100 )KM/H"].apply(extract_number)

# The source columns are replaced by the parsed ones created above.
data = data.drop(columns=["Total Speed", "Performance(0 - 100 )KM/H"])

# Target is the price column; every remaining column is a feature.
y = data["Cars Prices"]
x = data.drop(columns=["Cars Prices"])

In [None]:

# Split first, then fit the scaler on the training rows only, so statistics
# of the held-out rows never leak into preprocessing.
x_train_raw, x_test_raw, y_train, y_test = model_selection.train_test_split(
    x, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train_raw)
x_test = scaler.transform(x_test_raw)
print(x_train.shape, y_train.shape)

# Candidate regressors. The stochastic ones get a fixed seed so the
# comparison is reproducible after a kernel restart.
modeles = {
    "RandomForestRegressor": RandomForestRegressor(random_state=42),
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=42),
    "LinearRegression": LinearRegression(),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=42),
    "KNeighborsRegressor": KNeighborsRegressor(),
}

# The loop variable is `model`, not `x`: reusing `x` overwrote the feature
# matrix and made this cell impossible to re-run.
for name, model in modeles.items():
    model.fit(x_train, y_train)

    # Report R^2 on both splits; a training-only score cannot rank models
    # because it rewards memorisation.
    train_score = r2_score(y_train, model.predict(x_train))
    test_score = r2_score(y_test, model.predict(x_test))
    print(f"{name}  R2 score: train={train_score} test={test_score}")

In [None]:
# Random forest scored (R^2) on the test split.
# random_state is pinned so the reported score is reproducible across runs,
# matching the seeded DecisionTreeRegressor cell.
best_model2=RandomForestRegressor(random_state=42)
best_model2.fit(x_train,y_train)
y_pre=best_model2.predict(x_test)
score1=r2_score(y_test , y_pre)
print(f" RandomForestRegressor accuracy score is {score1}")

In [None]:
# Seeded decision tree with explicit depth/split limits, scored (R^2) on the test split.
tree_params = {"max_depth": 10, "min_samples_split": 5, "random_state": 42}
best_model = DecisionTreeRegressor(**tree_params).fit(x_train, y_train)

y_pre = best_model.predict(x_test)
score1 = r2_score(y_true=y_test, y_pred=y_pre)
print(f"DecisionTreeRegressor accuracy score is {score1}")

In [None]:
# Ordinary least-squares model, scored (R^2) on the test split.
best_model1 = LinearRegression().fit(x_train, y_train)

y_pre = best_model1.predict(x_test)
score1 = r2_score(y_true=y_test, y_pred=y_pre)
print(f"LinearRegression accuracy score is {score1}")

## Now we can choose DecisionTreeRegressor for the Streamlit app

In [None]:
# Persist the three artefacts written by this notebook: the fitted
# `best_model`, the fitted `scaler`, and the ordered feature-column list.
# Every file is opened in a `with` block so the handle is flushed and closed
# even if pickling fails (the bare `open(...)` calls leaked their handles).
filename='car prices.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)

with open('scaler.sav', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

with open('model_columns.pkl', 'wb') as f:
    pickle.dump(data.drop("Cars Prices", axis=1).columns.tolist(), f)

