In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read 'cardekho_imputated.csv', using the file's first column as the row index.
df = pd.read_csv(
    'cardekho_imputated.csv',
    index_col=[0],
)

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

In [None]:
# Remove the 'car_name' and 'brand' columns.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell idempotent: running it again after the columns are already gone
# no longer raises KeyError.
df = df.drop(columns=['car_name', 'brand'], errors='ignore')

In [None]:
# Preview the first five rows again after the column drop.
df.head(n=5)

In [None]:
# Distinct values of the 'model' column, in order of first appearance.
pd.unique(df['model'])

In [None]:
# Row count for each 'seller_type' value.
df.loc[:, 'seller_type'].value_counts()

In [None]:
# Row count for each 'fuel_type' value.
df.loc[:, 'fuel_type'].value_counts()

In [None]:
## Separate the dependent (target) column from the independent features
y = df['selling_price']
X = df.drop(columns=['selling_price'])

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encoder instance used below to turn the 'model' column into integer codes.
le = LabelEncoder()

In [None]:
# Replace the 'model' column with the integer codes produced by the encoder.
X = X.assign(model=le.fit_transform(X['model']))

In [None]:
# Distinct encoded values now stored in the 'model' column.
pd.unique(X['model'])

In [None]:
# Preview the first five rows of the feature frame.
X.head(n=5)

In [None]:
# Split the feature columns by dtype: object columns are treated as
# categorical, everything else as numeric.
num_feature = X.select_dtypes(exclude=['object']).columns
cat_feature = X.select_dtypes(include=['object']).columns

In [None]:
# Display the names of the object-dtype (categorical) columns.
cat_feature

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numeric columns are standardised; object columns are one-hot encoded with
# the first level of each feature dropped.
scaler = StandardScaler()
oh_encode = OneHotEncoder(drop='first')

preprocessor = ColumnTransformer(
    transformers=[
        ('StandardScaler', scaler, num_feature),
        ('OneHotEncoder', oh_encode, cat_feature),
    ]
)


In [None]:
# Display the configured ColumnTransformer for a visual check.
preprocessor

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the preprocessor on the training rows only and replace X_train with the
# transformed output.
# NOTE(review): X_train is overwritten here, so re-running this cell on its own
# feeds already-transformed data back into the preprocessor — re-run from the
# train/test split cell instead.
X_train = preprocessor.fit_transform(X_train)

In [None]:
# Apply the already-fitted preprocessor to the test rows (transform only, no refit).
X_test = preprocessor.transform(X_test)

In [None]:
# Wrap the transformed training matrix in a DataFrame for display.
pd.DataFrame(data=X_train)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
from sklearn.metrics import mean_squared_error, r2_score


def evaluate_model(true, predicted):
    """Score a set of predictions against the ground truth.

    Parameters
    ----------
    true : target values, passed unchanged to the sklearn metric functions.
    predicted : predicted values, passed unchanged to the sklearn metric functions.

    Returns
    -------
    tuple
        ``(mse, rmse, r2)`` where ``rmse`` is the square root of ``mse``.
    """
    squared_error = mean_squared_error(true, predicted)
    return squared_error, np.sqrt(squared_error), r2_score(true, predicted)

In [None]:
# Baseline candidates, keyed by the display name printed in the report loop.
# Estimators that use randomness are seeded (same seed as the train/test
# split) so the reported metrics are identical on every Restart & Run All.
models = {
    'LinearRegression': LinearRegression(),
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'KNN': KNeighborsRegressor(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Adaboost Regressor': AdaBoostRegressor(random_state=42),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42)
}

In [None]:
# Fit every candidate model and print its train/test metrics.
# Iterating over items() yields each display name with its estimator directly,
# instead of rebuilding list(models.values()) / list(models.keys()) on every pass.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    ## Make predictions
    y_train_pred = model.predict(X_train)
    y_pred = model.predict(X_test)

    ## Scores
    train_mse, train_rmse, train_score = evaluate_model(y_train, y_train_pred)
    test_mse, test_rmse, test_score = evaluate_model(y_test, y_pred)

    print('------------------------------------------')
    print(model_name)
    print('------------------------------------------')
    print('Model Training Performance  : ')
    print(f"MSE : {train_mse}")
    print(f"RMSE : {train_rmse}")
    print(f"R2 Score : {train_score}")

    print('-------------------------------------------')
    print('Model Testing Performance  : ')
    print(f"MSE : {test_mse}")
    print(f"RMSE : {test_rmse}")
    print(f"R2 Score : {test_score}")





In [None]:
# Initialize a few parameters for hyperparameter tuning.
# Random forest search space. max_features previously listed "auto", which
# current scikit-learn rejects for RandomForestRegressor; 1.0 (use all
# features) is the documented equivalent for regressors.
rf_params = {"max_depth": [5, 8, 15, None, 10],
             "max_features": [5, 7, 1.0, 8],
             "min_samples_split": [2, 8, 15, 20],
             "n_estimators": [100, 200, 500, 1000]}

# Gradient boosting search space.
# - The key must be exactly 'loss': the previous 'loss: Literal' is not an
#   estimator parameter, so the search failed with "Invalid parameter".
# - min_samples_split must be an integer >= 2 (or a float fraction), so the
#   invalid value 1 is removed.
# NOTE(review): learning_rate=10 is accepted by the estimator but is unusually
# large — confirm it is intended to be part of the search.
gradient_param = {
    'learning_rate': [10, 1, 0.1, 0.01],
    'n_estimators': [40, 50, 60, 70, 80, 90],
    'loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
    'criterion': ['friedman_mse', 'squared_error'],
    'min_samples_split': [2, 3, 4, 5]
}


In [None]:
# Display the random forest search space.
rf_params

In [None]:
# Display the gradient boosting search space.
gradient_param

In [None]:
# (display name, estimator, search space) triples to tune.
# The estimators are seeded so cross-validation scores, and therefore the
# selected parameters, are repeatable between runs.
randomcv_models = [
    ('Random Forest', RandomForestRegressor(random_state=42), rf_params),
    ('GradientBoostingRegressor', GradientBoostingRegressor(random_state=42), gradient_param)
]

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Best parameter set found for each tuned model, keyed by display name.
model_param = {}

for name, model, params in randomcv_models:
    # random_state fixes which 100 parameter combinations are sampled, so the
    # search is reproducible; 3-fold CV, all CPU cores.
    random = RandomizedSearchCV(estimator=model,
                                param_distributions=params,
                                n_iter=100,
                                cv=3,
                                verbose=2,
                                n_jobs=-1,
                                random_state=42)

    random.fit(X_train, y_train)
    model_param[name] = random.best_params_

# Report the winning parameters per model.
for model_name, best_params in model_param.items():
    print(f"---------------- Best Params for {model_name} -------------------")
    print(best_params)
    

In [None]:
# Models re-evaluated with hand-set hyperparameters, keyed by display name.
# Estimators that use randomness are seeded (same seed as the train/test
# split) so the reported metrics are repeatable between runs.
models = {
    'KNN': KNeighborsRegressor(n_neighbors=10),
    'Random Forest': RandomForestRegressor(n_estimators=100, min_samples_split=2,
                                           max_features=8, max_depth=15,
                                           random_state=42),
    'Adaboost': AdaBoostRegressor(n_estimators=80, loss='linear',
                                  learning_rate=0.1, random_state=42)
}

In [None]:
# Fit every model in `models` and print its train/test metrics.
# Iterating over items() yields each display name with its estimator directly,
# instead of rebuilding list(models.values()) / list(models.keys()) on every pass.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    ## Make predictions
    y_train_pred = model.predict(X_train)
    y_pred = model.predict(X_test)

    ## Scores
    train_mse, train_rmse, train_score = evaluate_model(y_train, y_train_pred)
    test_mse, test_rmse, test_score = evaluate_model(y_test, y_pred)

    print('------------------------------------------')
    print(model_name)
    print('------------------------------------------')
    print('Model Training Performance  : ')
    print(f"MSE : {train_mse}")
    print(f"RMSE : {train_rmse}")
    print(f"R2 Score : {train_score}")

    print('-------------------------------------------')
    print('Model Testing Performance  : ')
    print(f"MSE : {test_mse}")
    print(f"RMSE : {test_rmse}")
    print(f"R2 Score : {test_score}")



