In [136]:
from os import path
from typing import List

from numpy import sqrt
from pandas import read_csv, DataFrame, factorize
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import  SelectKBest, r_regression, chi2

In [137]:
def print_scores(y_true, y_pred) -> None:
    """Print MSE, RMSE and MAE, then compare the mean actual and predicted values.

    Args:
        y_true: Observed target values; must expose a ``mean()`` method.
        y_pred: Predicted target values; must expose a ``mean()`` method.

    Returns:
        None. Everything is written to standard output.
    """
    squared_error = mean_squared_error(y_true=y_true, y_pred=y_pred)
    # RMSE is derived from the MSE, so both are reported from the same value.
    for label, value in (("mse: ", squared_error), ("rmse: ", sqrt(squared_error))):
        print(label, value)
    print("mae: ", mean_absolute_error(y_true=y_true, y_pred=y_pred))
    print(f"Actual selling prices mean : {y_true.mean()} and Predicted selling price mean : {y_pred.mean()} with a difference of : {y_true.mean() - y_pred.mean()}")

In [138]:
# Build the relative path to the raw CSV, load it, then print the column summary.
raw_csv_path = path.join("..", 'data/raw','carData.csv')
dataframe = read_csv(raw_csv_path)
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


# First tests
I wanted to try some feature selectors; however, I need to practice more with these tools, which I barely understand.  
Here are a few attempts to create a custom encoder (to avoid a different number of categories between the train and test sets after a train/test split), followed by a pipeline in which I use a `SelectFromModel`.

In [139]:
# Rebind instead of mutating in place so this cell can be re-run safely:
# errors='ignore' turns the drop into a no-op once 'Present_Price' is already gone,
# whereas the in-place version raised a KeyError on a second execution.
dataframe = dataframe.drop(columns=['Present_Price'], errors='ignore')

In [140]:
# Separate the target column from the candidate features.
selling_price = dataframe["Selling_Price"]
univariate_dataset = dataframe.drop(columns=["Selling_Price"])

# Hold out 30% of the rows; the fixed random_state keeps the split reproducible.
(
    univariate_dataset_train,
    univariate_dataset_test,
    selling_price_train,
    selling_price_test,
) = train_test_split(
    univariate_dataset,
    selling_price,
    test_size=0.3,
    random_state=41,
)

In [141]:
class CustomOneHotEncoder(BaseEstimator, TransformerMixin):
    
    def fit(self, dataframe, target_serie = None):
        return self
    
    def transform(self, dataframe, target_serie = None):
        length_categories = 0
        for column in dataframe.columns:
            length_categories += len(factorize(dataframe[column])[0])
        encoder = OneHotEncoder(min_frequency=length_categories)
        dataframe = encoder.fit_transform(dataframe)
        return dataframe

In [146]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler



column_transformer = ColumnTransformer(transformers=[
    ('Label', CustomOneHotEncoder(), [column for column in univariate_dataset.columns if univariate_dataset[column].dtype == 'object' or column == 'Owner']),
    ('Scaler', StandardScaler(), [column for column in univariate_dataset.columns if univariate_dataset[column].dtype in ('int64', 'float64') and column != 'Owner'])
    ])

best_univariate_pipeline = Pipeline([
    ('best_feature',  SelectFromModel(LinearRegression())),
    ('linear_regression', LinearRegression())
])

univariate_dataset_train_transformed = column_transformer.fit_transform(univariate_dataset_train)
best_univariate_pipeline.fit(univariate_dataset_train_transformed, selling_price_train)
univariate_dataset_test_transformed = column_transformer.fit_transform(univariate_dataset_test)
predicted_prices_univariate = best_univariate_pipeline.predict(X=univariate_dataset_test_transformed)

print_scores(selling_price_test, predicted_prices_univariate)

mse:  31.81975408675402
rmse:  5.640900113169353
mae:  3.2685486714386247
Actual selling prices mean : 4.686373626373626 and Predicted selling price mean : 4.650428571428501 with a difference of : 0.035945054945125676


# Second try
Let's analyze correlations and variance. Thanks to the heat map (for example in correlation.ipynb or the exploratory-anylysis), we can decide to test only the following features: Seller_Type, Fuel_Type, Transmission & Year.  
Year is the only numerical feature, so the categorical ones always need to be encoded.
***

## Year & Transmission

In [165]:
chosen_features = ['Year','Transmission']
# Select the chosen features directly instead of dropping every other column.
polyvariate_dataset, selling_price = dataframe[chosen_features], dataframe["Selling_Price"]
# No random_state is passed, so the split (and therefore the scores) changes on every run.
polyvariate_dataset_train, polyvariate_dataset_test, selling_price_train, selling_price_test = train_test_split(polyvariate_dataset, selling_price, test_size=0.3)

polyvariate_columns_transformers = ColumnTransformer([
    ('encode', OneHotEncoder(categories=[['Manual', 'Automatic']]), ['Transmission']),
    ('scaler', StandardScaler(), ['Year'])
])
# Fit the preprocessing on the training set only.
polyvariate_dataset_train_transformed = polyvariate_columns_transformers.fit_transform(polyvariate_dataset_train)
linear_regressor_polyvariate = LinearRegression()
linear_regressor_polyvariate.fit(X=polyvariate_dataset_train_transformed,  y=selling_price_train)

# transform (not fit_transform): the test set must be scaled with the mean and
# variance learned from the training set, otherwise 'Year' is on a different scale
# than the one the regressor was trained on.
polyvariate_dataset_test_transformed = polyvariate_columns_transformers.transform(polyvariate_dataset_test)
predicted_prices_polyvariate = linear_regressor_polyvariate.predict(X=polyvariate_dataset_test_transformed)
print_scores(y_true=selling_price_test, y_pred=predicted_prices_polyvariate)

mse:  14.347383887372835
rmse:  3.7877940661251417
mae:  2.9419428493671425
Actual selling prices mean : 4.312637362637362 and Predicted selling price mean : 4.730100083095773 with a difference of : -0.4174627204584107


## Year & Fuel Type

In [166]:
chosen_features = ['Year','Fuel_Type']
# Select the chosen features directly instead of dropping every other column.
polyvariate_dataset, selling_price = dataframe[chosen_features], dataframe["Selling_Price"]
# No random_state is passed, so the split (and therefore the scores) changes on every run.
polyvariate_dataset_train, polyvariate_dataset_test, selling_price_train, selling_price_test = train_test_split(polyvariate_dataset, selling_price, test_size=0.3)

polyvariate_columns_transformers = ColumnTransformer([
    ('encode', OneHotEncoder(categories=[['Petrol', 'Diesel', 'CNG']]), ['Fuel_Type']),
    ('scaler', StandardScaler(), ['Year'])
])
# Fit the preprocessing on the training set only.
polyvariate_dataset_train_transformed = polyvariate_columns_transformers.fit_transform(polyvariate_dataset_train)
linear_regressor_polyvariate = LinearRegression()
linear_regressor_polyvariate.fit(X=polyvariate_dataset_train_transformed,  y=selling_price_train)

# transform (not fit_transform): the test set must be scaled with the mean and
# variance learned from the training set, otherwise 'Year' is on a different scale
# than the one the regressor was trained on.
polyvariate_dataset_test_transformed = polyvariate_columns_transformers.transform(polyvariate_dataset_test)
predicted_prices_polyvariate = linear_regressor_polyvariate.predict(X=polyvariate_dataset_test_transformed)
print_scores(y_true=selling_price_test, y_pred=predicted_prices_polyvariate)

mse:  20.374852184117312
rmse:  4.513851147758122
mae:  3.289551604366828
Actual selling prices mean : 5.068571428571428 and Predicted selling price mean : 4.8228928609226385 with a difference of : 0.24567856764878915


## Year & Seller Type

In [167]:
chosen_features = ['Year','Seller_Type']
# Select the chosen features directly instead of dropping every other column.
polyvariate_dataset, selling_price = dataframe[chosen_features], dataframe["Selling_Price"]
# No random_state is passed, so the split (and therefore the scores) changes on every run.
polyvariate_dataset_train, polyvariate_dataset_test, selling_price_train, selling_price_test = train_test_split(polyvariate_dataset, selling_price, test_size=0.3)

polyvariate_columns_transformers = ColumnTransformer([
    ('encode', OneHotEncoder(categories=[['Dealer', 'Individual']]), ['Seller_Type']),
    ('scaler', StandardScaler(), ['Year'])
])
# Fit the preprocessing on the training set only.
polyvariate_dataset_train_transformed = polyvariate_columns_transformers.fit_transform(polyvariate_dataset_train)
linear_regressor_polyvariate = LinearRegression()
linear_regressor_polyvariate.fit(X=polyvariate_dataset_train_transformed,  y=selling_price_train)

# transform (not fit_transform): the test set must be scaled with the mean and
# variance learned from the training set, otherwise 'Year' is on a different scale
# than the one the regressor was trained on.
polyvariate_dataset_test_transformed = polyvariate_columns_transformers.transform(polyvariate_dataset_test)
predicted_prices_polyvariate = linear_regressor_polyvariate.predict(X=polyvariate_dataset_test_transformed)
print_scores(y_true=selling_price_test, y_pred=predicted_prices_polyvariate)

mse:  29.939408417964177
rmse:  5.471691549965529
mae:  2.6603614461885203
Actual selling prices mean : 5.372637362637362 and Predicted selling price mean : 4.777153159214688 with a difference of : 0.5954842034226742


## 3 together

In [180]:
chosen_features = ['Fuel_Type','Seller_Type', 'Transmission']
# Select the chosen features directly instead of dropping every other column.
polyvariate_dataset, selling_price = dataframe[chosen_features], dataframe["Selling_Price"]
# No random_state is passed, so the split (and therefore the scores) changes on every run.
polyvariate_dataset_train, polyvariate_dataset_test, selling_price_train, selling_price_test = train_test_split(polyvariate_dataset, selling_price, test_size=0.3)

polyvariate_columns_transformers = ColumnTransformer([
    ('encode_seller', OneHotEncoder(categories=[['Dealer', 'Individual']]), ['Seller_Type']),
    ('encode_fuel', OneHotEncoder(categories=[['Petrol', 'Diesel', 'CNG']]), ['Fuel_Type']),
    ('encode', OneHotEncoder(categories=[['Manual', 'Automatic']]), ['Transmission'])
])
# Fit the preprocessing on the training set only.
polyvariate_dataset_train_transformed = polyvariate_columns_transformers.fit_transform(polyvariate_dataset_train)
linear_regressor_polyvariate = LinearRegression()
linear_regressor_polyvariate.fit(X=polyvariate_dataset_train_transformed,  y=selling_price_train)

# transform (not fit_transform): preprocessing fitted on the training set is only
# applied to the test set, never re-fitted on it.
polyvariate_dataset_test_transformed = polyvariate_columns_transformers.transform(polyvariate_dataset_test)
predicted_prices_polyvariate = linear_regressor_polyvariate.predict(X=polyvariate_dataset_test_transformed)
print_scores(y_true=selling_price_test, y_pred=predicted_prices_polyvariate)

mse:  11.640752592719782
rmse:  3.4118547144800555
mae:  2.11657967032967
Actual selling prices mean : 4.880109890109891 and Predicted selling price mean : 5.1514423076923075 with a difference of : -0.2713324175824168


When the random state is removed from the train/test split, the scores change from one run to the next, which suggests that the lack of data could create *overfitting*. We can assume that all these features could give the same scores in this project.