# Osher Dighorkar   
# Yuval Brunshtein  
# Github -  https://github.com/YuvalBru/ElasticNetRegression

## Libraries

In [92]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import random


In [93]:
df = pd.read_csv('C:\\Users\\Yuval\\Downloads\\dataset.csv')
df.head(5)

Unnamed: 0,manufactor,Year,model,Hand,Gear,capacity_Engine,Engine_type,Prev_ownership,Curr_ownership,Area,City,Price,Pic_num,Cre_date,Repub_date,Description,Color,Km,Test,Supply_score
0,יונדאי,2015,i35,2,אוטומטית,1600,בנזין,פרטית,פרטית,רעננה - כפר סבא,רעננה,51000.0,2.0,11/07/2023,11/07/2023,['רכב שמור בקנאות\nמוכרת עקב קבלת רכב חברה'],כחול כהה מטאלי,144000,,
1,ניסאן,2018,ניסאן מיקרה,1,אוטומטית,1200,בנזין,פרטית,פרטית,מושבים בשרון,אבן יהודה,49000.0,0.0,06/04/2022,22/05/2022,['שמורה כל התוספות'],כחול בהיר,69000,,
2,סוזוקי,2010,סוזוקי סוויפט,1,אוטומטית,1450,בנזין,,,רמת,רמת,22500.0,1.0,29/10/2022,29/10/2022,['רכב במצב מתוחזק ברמה גבוהה טסט עד אפריל 2023'],,145000,,
3,טויוטה,2016,אוריס,1,טיפטרוניק,1600,בנזין,פרטית,פרטית,נס ציונה - רחובות,רחובות,63000.0,5.0,16/05/2024,16/05/2024,['אוטו במצב חדש!! שמור בקנאות!! נהג יחיד מטופל...,אפור מטאלי,27300,,
4,קיה,2012,פיקנטו,1,אוטומטית,1248,בנזין,,,"ראשל""צ והסביבה",ראשון לציון,37000.0,1.0,13/06/2022,13/06/2022,['שמור'],,70000,,4.0


# prepare_data function: 
## In the following function we deal with the data we deem important and prepare it in order for it to be the best fit for training our Elastic Net Model. 
### 1. We drop columns which we deem unrelevant for the training, Repub_date is a date which is defined  in the site which means that it has no factor on the training (with it's specific characteristics) the same goes for Description and Pic_num, the Test column is filled with nans and therefore we can't fill it with data which is accurate and reliable so we remove it. Supply_score column was deemed unneccessary in the model due to it's low impact on the elastic net model that and the amount of NANS in the column deem it an unrequired column, City column is removed because Area column exists and we deem it more appropriate for training the model.
### 2. We take the creation date of the ad and subtract the year of the car we do that in order to create a new column which will be described as the age of the car.
### 3. We then notice that the types of the columns aren't as they should be so we alter that. 
### 4. We create two new columns which are a mathematical combination of several different columns
##### a. Feature_Comb1: Feature_comb1 exists in order to normalize the Km the column relatively to the age of the car and according to the hand of the car 
##### b. Feature_Comb2: Feature_Comb2 basically takes the ratio of car_age with respect to  hand. We deemed it an helpful feature in training the model.
### 5. We then turn the aforementioned columns to float types and get rid of unneccessary categories in Curr_ownership column we do the same in the Gear column
### 6. We notice that a lot of people write km as a value smaller than 1000 for example (real example) a person wrote 200 kms written on the car it is uncommon for people to sell a car after 200 km of driving especially if the car is Hand = 2 (continued from the aforementioned example) (Joke) so we take those values and multiply them by 1000 which is the typical language slang. (It is common to hear someone say 200 K Km (K being translating it to 200,000 Km). We then take the median value of Km to fill the NAN values in order to not be affected by extreme values low and high(If we were to take the mean we would be exposed to such values).
### 7. We then get rid of an extra category in Engine type and we notice that gas cars and hybrid cars can be rather well distinguished by their engine capacity and so therefore we fill NANS accordinf to those engine capacity.
### 8. We checked category probabillistic distribution in both Curr_ownership column and Prev_ownership column finding that there roughly is a non existant distribution in Curr_ownership so we simply fill the NAN values with the most common value. However in Prev_ownership we find a well defined probability distribution so we calculate the probability for each category and fill the NANS accordingly.


In [96]:
def prepare_data(df):
    df = df.drop([
                 'Repub_date','Description','Test' ,'Pic_num' , 'Supply_score', 'City'],
                  axis = 1
                 )
    df['Cre_date'] = pd.to_datetime(df['Cre_date'] , errors='coerce')
    #Subtracting the creation date from the year of the car thereby calculating the age of the car
    df['car_age'] = df['Cre_date'].dt.year - df['Year']
    integer_col = ['Km','car_age', 'capacity_Engine']
    cat_col = ['Year','Gear','Engine_type','Prev_ownership','Curr_ownership' ,'Area','Color','manufactor','model','Hand']
    df['Price'] = df['Price'].astype(str).str.replace(',', '').astype(float)   
    for col in integer_col:
        df[col] = pd.to_numeric(df[col] , errors = 'coerce').astype('Int64')
    df['Feature_Comb1'] =(df['Km']/(df['car_age']*1000+10000))  - df['Hand']*100
    df['Feature_Comb2'] = df['car_age']/df['Hand']
    df[cat_col] = df[cat_col].astype('category')
    df['Feature_Comb1'] = df['Feature_Comb1'].astype(float)
    df['Feature_Comb2'] = df['Feature_Comb2'].astype(float)
    df['Curr_ownership'] = df['Curr_ownership'].replace('חברה', 'רה')
    df['Curr_ownership'] = df['Curr_ownership'].replace('חברה', 'חב')
    df['Curr_ownership'] = df['Curr_ownership'].replace('לא מוגדר' , None)

    #Gear Column Handling
    df['Gear'] = df['Gear'].replace('אוטומט' , 'אוטומטית')
    #Km column we take values which are smaller than 1000 and multiply them by 1000 due to the regularly used term 
    # 200 K instead of 200000 this will lead to the model being able to interpert the data better
    df['Km'].fillna(0, inplace=True)
    df['Km'] = df['Km'].apply(lambda x: x * 1000 if 1 <= x <= 1000 else x)
    df['Km'].fillna(df['Km'].median() , inplace = True)
    df['Km'] = df['Km'].replace(0, df['Km'].median())
    
    #Engine type column we basically notice that cars with Gas type engines require more engine capacity 
    #We also notice that both gas cars and hybrid cars have significant effects on the model with it's coefficients being 
    #high valued to opposites sides hybrid (+) and gas(-) therefore we take conditions in order to fill in the nans in the
    #column we also merge some categories in order to later on minimize the amount of features the model takes.
    df['Engine_type'] = df['Engine_type'].replace('היבריד' , 'היברידי')
    df.loc[df['Engine_type'].isna() & (df['capacity_Engine'] > 2250), 'Engine_type'] = 'גז'
    df.loc[df['Engine_type'].isna() & (df['capacity_Engine'] <= 2250), 'Engine_type'] = 'היברידי'
    #Filling the Nans of the current ownership with the most common value since there isn't a significant distribution in the 
    df['Curr_ownership'] = df['Curr_ownership'].fillna('פרטית')
    #In previous ownership we see a significant distribution in the data therefore we fill the NANS in the data using 
    #the distribution we have found. 
    prev_data = {
      'Prev_ownership': [
        'פרטית', 'ליסינג', 'לא מוגדר', 'השכרה', 'אחר', 'חברה', 'מונית', None, 'ממשלתי'
          ],
    'Count': [543, 103, 43, 36, 27, 14, 4, 3, 1]
    }
    df_counts_prev = pd.DataFrame(prev_data)
    total_count_prev = df_counts_prev['Count'].sum()
    df_counts_prev['Probability'] = df_counts_prev['Count'] / total_count_prev
    
    nan_indices = df['Prev_ownership'][df['Prev_ownership'].isna()].index
    fill_values = random.choices(prev_data['Prev_ownership'], df_counts_prev['Probability'], k=len(nan_indices))
    df['Prev_ownership'].loc[nan_indices] = fill_values

    return df

## We create a pipeline that does final touches on the prepared given data to avoid errors in the elastic net model. We do that by doing the following actions:
### 1 We split the features into categorical features and numerical features in order to deal each part seperately thereby assigning each feature according to it's type.
### 2. Turning any nans that we might have not handled in the prepared_data  into the median value 
##### Note: In the prepare_data() function we organize data which we deem important for the model according to our analysis of the features while building and training the model therefore features which we deemed as less important were filled with median values in the numerical_transformer pipeline part using the SimpleImputer function.
### 3.categorical features with their new column to each category using the OneHotEncoder where we basically create a column for each category in a feature where the row where that category exists gets a 1 in the corresponding column and 0 if it doesn't exist in the row. Similarly as (2) we fill NANS with unknown creating a new category in these features. (The same note in (2) applies here as well) 
### 4. We use a Polynomial Features in order to try to  capture correlation between the polynomial combination of the features and the target. note that we take 2nd degree polynom in order to capture those correlations meaning that we will be trying. The following new features if the input is (feature1,feature2) then we will get the following new features (feature1^2,feature2^2,feature1*feature2*2,).
### 5. We then apply the numerical and categorical transformers on our features (X) using ColumnTransformer function.
### 6. We define another part of the pipeline named model  where we define the regression model which we want to use in our case as required an Elastic Net model and we define it's parameters max_iter and tol. Note that max_iter defines  the amount of steps we allow the gradient decent algorithm (which obviously optimizes the loss function) to take before quitting where the tol parameter defines how close to the actual point we want to get.
### 7. We then define param_grid which will later on define the hyperparameters of the ElasticNet model using gridsearch (An explanation in the next section)
### 8. GridSearchCV() function is the function that we use to train the model, the aforementioned function performs experiments on several different levels. The first level we want to refer to is the hyperparameter level , it examines the model on each of  the options defined in param_grid. The second level being 10-fold-cross-validation, GridSearhCV() performs cross-validation thereby creating 10 different models which out of them we will choose the best one according to the RMSE parameter.  It's also important to note that the hyperparameters which we mentioned alpha and l1 are defining the following things for the ElasticNet Regression:
##### a. Alpha Hyperparameter - this parameter helps us control the strength of the regularization in our model. 
##### b. l1 Hyperparameter- this parameter defines to the model how much of ridge penalty to take and how much laso penalty to take given that the Elastic Net Regression model penalty is a mix of the two. (l1 = 0 Ridge  Penalty is used , l1 = 1  Lasso Penalty is used  0< l1 < 1 it's a mixture of the two).
### 9. Finally we split the data into train test where 20% of the data is test data in order to calculate the RMSE and 80% is training data, note that the gridsearch will perform the aforementioned actions on the train set. If it were to perform it on the test set we would or without splitting the data we would get data leakage and therefore our RMSE wouldn't be worth anything. 

In [97]:
df2 = prepare_data(df)
#Splitting the data into X (Features) and y (Predictors)
X = df2.drop(columns=['Price' ,'Cre_date'])
y = df2['Price']

#Splitting the columns of the database to categorical columns and non categorical columns
categorical_cols = X.select_dtypes(include=['category']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

#Simpleimputer fills the nans in each numerical column with the median value of the column (Elastic net can't accept NANS)
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('poly', PolynomialFeatures(degree=2, include_bias=False))

])

# Preprocessing for categorical data: fill in missing values and one-hot encoder
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
#Defining the preprocessor of the transformers on the numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])
# Defining the model  
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', ElasticNet(max_iter = 10000, tol = 0.001))
])
#Parameters values to examine and optimize
param_grid = {
    'regressor__alpha': [0.001, 0.01, 0.1, 1.0, 10.0],
    'regressor__l1_ratio': [0.1, 0.5, 0.9],
}
#Defining grid_search
grid_search = GridSearchCV(model, param_grid, cv=10, scoring='neg_mean_squared_error')

#Splitting into training set and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Training the model
grid_search.fit(X_train, y_train)




  df['Cre_date'] = pd.to_datetime(df['Cre_date'] , errors='coerce')
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_co

# Model Measurement. 
## We measure the model's goodness of fit using RMSE which is the root  of the MSE which is  the mean of squared errors. error being defined as e_i = (x_pred_i - x_real_i). Therefore we believe RMSE is the appropriate measurement for goodness of fit of the model.

In [98]:
y_pred = grid_search.predict(X_test)

print(f'RMSE: {mean_squared_error(y_test, y_pred)**0.5}')


RMSE: 9887.706934201085


# Extracting  each feature of the model with the corresponding coefficient 

In [99]:
best_estimator = grid_search.best_estimator_


best_model = best_estimator.named_steps['regressor']
coefficients = best_model.coef_

num_features = best_estimator.named_steps['preprocessor'].transformers_[0][1].named_steps['poly'].get_feature_names_out(numerical_cols)
cat_features = best_estimator.named_steps['preprocessor'].transformers_[1][1].named_steps['onehot'].get_feature_names_out(categorical_cols)

feature_names = list(num_features) + list(cat_features)

feature_importance = dict(zip(feature_names, coefficients))

feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
})
feature_importance_df

Unnamed: 0,Feature,Coefficient
0,capacity_Engine,9.310895
1,Km,-0.069370
2,car_age,-4774.835032
3,Feature_Comb1,-52.127218
4,Feature_Comb2,-236.907125
...,...,...
468,Color_סגול חציל,-383.987421
469,Color_שחור,1962.963277
470,Color_שמפניה,-6590.655830
471,Color_תכלת,-4268.650222


In [100]:
top_coefficients = feature_importance_df.sort_values(by='Coefficient', ascending=False).head(5)
top_coefficients

Unnamed: 0,Feature,Coefficient
133,model_I-MIEV,34882.273114
137,model_RCZ,31036.363025
225,model_מוסטנג,30263.44719
278,model_סקודה אוקטביה RS (2014),29462.944066
288,model_סקודה סופרב (2016),28119.565109


In [101]:
top_coefficients = feature_importance_df.sort_values(by='Coefficient', ascending=True).head(5)
top_coefficients

Unnamed: 0,Feature,Coefficient
213,model_לנסר ספורטבק,-38045.713964
270,model_ספייס סטאר,-22028.130942
250,model_סוזוקי סלריו,-21534.463042
94,model_108,-18743.767651
170,model_אטראז',-18618.436675


# The Top 5 features with the biggest coefficients are: 

## 
## 1.  -38045.71 - model_ - לנסר ספורטבק
## 2.  34882.27 -  model_I-MIEV
## 3. 31036.36 -  model_RCZ 
## 4. 30263.44 - model_מוסטנג
## 5. 29462.944066 - model_סקודה אוקטביה RS (2014)   