#### Importing necessary packages ####

In [96]:
import numpy as np;
import matplotlib.pyplot as pyplot;
import pandas as pd;
import sklearn;
import regex as re;
from sklearn.model_selection import train_test_split


In [97]:
# Verifying mission values
df = pd.read_csv("used_cars.csv");
print(df.isnull().sum())

brand             0
model             0
model_year        0
milage            0
fuel_type       170
engine            0
transmission      0
ext_col           0
int_col           0
accident        113
clean_title     596
price             0
dtype: int64


In [98]:
df.duplicated().sum()
#there is no duplicates in this data frame

0

To solve this problem of null values, we have three different approaches
<ul>
<li>Dropping all of these rows</li>
<li>Replacing it with mode or median or mean</li>
<li>Create another category for the NaN</li>
<li>Using KNNs</li>
</ul>

In [99]:
# replacing Null values with mode
df['fuel_type'] = df['fuel_type'].replace(np.nan,df['fuel_type'].mode()[0])

df['accident'] = df['accident'].replace(np.nan,df['accident'].mode()[0])

df = df.dropna(subset = ['clean_title'])

df = df.reset_index(drop=True);


In [100]:
df.isnull().sum()

brand           0
model           0
model_year      0
milage          0
fuel_type       0
engine          0
transmission    0
ext_col         0
int_col         0
accident        0
clean_title     0
price           0
dtype: int64

In [101]:
# turning milage,price values into numbers
# df = df.drop(columns = ['engine','model'])
for i,row in df.iterrows():
    number = int("".join(re.findall("\d+",row['milage'])))
    df.loc[i,'milage'] = number;
    number = int("".join(re.findall("\d+",row['price'])))
    df.loc[i,'price'] = number;
    
df['milage'] = df['milage'].astype(int);
df['price'] = df['price'].astype(int);

df

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,Ford,Utility Police Interceptor Base,2013,51000,E85 Flex Fuel,300.0HP 3.7L V6 Cylinder Engine Flex Fuel Capa...,6-Speed A/T,Black,Black,At least 1 accident or damage reported,Yes,10300
1,Hyundai,Palisade SEL,2021,34742,Gasoline,3.8L V6 24V GDI DOHC,8-Speed Automatic,Moonlight Cloud,Gray,At least 1 accident or damage reported,Yes,38005
2,INFINITI,Q50 Hybrid Sport,2015,88900,Hybrid,354.0HP 3.5L V6 Cylinder Engine Gas/Electric H...,7-Speed A/T,Black,Black,None reported,Yes,15500
3,Audi,S3 2.0T Premium Plus,2017,84000,Gasoline,292.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,Blue,Black,None reported,Yes,31000
4,BMW,740 iL,2001,242000,Gasoline,282.0HP 4.4L 8 Cylinder Engine Gasoline Fuel,A/T,Green,Green,None reported,Yes,7300
...,...,...,...,...,...,...,...,...,...,...,...,...
3408,Mercedes-Benz,E-Class E 300 4MATIC,2018,53705,Gasoline,241.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,Black,Black,At least 1 accident or damage reported,Yes,25900
3409,Bentley,Continental GT Speed,2023,714,Gasoline,6.0L W12 48V PDI DOHC Twin Turbo,8-Speed Automatic with Auto-Shift,C / C,Hotspur,None reported,Yes,349950
3410,Audi,S4 3.0T Premium Plus,2022,10900,Gasoline,349.0HP 3.0L V6 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,53900
3411,Ford,F-150 Raptor,2020,33000,Gasoline,450.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,A/T,Blue,Black,None reported,Yes,62999


Extracting new features (Horsepower, engine's displacement, engine configuration) from the engine column.

In [102]:
df.iloc[10:30]

def extract_engine_attributes(engine_str):
    horsepower = re.search(r'(\d+\.\d+)HP|\d+\.\d+', engine_str)
    displacement = re.search(r'(\d+\.\d+L|\d+\.\d+ Liter)', engine_str)
    return horsepower.group(1) if horsepower else '',\
           displacement.group(1) if displacement else ''

# Apply the function to create new columns
df[['Horsepower', 'Engine_Displacement']] = df['engine'].apply(extract_engine_attributes).apply(pd.Series)

df['Horsepower'] = pd.to_numeric(df['Horsepower'], errors = 'coerce')
df['Engine_Displacement'] = df['Engine_Displacement'].str.replace('L', '')
df['Engine_Displacement'] = pd.to_numeric(df['Engine_Displacement'], errors = 'coerce')

df['Horsepower'].fillna(df['Horsepower'].mean(), inplace = True)
df['Engine_Displacement'].fillna(df['Engine_Displacement'].mean(), inplace = True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Horsepower'].fillna(df['Horsepower'].mean(), inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Engine_Displacement'].fillna(df['Engine_Displacement'].mean(), inplace = True)


In [103]:
df = df.drop(columns=['engine'])
df

Unnamed: 0,brand,model,model_year,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,price,Horsepower,Engine_Displacement
0,Ford,Utility Police Interceptor Base,2013,51000,E85 Flex Fuel,6-Speed A/T,Black,Black,At least 1 accident or damage reported,Yes,10300,300.000000,3.7
1,Hyundai,Palisade SEL,2021,34742,Gasoline,8-Speed Automatic,Moonlight Cloud,Gray,At least 1 accident or damage reported,Yes,38005,331.445122,3.8
2,INFINITI,Q50 Hybrid Sport,2015,88900,Hybrid,7-Speed A/T,Black,Black,None reported,Yes,15500,354.000000,3.5
3,Audi,S3 2.0T Premium Plus,2017,84000,Gasoline,6-Speed A/T,Blue,Black,None reported,Yes,31000,292.000000,2.0
4,BMW,740 iL,2001,242000,Gasoline,A/T,Green,Green,None reported,Yes,7300,282.000000,4.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3408,Mercedes-Benz,E-Class E 300 4MATIC,2018,53705,Gasoline,A/T,Black,Black,At least 1 accident or damage reported,Yes,25900,241.000000,2.0
3409,Bentley,Continental GT Speed,2023,714,Gasoline,8-Speed Automatic with Auto-Shift,C / C,Hotspur,None reported,Yes,349950,331.445122,6.0
3410,Audi,S4 3.0T Premium Plus,2022,10900,Gasoline,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,53900,349.000000,3.0
3411,Ford,F-150 Raptor,2020,33000,Gasoline,A/T,Blue,Black,None reported,Yes,62999,450.000000,3.5


In [104]:
print(df.isnull().sum())


brand                  0
model                  0
model_year             0
milage                 0
fuel_type              0
transmission           0
ext_col                0
int_col                0
accident               0
clean_title            0
price                  0
Horsepower             0
Engine_Displacement    0
dtype: int64


In [105]:
from sklearn.preprocessing import OneHotEncoder

X_cat = df.select_dtypes(include='object')
X_num = df.select_dtypes(exclude='object')
print(X_num)
print(X_cat)

# print(X_cat)
encoder = OneHotEncoder(handle_unknown='ignore',sparse_output=False);
X_encoded = encoder.fit_transform(X_cat)
# print(X_encoded)

categorical_columns = [f'{col}_{cat}' for i, col in enumerate(X_cat.columns) for cat in encoder.categories_[i]]
# print(categorical_columns)

one_hot_features = pd.DataFrame(X_encoded, columns=categorical_columns)
# print(one_hot_features.shape)
# print(df.shape)


# print(one_hot_features.describe());
df = X_num.join(one_hot_features);

# print(new_df.shape)
# df


      model_year  milage   price  Horsepower  Engine_Displacement
0           2013   51000   10300  300.000000                  3.7
1           2021   34742   38005  331.445122                  3.8
2           2015   88900   15500  354.000000                  3.5
3           2017   84000   31000  292.000000                  2.0
4           2001  242000    7300  282.000000                  4.4
...          ...     ...     ...         ...                  ...
3408        2018   53705   25900  241.000000                  2.0
3409        2023     714  349950  331.445122                  6.0
3410        2022   10900   53900  349.000000                  3.0
3411        2020   33000   62999  450.000000                  3.5
3412        2020   43000   40000  248.000000                  2.0

[3413 rows x 5 columns]
              brand                            model      fuel_type  \
0              Ford  Utility Police Interceptor Base  E85 Flex Fuel   
1           Hyundai                     P

In [106]:
# # turning categorical columns into classes
# def turn_into_class(df,col_name):

#     mean_encoded = df.groupby(col_name)['price'].mean()

#     df[col_name] = df[col_name].map(mean_encoded)
    

# turn_into_class(df,col_name='brand');
# turn_into_class(df,col_name='fuel_type');
# turn_into_class(df,col_name='transmission');
# turn_into_class(df,col_name='ext_col');
# turn_into_class(df,col_name='int_col');
# turn_into_class(df,col_name='clean_title');
# turn_into_class(df,col_name='accident');
# df['fuel_type'].unique()

# # extcolor_class_index = turn_into_class(df['ext_col'],col_name='ext_col');
# # intcolor_class_index = turn_into_class(df['int_col'],col_name='int_col')
# # cleantitle_class_index = turn_into_class(df['clean_title'],col_name='clean_title')
# # accident_class_index = turn_into_class(df['accident'],col_name='accident')
# # fuel_class_index = turn_into_class(df['fuel_type'],col_name='fuel_type')
# # transmission_class_index = turn_into_class(df['transmission'],col_name='transmission')






In [107]:
# print(df.dtypes);
train_x,test_x,train_y,test_y = train_test_split(df[df.columns.drop('price')], df['price'], test_size=0.2, random_state=40);


In [108]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [109]:
# model = LinearRegression()

# # Fit the model to the training data
# model.fit(train_x,train_y)
# # Make predictions on the test data
# y_pred = model.predict(test_x)


# print(y_pred)
# # Evaluate the model
# mse = mean_squared_error(test_y, y_pred)
# r2_square = r2_score(test_y,y_pred)

# print(mse)
# print(r2_square)

[ 6.53638734e+10  8.57245826e+04  5.15148923e+10  2.19855841e+04
 -6.56264409e+10  2.60491456e+04 -3.19362778e+10  2.36410348e+04
  2.34926732e+04 -1.60290152e+10  1.65235686e+11  9.75317700e+10
  7.23560140e+04  3.82723514e+04  5.00476417e+04  3.89904403e+04
  8.73667745e+10 -1.48241448e+09 -5.28673218e+02 -4.34593077e+10
  2.10887689e+04 -1.54036538e+10  9.16071008e+03  7.01280443e+10
  2.03287980e+10 -1.13641339e+10 -1.13641216e+10  1.17685971e+05
  1.11825609e+04 -2.13269165e+02  1.31049156e+04  1.81977760e+04
  5.66147538e+11  3.92279774e+04  3.25318383e+09  3.31964835e+04
  2.97031066e+04  4.89592426e+04 -5.90012641e+10 -1.84769451e+10
 -2.55726740e+10  6.63113619e+10 -5.98136576e+10  1.65235676e+11
 -1.24337224e+10  9.33507667e+10 -6.67612781e+03  6.62574418e+04
  6.70771989e+04  2.76975360e+04  3.54665192e+04  2.11940992e+04
 -8.97191042e+10 -1.01279303e+04  1.72783241e+10  5.07359763e+09
  3.01933680e+04 -8.29532373e+09 -6.02390191e+09  1.38452847e+10
  3.26064459e+04 -4.78703

In [110]:
import statsmodels.api as sm
X = sm.add_constant(train_x)  # Adds a constant term to the predictor
model = sm.OLS(train_y, X).fit()
print(model.summary())

X = sm.add_constant(test_x) 
y_pred_statsmodels = model.predict(X)
r2_score(test_y, y_pred_statsmodels)


                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.721
Model:                            OLS   Adj. R-squared:                  0.326
Method:                 Least Squares   F-statistic:                     1.824
Date:                Tue, 20 Aug 2024   Prob (F-statistic):           6.88e-27
Time:                        14:34:57   Log-Likelihood:                -33200.
No. Observations:                2730   AIC:                         6.960e+04
Df Residuals:                    1128   BIC:                         7.907e+04
Df Model:                        1601                                         
Covariance Type:            nonrobust                                         
                                                                    coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------

0.4572369353851504

As you can see that some p-values are >= 0.001, so we can remove them (there is no evidence that these predictors are directly related to response).
