#### Importing necessary packages ####

In [186]:
import numpy as np;
import matplotlib.pyplot as pyplot;
import pandas as pd;
import sklearn;
import regex as re;
from sklearn.model_selection import train_test_split


In [187]:
# Verifying missing values: load the raw data and print per-column null counts.
df = pd.read_csv("used_cars.csv")
print(df.isna().sum())

# The cell's displayed value is the number of fully duplicated rows;
# the recorded result is 0, i.e. there are no duplicates in this data frame.
df.duplicated().sum()

brand             0
model             0
model_year        0
milage            0
fuel_type       170
engine            0
transmission      0
ext_col           0
int_col           0
accident        113
clean_title     596
price             0
dtype: int64


0

To solve this problem of null values, we have four different approaches:
<ul>
<li>Dropping all of these rows</li>
<li>Replacing it with mode or median or mean</li>
<li>Create another category for the NaN</li>
<li>Using KNNs</li>
</ul>

In [188]:
# Impute missing fuel_type / accident values with each column's mode
# (its most frequent value). fillna is the dedicated API for this.
for col in ('fuel_type', 'accident'):
    df[col] = df[col].fillna(df[col].mode()[0])

# clean_title is NOT imputed: rows where it is missing are dropped, and the
# index is rebuilt so it is contiguous again (later cells join on the index).
df = df.dropna(subset=['clean_title']).reset_index(drop=True)


In [189]:
# Re-count nulls per column after imputation / row dropping.
df.isna().sum()

brand           0
model           0
model_year      0
milage          0
fuel_type       0
engine          0
transmission    0
ext_col         0
int_col         0
accident        0
clean_title     0
price           0
dtype: int64

In [190]:
# 'engine' and 'model' are not used as features.
df = df.drop(columns=['engine', 'model'])

# Turn the milage and price strings into integers by stripping every
# non-digit character and converting what remains. This is the vectorised
# equivalent of joining all digit runs of each value, without a Python-level
# row loop. The raw-string pattern avoids the invalid "\d" escape warning.
for col in ('milage', 'price'):
    df[col] = df[col].str.replace(r'\D', '', regex=True).astype(int)

df

Unnamed: 0,brand,model_year,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,price
0,Ford,2013,51000,E85 Flex Fuel,6-Speed A/T,Black,Black,At least 1 accident or damage reported,Yes,10300
1,Hyundai,2021,34742,Gasoline,8-Speed Automatic,Moonlight Cloud,Gray,At least 1 accident or damage reported,Yes,38005
2,INFINITI,2015,88900,Hybrid,7-Speed A/T,Black,Black,None reported,Yes,15500
3,Audi,2017,84000,Gasoline,6-Speed A/T,Blue,Black,None reported,Yes,31000
4,BMW,2001,242000,Gasoline,A/T,Green,Green,None reported,Yes,7300
...,...,...,...,...,...,...,...,...,...,...
3408,Mercedes-Benz,2018,53705,Gasoline,A/T,Black,Black,At least 1 accident or damage reported,Yes,25900
3409,Bentley,2023,714,Gasoline,8-Speed Automatic with Auto-Shift,C / C,Hotspur,None reported,Yes,349950
3410,Audi,2022,10900,Gasoline,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,53900
3411,Ford,2020,33000,Gasoline,A/T,Blue,Black,None reported,Yes,62999


In [191]:
# Confirm that no nulls were introduced by the column drop / numeric parsing.
print(df.isna().sum())


brand           0
model_year      0
milage          0
fuel_type       0
transmission    0
ext_col         0
int_col         0
accident        0
clean_title     0
price           0
dtype: int64


In [192]:
from sklearn.preprocessing import OneHotEncoder

# Split the frame by dtype: object (string) columns get one-hot encoded,
# everything else (model_year, milage, price) is kept unchanged.
X_cat = df.select_dtypes(include='object')
X_num = df.select_dtypes(exclude='object')
print(X_num)
print(X_cat)

# Dense output so the result can be wrapped in a DataFrame directly.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_encoded = encoder.fit_transform(X_cat)

# Column names of the form "<column>_<category>", in the encoder's output order.
categorical_columns = list(encoder.get_feature_names_out(X_cat.columns))

# Reuse X_cat's index so the join below aligns row-for-row even if df's index
# is not a fresh 0..n-1 range (join matches on index labels, not position).
one_hot_features = pd.DataFrame(
    X_encoded,
    columns=categorical_columns,
    index=X_cat.index,
)

# Final modelling frame: numeric columns followed by the one-hot columns.
df = X_num.join(one_hot_features)


      model_year  milage   price
0           2013   51000   10300
1           2021   34742   38005
2           2015   88900   15500
3           2017   84000   31000
4           2001  242000    7300
...          ...     ...     ...
3408        2018   53705   25900
3409        2023     714  349950
3410        2022   10900   53900
3411        2020   33000   62999
3412        2020   43000   40000

[3413 rows x 3 columns]
              brand      fuel_type                       transmission  \
0              Ford  E85 Flex Fuel                        6-Speed A/T   
1           Hyundai       Gasoline                  8-Speed Automatic   
2          INFINITI         Hybrid                        7-Speed A/T   
3              Audi       Gasoline                        6-Speed A/T   
4               BMW       Gasoline                                A/T   
...             ...            ...                                ...   
3408  Mercedes-Benz       Gasoline                                A/

In [193]:
# # turning categorical columns into classes
# def turn_into_class(df,col_name):

#     mean_encoded = df.groupby(col_name)['price'].mean()

#     df[col_name] = df[col_name].map(mean_encoded)
    

# turn_into_class(df,col_name='brand');
# turn_into_class(df,col_name='fuel_type');
# turn_into_class(df,col_name='transmission');
# turn_into_class(df,col_name='ext_col');
# turn_into_class(df,col_name='int_col');
# turn_into_class(df,col_name='clean_title');
# turn_into_class(df,col_name='accident');
# df['fuel_type'].unique()

# # extcolor_class_index = turn_into_class(df['ext_col'],col_name='ext_col');
# # intcolor_class_index = turn_into_class(df['int_col'],col_name='int_col')
# # cleantitle_class_index = turn_into_class(df['clean_title'],col_name='clean_title')
# # accident_class_index = turn_into_class(df['accident'],col_name='accident')
# # fuel_class_index = turn_into_class(df['fuel_type'],col_name='fuel_type')
# # transmission_class_index = turn_into_class(df['transmission'],col_name='transmission')






In [194]:
# Hold out 20% of the rows for testing; 'price' is the target and every other
# column is a feature. random_state pins the shuffle so the split is repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    df.drop(columns='price'),
    df['price'],
    test_size=0.2,
    random_state=40,
)


In [195]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [196]:
# Fit a linear regression on the training split (fit returns the estimator).
model = LinearRegression().fit(train_x, train_y)

# Predict prices for the held-out rows and show the raw predictions.
y_pred = model.predict(test_x)
print(y_pred)

# Evaluate on the test split: mean squared error and R^2.
mse = mean_squared_error(test_y, y_pred)
r2_square = r2_score(test_y, y_pred)
print(mse)
print(r2_square)

[ 2.44356297e+05  5.99341734e+04  5.84410415e+04  3.05983849e+04
  2.60413040e+04  1.59273704e+04  2.50649064e+04  2.62766965e+04
  1.16194570e+04  5.18777720e+04  3.40702503e+04  4.75942221e+04
  7.45908337e+04  4.73298929e+04  6.63815424e+04 -3.28457777e+03
  5.20285088e+04  4.52508940e+04  5.60641137e+03  8.37802312e+04
  2.78005587e+04  5.19871316e+04  2.69972132e+04  4.15350146e+04
  4.80357037e+04  1.81173165e+04  2.97533406e+04  6.71271895e+04
  4.79843547e+04  2.83627083e+03  2.62761871e+04  1.20910936e+04
  1.29585731e+05  4.19929929e+04  3.34342394e+04  4.66270405e+04
  1.69392165e+04  3.62528273e+04  5.20422328e+04  6.72271192e+04
  1.44662691e+04  3.22544698e+04  1.39687184e+05  2.22073834e+04
  1.79740575e+04  1.36316196e+04 -5.00745170e+02  3.52619856e+04
  4.37709464e+04  2.99202240e+04  5.63247885e+04  4.00172945e+04
  3.94850407e+04  4.90186111e+03  9.79185826e+04 -3.54770693e+03
  8.14705004e+04  1.32110032e+04  1.81820399e+04  7.16495244e+04
  3.01011042e+04  5.35420

In [197]:
import statsmodels.api as sm

# Fit OLS on the training split with a constant term added to the predictors,
# then print the full regression summary (coefficients, p-values, R^2, ...).
# NOTE: from here on `model` refers to the statsmodels results object, not the
# scikit-learn estimator fitted in the previous cell.
X = sm.add_constant(train_x)
model = sm.OLS(train_y, X).fit()
print(model.summary())

# Build the test design matrix the same way and score the predictions;
# the cell's displayed value is the R^2 on the held-out split.
X = sm.add_constant(test_x)
y_pred_statsmodels = model.predict(X)
r2_score(test_y, y_pred_statsmodels)


                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.435
Model:                            OLS   Adj. R-squared:                  0.379
Method:                 Least Squares   F-statistic:                     7.706
Date:                Mon, 19 Aug 2024   Prob (F-statistic):          8.37e-175
Time:                        23:00:14   Log-Likelihood:                -34165.
No. Observations:                2730   AIC:                         6.883e+04
Df Residuals:                    2481   BIC:                         7.030e+04
Df Model:                         248                                         
Covariance Type:            nonrobust                                         
                                                      coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------

0.43120601086830157

As you can see, some p-values are >= 0.001, so we can remove the corresponding predictors (there is no evidence that these predictors are directly related to the response).
