#### Importing necessary packages ####

In [1]:
import numpy as np;
import matplotlib.pyplot as pyplot;
import pandas as pd;
import sklearn;
import regex as re;
from sklearn.model_selection import train_test_split


In [2]:
# Verifying mission values
df = pd.read_csv("used_cars.csv");
print(df.isnull().sum())

df.duplicated().sum()
#there is no duplicates in this data frame

brand             0
model             0
model_year        0
milage            0
fuel_type       170
engine            0
transmission      0
ext_col           0
int_col           0
accident        113
clean_title     596
price             0
dtype: int64


0

To solve this problem of null values, we have three different approaches
<ul>
<li>Dropping all of these rows</li>
<li>Replacing it with mode or median or mean</li>
<li>Create another category for the NaN</li>
<li>Using KNNs</li>
</ul>

In [3]:
# replacing Null values with mode
df['fuel_type'] = df['fuel_type'].replace(np.nan,df['fuel_type'].mode()[0])

df['accident'] = df['accident'].replace(np.nan,df['accident'].mode()[0])

df = df.dropna(subset = ['clean_title'])


In [4]:
df.isnull().sum()

brand           0
model           0
model_year      0
milage          0
fuel_type       0
engine          0
transmission    0
ext_col         0
int_col         0
accident        0
clean_title     0
price           0
dtype: int64

In [5]:
df = df.drop(columns = ['engine','model'])
# turning milage,price values into numbers
for i,row in df.iterrows():
    number = int("".join(re.findall("\d+",row['milage'])))
    df.loc[i,'milage'] = number;
    number = int("".join(re.findall("\d+",row['price'])))
    df.loc[i,'price'] = number;
    
df

Unnamed: 0,brand,model_year,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,price
0,Ford,2013,51000,E85 Flex Fuel,6-Speed A/T,Black,Black,At least 1 accident or damage reported,Yes,10300
1,Hyundai,2021,34742,Gasoline,8-Speed Automatic,Moonlight Cloud,Gray,At least 1 accident or damage reported,Yes,38005
3,INFINITI,2015,88900,Hybrid,7-Speed A/T,Black,Black,None reported,Yes,15500
6,Audi,2017,84000,Gasoline,6-Speed A/T,Blue,Black,None reported,Yes,31000
7,BMW,2001,242000,Gasoline,A/T,Green,Green,None reported,Yes,7300
...,...,...,...,...,...,...,...,...,...,...
4003,Mercedes-Benz,2018,53705,Gasoline,A/T,Black,Black,At least 1 accident or damage reported,Yes,25900
4004,Bentley,2023,714,Gasoline,8-Speed Automatic with Auto-Shift,C / C,Hotspur,None reported,Yes,349950
4005,Audi,2022,10900,Gasoline,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,53900
4007,Ford,2020,33000,Gasoline,A/T,Blue,Black,None reported,Yes,62999


In [6]:
# turning categorical columns into classes

def turn_into_class(df,col_name):

    mean_encoded = df.groupby(col_name)['price'].mean()

    df[col_name] = df[col_name].map(mean_encoded)
    

turn_into_class(df,col_name='brand');
turn_into_class(df,col_name='fuel_type');
turn_into_class(df,col_name='transmission');
turn_into_class(df,col_name='ext_col');
turn_into_class(df,col_name='int_col');
turn_into_class(df,col_name='clean_title');
turn_into_class(df,col_name='accident');
df['fuel_type'].unique()

# extcolor_class_index = turn_into_class(df['ext_col'],col_name='ext_col');
# intcolor_class_index = turn_into_class(df['int_col'],col_name='int_col')
# cleantitle_class_index = turn_into_class(df['clean_title'],col_name='clean_title')
# accident_class_index = turn_into_class(df['accident'],col_name='accident')
# fuel_class_index = turn_into_class(df['fuel_type'],col_name='fuel_type')
# transmission_class_index = turn_into_class(df['transmission'],col_name='transmission')






array([22119.3671875, 42161.433930381885, 49021.324137931035,
       46134.166666666664, 44762.72727272727, 34853.75, 14000.0],
      dtype=object)

In [7]:
df = df.astype(int);
train_x,test_x,train_y,test_y = train_test_split(df[['brand','model_year','milage','fuel_type','transmission','ext_col','int_col','accident','clean_title']], df['price'], test_size=0.2, random_state=40);


In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [9]:
model = LinearRegression()

# Fit the model to the training data
model.fit(train_x,train_y)

# Make predictions on the test data
y_pred = model.predict(train_x)


print(y_pred)
# Evaluate the model
mse = mean_squared_error(train_y, y_pred)
r2_square = r2_score(train_y,y_pred)

print(mse)
print(r2_square)

[66893.62428231 51438.81170514 33828.17067987 ... 45727.53862111
  6608.11962189 36075.55262584]
4647956176.585222
0.3950448935456772


In [10]:
import statsmodels.api as sm
X = sm.add_constant(train_x)  # Adds a constant term to the predictor
model = sm.OLS(train_y, X).fit()
print(model.summary())

X = sm.add_constant(test_x) 
y_pred_statsmodels = model.predict(X)
r2_score(test_y, y_pred_statsmodels)


                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.395
Model:                            OLS   Adj. R-squared:                  0.393
Method:                 Least Squares   F-statistic:                     222.1
Date:                Mon, 19 Aug 2024   Prob (F-statistic):          2.82e-290
Time:                        16:29:21   Log-Likelihood:                -34258.
No. Observations:                2730   AIC:                         6.853e+04
Df Residuals:                    2721   BIC:                         6.859e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
brand            0.8403      0.028     29.540   

0.5830983527082443

As you can see that p-values of (fuel_type,model_year,clean_title,ext_col) are >= 0.001, so we can remove them (there is no evidence that these predictors are related to response).
