#### Importing necessary packages ####

In [1]:
import numpy as np;
import matplotlib.pyplot as pyplot;
import pandas as pd;
import sklearn;
import regex as re;
from sklearn.model_selection import train_test_split


In [2]:
# Verifying mission values
df = pd.read_csv("used_cars.csv");
print(df.isnull().sum())

df.duplicated().sum()
#there is no duplicates in this data frame

brand             0
model             0
model_year        0
milage            0
fuel_type       170
engine            0
transmission      0
ext_col           0
int_col           0
accident        113
clean_title     596
price             0
dtype: int64


0

To solve this problem of null values, we have three different approaches
<ul>
<li>Dropping all of these rows</li>
<li>Replacing it with mode or median or mean</li>
<li>Create another category for the NaN</li>
<li>Using KNNs</li>
</ul>

In [3]:
# replacing Null values with mode
df['fuel_type'] = df['fuel_type'].replace(np.nan,df['fuel_type'].mode()[0])

df['accident'] = df['accident'].replace(np.nan,df['accident'].mode()[0])

df = df.dropna(subset = ['clean_title'])


In [4]:
df.isnull().sum()

brand           0
model           0
model_year      0
milage          0
fuel_type       0
engine          0
transmission    0
ext_col         0
int_col         0
accident        0
clean_title     0
price           0
dtype: int64

In [5]:
df = df.drop(columns = ['engine','model'])
# turning milage,price values into numbers
for i,row in df.iterrows():
    number = int("".join(re.findall("\d+",row['milage'])))
    df.loc[i,'milage'] = number;
    number = int("".join(re.findall("\d+",row['price'])))
    df.loc[i,'price'] = number;
    
df

Unnamed: 0,brand,model_year,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,price
0,Ford,2013,51000,E85 Flex Fuel,6-Speed A/T,Black,Black,At least 1 accident or damage reported,Yes,10300
1,Hyundai,2021,34742,Gasoline,8-Speed Automatic,Moonlight Cloud,Gray,At least 1 accident or damage reported,Yes,38005
3,INFINITI,2015,88900,Hybrid,7-Speed A/T,Black,Black,None reported,Yes,15500
6,Audi,2017,84000,Gasoline,6-Speed A/T,Blue,Black,None reported,Yes,31000
7,BMW,2001,242000,Gasoline,A/T,Green,Green,None reported,Yes,7300
...,...,...,...,...,...,...,...,...,...,...
4003,Mercedes-Benz,2018,53705,Gasoline,A/T,Black,Black,At least 1 accident or damage reported,Yes,25900
4004,Bentley,2023,714,Gasoline,8-Speed Automatic with Auto-Shift,C / C,Hotspur,None reported,Yes,349950
4005,Audi,2022,10900,Gasoline,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,53900
4007,Ford,2020,33000,Gasoline,A/T,Blue,Black,None reported,Yes,62999


In [6]:
# turning categorical columns into classes

def turn_into_class(df,col_name):

    mean_encoded = df.groupby(col_name)['price'].mean()

    df[col_name] = df[col_name].map(mean_encoded)
     

turn_into_class(df,col_name='brand');
turn_into_class(df,col_name='fuel_type');
turn_into_class(df,col_name='transmission');
turn_into_class(df,col_name='ext_col');
turn_into_class(df,col_name='int_col');
turn_into_class(df,col_name='clean_title');
turn_into_class(df,col_name='accident');
df['fuel_type'].unique()

# extcolor_class_index = turn_into_class(df['ext_col'],col_name='ext_col');
# intcolor_class_index = turn_into_class(df['int_col'],col_name='int_col')
# cleantitle_class_index = turn_into_class(df['clean_title'],col_name='clean_title')
# accident_class_index = turn_into_class(df['accident'],col_name='accident')
# fuel_class_index = turn_into_class(df['fuel_type'],col_name='fuel_type')
# transmission_class_index = turn_into_class(df['transmission'],col_name='transmission')






array([22119.3671875, 42161.433930381885, 49021.324137931035,
       46134.166666666664, 44762.72727272727, 34853.75, 14000.0],
      dtype=object)

In [7]:
train_x,test_x,train_y,test_y = train_test_split(df[['brand','model_year','milage','fuel_type','transmission','ext_col','int_col','accident','clean_title']], df['price'], test_size=0.2, random_state=42);


In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [10]:
model = LinearRegression();

# Fit the model to the training data
model.fit(train_x,train_y);

# Make predictions on the test data
y_pred = model.predict(test_x);

print(y_pred);
# Evaluate the model
mse = mean_squared_error(test_y, y_pred);
r2_square = r2_score(test_y,y_pred);

print(mse);
print(r2_square);

[-1.55364452e+04 -3.82300661e+04  2.80709765e+04  2.62700117e+04
  1.57956255e+04  3.59791643e+04  4.94479451e+04  3.94460187e+04
  6.55055706e+04  1.88505161e+04  4.53225377e+04  1.31296022e+04
  2.42545638e+05  2.43378905e+05  4.32470281e+04  2.64374160e+04
  5.42043809e+04 -1.71438920e+04  6.40367617e+04 -7.27332188e+03
  4.72059616e+04  7.03856239e+04  2.69422235e+04  5.79974123e+04
  1.13832283e+05  3.41123831e+04  2.95127000e+04  5.85109081e+04
  2.50940213e+05  7.32338926e+04  5.34324962e+04  3.14584358e+04
  3.17650561e+04  1.80505632e+04  1.82220258e+04  1.16139317e+05
  4.40339872e+04  3.74564603e+04  1.02501619e+04  5.61178256e+04
  5.46824265e+04  2.08499589e+04  5.31172916e+04  1.91531459e+03
  2.32369746e+04  2.37893612e+04 -5.63809576e+03  5.22283963e+04
  2.55744810e+04  3.91353423e+04  5.78512550e+04  2.86990764e+04
  4.81478764e+04  3.97248822e+04 -1.00349801e+04  7.84476781e+03
  5.63828544e+04  2.82648442e+04  1.10480915e+04  5.56600913e+04
 -8.04812281e+02  6.52242

In [19]:
# Extracting model's coefficients 
coef_dict = {};

for ind,coef in enumerate(model.coef_):
    coef_dict[train_x.columns[ind]] = coef;


print(coef_dict)


{'brand': 0.75467417838044, 'model_year': 414.1581666688314, 'milage': -0.24745424539087615, 'fuel_type': 0.2601478206330188, 'transmission': 0.19323649158621362, 'ext_col': 0.30604015736049567, 'int_col': 0.29385178458003736, 'accident': -0.05401909776773852, 'clean_title': 0.0}
[ 7.54674178e-01  4.14158167e+02 -2.47454245e-01  2.60147821e-01
  1.93236492e-01  3.06040157e-01  2.93851785e-01 -5.40190978e-02
  0.00000000e+00]


![alt text](image.png)

![alt text](image-1.png)

Finding the p-value for each predictor (Testing if there is a linear relationship between predictor and response(sales))

In [33]:

for coef in coef_dict.keys(): 
    model = LinearRegression();
    # Fit the model to the training data
    model.fit(train_x[[coef]],train_y);
    # Make predictions on the test data
    # y_pred = model.predict(train_x);
    rss = np.sum((train_y - model.predict(train_x[[coef]])) ** 2);
    n = len(train_y)
    p = len(model.coef_)
    rse = np.sqrt(rss / (n - p - 1))
    se_beta1 = np.sqrt(rse**2 / np.sum((train_x[coef] - train_x[coef].mean())**2));
    se_beta0 = np.sqrt((rse**2)*( (1/n) + train_x[coef].mean()**2/(np.sum((train_x[coef] - train_x[coef].mean())**2))));
    t_beta1 = model.coef_[0] / se_beta1;
    t_beta0 = model.intercept_ / se_beta0;
    print(f'for predictor : {coef} for Slope : {t_beta1}')



for predictor : brand for Slope : 26.677561613098423
for predictor : model_year for Slope : 10.078586403809746
for predictor : milage for Slope : -16.020967975174255
for predictor : fuel_type for Slope : 2.695781891645117
for predictor : transmission for Slope : 16.25094843277405
for predictor : ext_col for Slope : 16.668935317100182
for predictor : int_col for Slope : 18.115812258689722
for predictor : accident for Slope : 5.051849362668637
for predictor : clean_title for Slope : 0.0
