# Model Building

In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder,FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
from sklearn.compose import ColumnTransformer

In [30]:
df=pd.read_csv("Car_Data.csv")

In [31]:
df

Unnamed: 0,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,Price
0,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,...,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950.0
4,audi 100ls,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,...,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,volvo 145e (sw),gas,std,four,sedan,rwd,front,109.1,188.8,68.9,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845.0
201,volvo 144ea,gas,turbo,four,sedan,rwd,front,109.1,188.8,68.8,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045.0
202,volvo 244dl,gas,std,four,sedan,rwd,front,109.1,188.8,68.9,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485.0
203,volvo 246,diesel,turbo,four,sedan,rwd,front,109.1,188.8,68.9,...,145,idi,3.01,3.40,23.0,106,4800,26,27,22470.0


In [32]:
df.columns

Index(['CarName', 'fueltype', 'aspiration', 'doornumber', 'carbody',
       'drivewheel', 'enginelocation', 'wheelbase', 'carlength', 'carwidth',
       'carheight', 'curbweight', 'enginetype', 'cylindernumber', 'enginesize',
       'fuelsystem', 'boreratio', 'stroke', 'compressionratio', 'horsepower',
       'peakrpm', 'citympg', 'highwaympg', 'Price'],
      dtype='object')

In [33]:
X = df.drop('Price', axis=1)
y = df['Price']

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

In [35]:
X_train.head()

Unnamed: 0,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,...,cylindernumber,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg
1,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27
112,peugeot 604sl,diesel,turbo,four,sedan,rwd,front,107.9,186.7,68.4,...,four,152,idi,3.7,3.52,21.0,95,4150,28,33
162,toyota mark ii,gas,std,four,sedan,fwd,front,95.7,166.3,64.4,...,four,98,2bbl,3.19,3.03,9.0,70,4800,28,34
87,mitsubishi outlander,gas,turbo,four,sedan,fwd,front,96.3,172.4,65.4,...,four,110,spdi,3.17,3.46,7.5,116,5500,23,30
99,nissan rogue,gas,std,four,hatchback,fwd,front,97.2,173.4,65.2,...,four,120,2bbl,3.33,3.47,8.5,97,5200,27,34


In [36]:
len(X_train.dtypes)

23

In [37]:
numeric_features =(X.select_dtypes(include=np.number).columns.tolist())
cat_features= X.select_dtypes(exclude=np.number).columns.tolist()

In [38]:
len(numeric_features)

13

In [39]:
len(cat_features)

10

In [40]:
def preprocess_numeric_data(X):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled, scaler

In [41]:
X_train[numeric_features], scaler = preprocess_numeric_data(X_train[numeric_features])

In [42]:
import joblib
joblib.dump(scaler, 'scaler.joblib')

['scaler.joblib']

In [43]:
# Custom function for categorical data preprocessing
def preprocess_categorical_data(X):
    encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
    X_encoded = encoder.fit_transform(X)
    return X_encoded, encoder

In [44]:
X_train_categorical,encoder= preprocess_categorical_data(X_train[cat_features])

In [45]:
import joblib
joblib.dump(encoder, 'encoder.joblib')

['encoder.joblib']

In [62]:
model = LinearRegression()
model.fit(np.hstack([X_train[numeric_features], X_train_categorical.toarray()]), y_train)


# Model testing

In [66]:
import joblib
scaler = joblib.load('scaler.joblib')
encoder = joblib.load('encoder.joblib')

In [70]:
# Using scaler transform numeric cols
X_test_numeric=scaler.transform(X_test[numeric_features])


# Using encoder transform numeric cols
X_test_categorical=encoder.transform(X_test[cat_features])

# Predictions on the test set
predictions=model.predict(np.hstack([X_test_numeric, X_test_categorical.toarray()]))

# Print the predictions
print("Predictions on Test Data:")
print(predictions)

Predictions on Test Data:
[ 1.20890938e+04  9.94884375e+03  1.20800000e+04  8.36159375e+03
 -3.21673516e+14  1.71444688e+04  1.37035938e+04  1.08759688e+04
  1.24338438e+04  9.57296875e+03  9.51259375e+03  6.02040625e+03
  1.57486562e+04  2.33736562e+04  9.03953125e+03  1.14061875e+04
  5.47015625e+03  8.49346875e+03  7.76765625e+03  8.21590625e+03
  4.89378842e+13  1.27298125e+04  9.20231250e+03 -2.38009372e+14
  1.84008930e+14  1.06459375e+03  4.89378842e+13  4.55465625e+03
  1.48358750e+04  9.71096875e+03  1.07880312e+04  4.12477812e+04
  9.13940625e+03  5.07134375e+03  4.89378842e+13  1.12728438e+04
  6.54968750e+03  1.11523438e+04  2.38009372e+14  1.46876129e+14
  1.37035938e+04]




# Check for Loss

In [75]:
from sklearn.metrics import r2_score

In [None]:
r2_score(y_test,predictions)

0.7114814181796709

In [72]:


predictionstrain=model.predict(np.hstack([X_train[numeric_features], X_train_categorical.toarray()]))


In [73]:
predictionstrain

array([16499.84375, 16629.96875,  8791.15625,  9117.90625,  9962.96875,
       15749.21875, 15249.     ,  6998.65625, 16190.40625,  9271.96875,
       36164.15625, 13495.     ,  9979.75   , 25551.53125, 32528.03125,
       20970.34375,  7299.03125, 13947.96875, 17669.5625 , 15588.90625,
       18420.59375,  7363.28125, 16500.46875, 11739.1875 , 10345.03125,
       14495.1875 , 13498.34375,  7609.03125, 15621.96875, 15984.90625,
        9094.15625,  6188.375  , 28248.8125 , 16515.09375, 40959.59375,
       12439.09375, 12260.84375, 24563.84375,  5820.90625,  8448.78125,
        8189.40625, 12943.78125, 18344.53125,  6983.21875, 12787.03125,
       17073.71875, 16612.59375, 31600.53125,  7293.34375,  7986.84375,
       14590.28125,  7099.46875,  5497.15625,  7348.21875, 11595.03125,
       13844.21875, 12371.34375, 22469.46875, 32251.03125,  7897.96875,
       14914.46875, 18418.46875,  6056.625  , 45400.21875,  7227.625  ,
        7774.59375, 17536.3125 , 10595.40625,  8180.5    ,  5348

In [76]:
r2_score(y_train,predictionstrain)

0.997688070559093

# Here is the correct sample Code

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from scipy.sparse import hstack
import joblib

# Sample training data
train_data = {
    'NumericCol1': [1, 2, 3, 4, 5],
    'NumericCol2': [10, 20, 10, 30, 40],
    'CategoricalCol': ['A', 'B', 'A', 'B', 'A'],
    'Target': [15, 25, 20, 35, 45]
}

df_train = pd.DataFrame(train_data)

# Separate features and target
X_train = df_train.drop('Target', axis=1)
y_train = df_train['Target']

# Define numeric and categorical features
numeric_features = ['NumericCol1', 'NumericCol2']
categorical_features = ['CategoricalCol']

# Custom function for numeric data preprocessing
def preprocess_numeric_data(X):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled, scaler

# Custom function for categorical data preprocessing
def preprocess_categorical_data(X):
    encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
    X_encoded = encoder.fit_transform(X)
    return X_encoded, encoder

# Combine both numeric and categorical preprocessing
def preprocess_data(X):
    X_numeric, scaler = preprocess_numeric_data(X[numeric_features])
    X_categorical, encoder = preprocess_categorical_data(X[categorical_features])
    return X_numeric, X_categorical, scaler, encoder

# Preprocess the training data
X_train_numeric, X_train_categorical, scaler, encoder = preprocess_data(X_train)

# Train the model (Linear Regression in this case)
model = LinearRegression()
model.fit(hstack([X_train_numeric, X_train_categorical]), y_train)

# Save preprocessing objects for later use
joblib.dump(scaler, 'scaler.joblib')
joblib.dump(encoder, 'encoder.joblib')



['encoder.joblib']

In [None]:
# --------------- Now, for the Test Data ---------------

# Sample test data
test_data = {
    'NumericCol1': [6, 7, 8],
    'NumericCol2': [50, 60, 70],
    'CategoricalCol': ['A', 'B','A'] # Introduce an unseen category 'C'
}

df_test = pd.DataFrame(test_data)

# Preprocess the test data using the same scaler and encoder from training
X_test_numeric, _ = preprocess_numeric_data(df_test[numeric_features])


X_test_categorical, _ = preprocess_categorical_data(df_test[categorical_features])

# Predictions on the test set
predictions=model.predict(hstack([X_test_numeric, X_test_categorical]))


# Print the predictions
print("Predictions on Test Data:")
print(predictions)


Predictions on Test Data:
[14.43334144 27.9375     41.6499919 ]


In [None]:
X_test_categorical

<3x1 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [None]:
X_test_numeric

array([[-1.22474487, -1.22474487],
       [ 0.        ,  0.        ],
       [ 1.22474487,  1.22474487]])