# Model Building

In [73]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder,FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
from sklearn.compose import ColumnTransformer

In [74]:
df=pd.read_csv("Car_Data.csv")

In [75]:
df

Unnamed: 0,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,Price
0,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,...,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950.0
4,audi 100ls,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,...,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,volvo 145e (sw),gas,std,four,sedan,rwd,front,109.1,188.8,68.9,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845.0
201,volvo 144ea,gas,turbo,four,sedan,rwd,front,109.1,188.8,68.8,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045.0
202,volvo 244dl,gas,std,four,sedan,rwd,front,109.1,188.8,68.9,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485.0
203,volvo 246,diesel,turbo,four,sedan,rwd,front,109.1,188.8,68.9,...,145,idi,3.01,3.40,23.0,106,4800,26,27,22470.0


In [76]:
df.columns

Index(['CarName', 'fueltype', 'aspiration', 'doornumber', 'carbody',
       'drivewheel', 'enginelocation', 'wheelbase', 'carlength', 'carwidth',
       'carheight', 'curbweight', 'enginetype', 'cylindernumber', 'enginesize',
       'fuelsystem', 'boreratio', 'stroke', 'compressionratio', 'horsepower',
       'peakrpm', 'citympg', 'highwaympg', 'Price'],
      dtype='object')

In [78]:
X = df.drop('Price', axis=1)
y = df['Price']

In [79]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

In [80]:
X_train.head()

Unnamed: 0,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,...,cylindernumber,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg
1,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27
112,peugeot 604sl,diesel,turbo,four,sedan,rwd,front,107.9,186.7,68.4,...,four,152,idi,3.7,3.52,21.0,95,4150,28,33
162,toyota mark ii,gas,std,four,sedan,fwd,front,95.7,166.3,64.4,...,four,98,2bbl,3.19,3.03,9.0,70,4800,28,34
87,mitsubishi outlander,gas,turbo,four,sedan,fwd,front,96.3,172.4,65.4,...,four,110,spdi,3.17,3.46,7.5,116,5500,23,30
99,nissan rogue,gas,std,four,hatchback,fwd,front,97.2,173.4,65.2,...,four,120,2bbl,3.33,3.47,8.5,97,5200,27,34


In [81]:
len(X_train.dtypes)

23

In [82]:
numeric_features =(X.select_dtypes(include=np.number).columns.tolist())
cat_features= X.select_dtypes(exclude=np.number).columns.tolist()

In [83]:
len(numeric_features)

13

In [84]:
len(cat_features)

10

In [85]:
def preprocess_numeric_data(X):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled, scaler

In [86]:
X_train[numeric_features], scaler = preprocess_numeric_data(X_train[numeric_features])

In [88]:
import joblib
joblib.dump(scaler, 'scaler.joblib')

['scaler.joblib']

In [89]:
# Custom function for categorical data preprocessing
def preprocess_categorical_data(X):
    encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
    X_encoded = encoder.fit_transform(X)
    return X_encoded, encoder

In [90]:
X_train_categorical,encoder= preprocess_categorical_data(X_train[cat_features])

In [91]:
import joblib
joblib.dump(encoder, 'encoder.joblib')

['encoder.joblib']

In [101]:
model = LinearRegression()
model.fit(hstack([X_train[numeric_features], X_train_categorical]), y_train)

# Model testing

In [105]:
import joblib
scaler = joblib.load('scaler.joblib')
encoder = joblib.load('encoder.joblib')

In [106]:
# Using scaler transform numeric cols
X_test_numeric=scaler.transform(X_test[numeric_features])


# Using encoder transform numeric cols
X_test_categorical=encoder.transform(X_test[cat_features])

# Predictions on the test set
predictions=model.predict(hstack([X_test_numeric, X_test_categorical]))


# Print the predictions
print("Predictions on Test Data:")
print(predictions)

Predictions on Test Data:
[12086.92649815  9951.78194744 12074.45962428  8371.26281092
 40571.14635796 17145.47373954 13699.41797319 10868.17676188
 12446.59884997  9574.63467778  9514.47301677  6027.01340511
 15738.4639529  23339.1286973   9047.49415233 11398.94142522
  5486.825607    8499.01909454  7778.82455935  8218.00814817
  9657.39494656 12729.99998595  9200.16111142 20636.98974452
 20208.29643868  1038.56236556 14330.03806154  4555.45684167
 14838.7762454   9713.08894623 10793.70436978 41243.47791102
  9132.81228826  5078.77368225 11796.83098194 11279.11805508
  6557.96001094 11142.61919137 12262.38597461 20330.49568045
 13699.41797319]




# Check for Loss

In [107]:
from sklearn.metrics import r2_score

In [108]:
r2_score(y_test,predictions)

0.7114814181796709

In [113]:

predictions=model.predict(hstack([X_train[numeric_features], X_train_categorical]))


In [115]:
predictions

array([16500.01087438, 16631.58098834,  8792.08739895,  9119.776311  ,
        9962.62221997, 15749.97600891, 15249.98629872,  6997.98934155,
       16190.63047263,  9271.91615258, 36163.54355228, 13495.01087438,
        9980.02130963, 25551.97900357, 32527.97919522, 20969.97162094,
        7299.00271073, 13950.02211672, 17669.01431515, 15590.3013305 ,
       18419.94219533,  7363.96196991, 16499.92277093, 11739.19203308,
       10344.9977574 , 14495.05437758, 13499.01040488,  7608.99959007,
       15620.99622421, 15984.70744035,  9094.95666931,  6189.01350903,
       28247.96204665, 16515.01441263, 40959.97707873, 12439.9803063 ,
       12259.21162333, 24564.99797995,  5821.55396413,  8449.00135536,
        8188.99585935, 12944.38827554, 18343.93178942,  6984.02616728,
       12788.05870785, 17075.00889634, 16609.86476836, 31599.97174007,
        7294.95014342,  7986.71547044, 14589.3799885 ,  7098.97599019,
        5499.68001899,  7348.99446606, 11595.02584893, 13845.0024293 ,
      

In [116]:
r2_score(y_train,predictions)

0.9976880808076355

# Here is the correct sample Code

In [55]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from scipy.sparse import hstack
import joblib

# Sample training data
train_data = {
    'NumericCol1': [1, 2, 3, 4, 5],
    'NumericCol2': [10, 20, 10, 30, 40],
    'CategoricalCol': ['A', 'B', 'A', 'B', 'A'],
    'Target': [15, 25, 20, 35, 45]
}

df_train = pd.DataFrame(train_data)

# Separate features and target
X_train = df_train.drop('Target', axis=1)
y_train = df_train['Target']

# Define numeric and categorical features
numeric_features = ['NumericCol1', 'NumericCol2']
categorical_features = ['CategoricalCol']

# Custom function for numeric data preprocessing
def preprocess_numeric_data(X):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled, scaler

# Custom function for categorical data preprocessing
def preprocess_categorical_data(X):
    encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
    X_encoded = encoder.fit_transform(X)
    return X_encoded, encoder

# Combine both numeric and categorical preprocessing
def preprocess_data(X):
    X_numeric, scaler = preprocess_numeric_data(X[numeric_features])
    X_categorical, encoder = preprocess_categorical_data(X[categorical_features])
    return X_numeric, X_categorical, scaler, encoder

# Preprocess the training data
X_train_numeric, X_train_categorical, scaler, encoder = preprocess_data(X_train)

# Train the model (Linear Regression in this case)
model = LinearRegression()
model.fit(hstack([X_train_numeric, X_train_categorical]), y_train)

# Save preprocessing objects for later use
joblib.dump(scaler, 'scaler.joblib')
joblib.dump(encoder, 'encoder.joblib')



['encoder.joblib']

In [72]:
# --------------- Now, for the Test Data ---------------

# Sample test data
test_data = {
    'NumericCol1': [6, 7, 8],
    'NumericCol2': [50, 60, 70],
    'CategoricalCol': ['A', 'B','A'] # Introduce an unseen category 'C'
}

df_test = pd.DataFrame(test_data)

# Preprocess the test data using the same scaler and encoder from training
X_test_numeric, _ = preprocess_numeric_data(df_test[numeric_features])


X_test_categorical, _ = preprocess_categorical_data(df_test[categorical_features])

# Predictions on the test set
predictions=model.predict(hstack([X_test_numeric, X_test_categorical]))


# Print the predictions
print("Predictions on Test Data:")
print(predictions)


Predictions on Test Data:
[14.43334144 27.9375     41.6499919 ]


In [100]:
X_test_categorical

<3x1 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [99]:
X_test_numeric

array([[-1.22474487, -1.22474487],
       [ 0.        ,  0.        ],
       [ 1.22474487,  1.22474487]])