# This notebook contains the fitting of our preprocessed data into various machine learning algorithmsm

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error

In [2]:
# Import the preprocesssed data
data = pd.read_csv("preprocessed_data.csv")
data.head()

Unnamed: 0,car_name,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Hyundai Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,Hyundai i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Maruti Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ford Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [3]:
encoder = LabelEncoder()

data["car_name"]=data["car_name"].apply(str.lower)

data["car_name"] = encoder.fit_transform(data["car_name"])

np.save('car_name.npy', encoder.classes_)

In [4]:
# Split the data into features and labels
features = data.drop("selling_price", axis=1)
labels = data["selling_price"]

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(features, labels, test_size=0.2, random_state=42)

len(X_train), len(y_train), len(X_val), len(y_val)

(12328, 12328, 3083, 3083)

In [5]:
X_train.head()

Unnamed: 0,car_name,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
11210,118,7,70252,Dealer,Diesel,Automatic,11.2,2400,215.0,5
1347,107,2,10000,Individual,Petrol,Manual,23.84,1199,84.0,5
10363,83,2,6000,Dealer,Diesel,Automatic,19.0,1950,241.3,5
316,25,7,63000,Dealer,Petrol,Manual,17.8,1497,117.3,5
10638,35,10,80292,Dealer,Petrol,Manual,20.36,1197,78.9,5


In [6]:
# convert non-numerical columns to numerical columns

cat_columns = ["seller_type", "fuel_type", "transmission_type"]

from sklearn.preprocessing import LabelEncoder,OneHotEncoder

for label in cat_columns:
    encoder = LabelEncoder()
    
    # X_train[label]=X_train[label].apply(str.lower)
    # X_val[label]=X_val[label].apply(str.lower)
        
    X_train[label] = encoder.fit_transform(X_train[label])
    X_val[label] = encoder.transform(X_val[label])
    np.save(f'{label}.npy', encoder.classes_)

In [7]:
X_train.head()

Unnamed: 0,car_name,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
11210,118,7,70252,0,1,0,11.2,2400,215.0,5
1347,107,2,10000,1,4,1,23.84,1199,84.0,5
10363,83,2,6000,0,1,0,19.0,1950,241.3,5
316,25,7,63000,0,4,1,17.8,1497,117.3,5
10638,35,10,80292,0,4,1,20.36,1197,78.9,5


In [8]:
X_train.columns

Index(['car_name', 'vehicle_age', 'km_driven', 'seller_type', 'fuel_type',
       'transmission_type', 'mileage', 'engine', 'max_power', 'seats'],
      dtype='object')

In [10]:
# Scaling our labels also
minmax_scaler = MinMaxScaler()

X_train_scaled = minmax_scaler.fit_transform(X_train)
X_val_scaled = minmax_scaler.transform(X_val)

scaler = StandardScaler()
y_train_scaled = y_train.values
y_val_scaled = y_val.values

y_train_scaled

array([1825000,  515000, 7500000, ...,  250000,  620000,  960000],
      dtype=int64)

In [11]:
# Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(X_train_scaled, y_train_scaled)

# Test the model on validation data
print(f"Score on training data: {lin_reg.score(X_train_scaled, y_train_scaled)}")
print(f"Score on validation data: {lin_reg.score(X_val_scaled, y_val_scaled)}")

Score on training data: 0.6203750900858938
Score on validation data: 0.6639794533791967


In [12]:
# Lasso Regression
lasso = Lasso()
lasso.fit(X_train_scaled, y_train)

# Test the model on validation data
print(f"Score on training data: {lasso.score(X_train_scaled, y_train_scaled)}")
print(f"Score on validation data: {lasso.score(X_val_scaled, y_val_scaled)}")

Score on training data: 0.6203750871941566
Score on validation data: 0.6639930984820968


In [13]:
# Elastic Net regression
elastic_net = ElasticNet()
elastic_net.fit(X_train_scaled, y_train)

# Test the model on validation data
print(f"Score on training data: {elastic_net.score(X_train_scaled, y_train_scaled)}")
print(f"Score on validation data: {elastic_net.score(X_val_scaled, y_val_scaled)}")

Score on training data: 0.13188610762068464
Score on validation data: 0.15154517041362425


In [14]:
from sklearn.ensemble import RandomForestRegressor

# Random Forest Regression
rand_reg = RandomForestRegressor(criterion='absolute_error')
rand_reg.fit(X_train_scaled, y_train)

# Test the model on validation data
print(f"Score on training data: {rand_reg.score(X_train_scaled, y_train_scaled)}")
print(f"Score on validation data: {rand_reg.score(X_val_scaled, y_val_scaled)}")

Score on training data: 0.9793910726879624
Score on validation data: 0.9299722566787842


The Random Forest Regressor can explain 94% of the invariablility on the data. So it is the best performing model.

In [17]:
predictions = rand_reg.predict(X_val_scaled)
predictions[0]

242490.0

In [18]:
y_val[0]

120000

In [30]:
lin_reg.predict(X_val_scaled)[0]

array([-0.84682801])

In [15]:
# Save the best performing model
import pickle 
filename = "random_forest.pickle"

# save model
pickle.dump(rand_reg, open(filename, "wb"))

In [16]:
# Test the loaded model
loaded_model = pickle.load(open(filename, "rb"))

loaded_model.score(X_val_scaled, y_val)

0.9290103111100572

In [17]:
X_val.values[0]

array([3.500e+01, 1.200e+01, 7.300e+04, 0.000e+00, 4.000e+00, 1.000e+00,
       2.036e+01, 1.197e+03, 7.890e+01, 5.000e+00])

In [18]:
y_val.values[0]

190000

In [19]:
pred = loaded_model.predict(X_val)
pred



array([4375525., 4399705., 4375525., ..., 4375525., 4399705., 4375525.])

In [12]:
value = 683187

print ('{:,}'.format(value)) 

683,187


In [19]:
count = len(str(value))
for i in str(value):
    count = count-1
    if(count == [, ])

6 5
8 4
3 3
1 2
8 1
7 0


In [25]:
# This code is taken from geeks for geeks page (https://www.geeksforgeeks.org/convert-the-number-from-international-system-to-indian-system/)
def convert(input):

	# Find the length of the
	# input string
	Len = len(input)

	# Removing all the separators(, )
	# from the input string
	i = 0
	while(i < Len):
		if(input[i] == ","):
			input = input[:i] + input[i + 1:]
			Len -= 1
			i -= 1
		elif(input[i] == " "):
			input=input[:i] + input[i + 1:]
			Len -= 1
			i -= 1
		else:
			i += 1
	# Reverse the input string
	input=input[::-1]

	# Declaring the output string
	output = ""

	# Process the input string
	for i in range(Len):

		# Add a separator(, ) after the
		# third number
		if(i == 2):
			output += input[i]
			output += ","
		
		# Then add a separator(, ) after
		# every second number
		elif(i > 2 and i % 2 == 0 and
			i + 1 < Len):
			output += input[i]
			output += ","
		else:
			output += input[i]
	
	# Reverse the output string
	output=output[::-1]

	# Return the output string back
	# to the main function
	return output

In [27]:
convert(tf.round(model.predict(tf.expand_dims(X_val[0], axis=0))))

NameError: name 'tf' is not defined