10.	The car dataset captures the selling price of used cars with respect to features such as year_bought, km_driven, transmission, and owner.

Objectives:

i.	Understand the dataset and clean it up (if required).

ii.	Build a regression model to predict the selling price w.r.t. owner.

iii.	Also evaluate the model using the RMSE score.



In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# Step 1: Load the dataset
car_data = pd.read_csv('CAR DETAILS FROM CAR DEKHO.csv')

# Step 2: Data Understanding and Cleanup
print("Dataset before cleaning:")
print(car_data.head())

# Check for missing values
print("\nMissing values in the dataset:")
print(car_data.isnull().sum())

# Step 3: Encode the categorical 'owner' and 'transmission' columns as integer labels.
# NOTE: the same encoder object is refit for each column, so after this step
# `le` only remembers the classes of 'transmission'.
le = LabelEncoder()
car_data['owner_encoded'] = le.fit_transform(car_data['owner'])
car_data['transmission_encoded'] = le.fit_transform(car_data['transmission'])

# Step 4: Create additional features
car_data['car_age'] = 2023 - car_data['year']  # Age relative to the fixed reference year 2023

# Step 5: Apply log transformation to the target variable
# (the model is trained in log space and predictions are exponentiated in Step 10)
car_data['log_selling_price'] = np.log(car_data['selling_price'])

# Step 6: Extract relevant features and 'log_selling_price' target
X = car_data[['car_age', 'km_driven', 'owner_encoded', 'transmission_encoded']]
y = car_data['log_selling_price']

# Step 7: Split the dataset into training and testing sets (80% train, 20% test)
# The split happens BEFORE scaling so that no statistic of the test rows can
# influence the preprocessing applied to the training rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 8: Scale the features
# The scaler learns mean/std from the training rows only; the test rows are
# transformed with those same training statistics.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Step 9: Build the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Step 10: Predict selling prices for the test set
y_pred_log = model.predict(X_test)
y_pred = np.exp(y_pred_log)  # Convert back to original scale

# Step 11: Evaluate the model using RMSE
rmse = np.sqrt(mean_squared_error(np.exp(y_test), y_pred))  # Compare with original scale
print(f"\nRoot Mean Squared Error (RMSE): {rmse}")

# Display the model coefficients
# (these apply to the standardized features and the log-price target)
print(f"\nModel Coefficients: \nIntercept: {model.intercept_}")
for i, col in enumerate(X.columns):
    print(f"Coefficient for {col}: {model.coef_[i]}")

Dataset before cleaning:
                       name  year  selling_price  km_driven    fuel  \
0             Maruti 800 AC  2007          60000      70000  Petrol   
1  Maruti Wagon R LXI Minor  2007         135000      50000  Petrol   
2      Hyundai Verna 1.6 SX  2012         600000     100000  Diesel   
3    Datsun RediGO T Option  2017         250000      46000  Petrol   
4     Honda Amaze VX i-DTEC  2014         450000     141000  Diesel   

  seller_type transmission         owner  
0  Individual       Manual   First Owner  
1  Individual       Manual   First Owner  
2  Individual       Manual   First Owner  
3  Individual       Manual   First Owner  
4  Individual       Manual  Second Owner  

Missing values in the dataset:
name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
dtype: int64

Root Mean Squared Error (RMSE): 450515.5249475974

Model Coefficients: 
Intercept: 12.763238

In [2]:
import numpy as np
import pandas as pd

class GradientDescentLinearRegression:
    """Single-feature linear regression trained with batch gradient descent.

    The model is ``y_hat = m * X + c`` and each iteration takes one step down
    the gradient of the mean squared error.

    Parameters
    ----------
    learning_rate : float
        Step size applied to both gradients.
    n_iters : int
        Number of gradient-descent iterations run by ``fit``.

    Attributes
    ----------
    m : slope, initialised to 0.
    c : intercept, initialised to 0.
    """

    def __init__(self, learning_rate=0.01, n_iters=1000):
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.m = 0
        self.c = 0

    def fit(self, X, y):
        """Run ``n_iters`` gradient-descent steps on the data and return ``self``.

        ``X`` and ``y`` may be any array-like (list, tuple, ndarray, Series);
        both are converted to 1-D float arrays, so a column-shaped ``(n, 1)``
        input is treated as ``n`` samples instead of being broadcast against
        ``y``. Training continues from the current ``m`` and ``c`` (they are
        only zeroed in ``__init__``). Progress is printed every 100 iterations.

        Raises ``ZeroDivisionError`` when ``y`` is empty.
        """
        # Plain lists would otherwise hit `0 * list == []` on the first step.
        X = np.asarray(X, dtype=float).ravel()
        y = np.asarray(y, dtype=float).ravel()
        n = len(y)

        for i in range(self.n_iters):
            residuals = y - (self.m * X + self.c)
            dm = -(2 / n) * np.sum(X * residuals)  # Derivative w.r.t. slope (m)
            dc = -(2 / n) * np.sum(residuals)      # Derivative w.r.t. intercept (c)

            self.m = self.m - self.learning_rate * dm
            self.c = self.c - self.learning_rate * dc

            if i % 100 == 0:
                print(f"Iteration {i}: m = {self.m}, c = {self.c}")

        return self

    def predict(self, X):
        """Return ``m * X + c`` for the given input."""
        return self.m * X + self.c

# Read the CarDekho listings and echo the first rows as a quick sanity check
car_data = pd.read_csv('CAR DETAILS FROM CAR DEKHO.csv')
print("Dataset before cleaning:", car_data.head(), sep="\n")

# Represent each distinct 'owner' value by its integer category code
owner_categories = car_data['owner'].astype('category')
car_data['owner_encoded'] = owner_categories.cat.codes

# One feature (owner code, 1-D) and the raw selling price as the target
X = car_data['owner_encoded'].values
y = car_data['selling_price'].values

# Standardize the feature to zero mean / unit variance so the gradient steps stay small
X_scaled = (X - X.mean()) / X.std()

# Train the gradient-descent regressor on the standardized feature
regressor = GradientDescentLinearRegression(n_iters=1000, learning_rate=0.01)
regressor.fit(X_scaled, y)

# Predict on the same rows that were used for training
y_pred = regressor.predict(X_scaled)

# RMSE between the actual and predicted prices
residuals = y - y_pred
rmse = np.sqrt(np.mean(np.square(residuals)))
print(f"\nRoot Mean Squared Error (RMSE): {rmse}")

# Report the learned slope and intercept (expressed in terms of the scaled feature)
print(f"\nFinal Model: y = {regressor.m} * X_scaled + {regressor.c}")

Dataset before cleaning:
                       name  year  selling_price  km_driven    fuel  \
0             Maruti 800 AC  2007          60000      70000  Petrol   
1  Maruti Wagon R LXI Minor  2007         135000      50000  Petrol   
2      Hyundai Verna 1.6 SX  2012         600000     100000  Diesel   
3    Datsun RediGO T Option  2017         250000      46000  Petrol   
4     Honda Amaze VX i-DTEC  2014         450000     141000  Diesel   

  seller_type transmission         owner  
0  Individual       Manual   First Owner  
1  Individual       Manual   First Owner  
2  Individual       Manual   First Owner  
3  Individual       Manual   First Owner  
4  Individual       Manual  Second Owner  
Iteration 0: m = -2404.638199639989, c = 10082.546235023043
Iteration 100: m = -104605.70952593259, c = 438607.3143562754
Iteration 200: m = -118159.57021719345, c = 495438.078794418
Iteration 300: m = -119957.07720272786, c = 502974.94953536376
Iteration 400: m = -120195.46178086713, c = 

13.	The car dataset captures the selling price of used cars with respect to features such as year_bought, km_driven, transmission, and owner.

Objectives:

i.	Understand the dataset and clean it up (if required).

ii.	Build a regression model to predict the selling price w.r.t. year bought and owner.

iii.	Also evaluate the model using the RMSE score.


In [3]:
import pandas as pd
import numpy as np

# Read the CarDekho listings CSV and print its first rows as a preview
file_path = 'CAR DETAILS FROM CAR DEKHO.csv'
cardekho_data = pd.read_csv(file_path)
print("Dataset Preview:", cardekho_data.head(), sep="\n")

# Define Linear Regression class with Gradient Descent
class GradientDescentLinearRegression:
    """Multi-feature linear regression trained with batch gradient descent.

    The model is ``y_hat = X @ m + c`` and each iteration takes one step down
    the gradient of the mean squared error.

    Parameters
    ----------
    learning_rate : float
        Step size applied to the coefficient and intercept gradients.
    n_iters : int
        Number of gradient-descent iterations run by ``fit``.

    Attributes
    ----------
    m : coefficient vector (one entry per feature); ``None`` until ``fit`` runs.
    c : intercept.
    """

    def __init__(self, learning_rate=0.01, n_iters=1000):
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.m = None
        self.c = 0

    def fit(self, X, y):
        """Fit the model from scratch on ``X`` (n_samples, n_features) and ``y``.

        Both inputs may be any array-like; they are converted to float arrays.
        Coefficients AND intercept are reset before training, so calling
        ``fit`` again gives the same result as fitting a fresh instance.
        Progress is printed every 100 iterations (1-based iteration number).

        Returns ``self``. Raises ``ValueError`` if ``X`` is not 2-D.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be 2-D with shape (n_samples, n_features)")

        n_samples, n_features = X.shape
        self.m = np.zeros(n_features)  # Initialize coefficients
        self.c = 0.0                   # Reset the intercept together with the coefficients

        for i in range(self.n_iters):
            residuals = y - (np.dot(X, self.m) + self.c)
            dm = (-2 / n_samples) * np.dot(X.T, residuals)
            dc = (-2 / n_samples) * np.sum(residuals)
            self.m -= self.learning_rate * dm
            self.c -= self.learning_rate * dc

            if i % 100 == 0:
                print(f"Iteration {i+1}: m = {self.m}, c = {self.c}")

        return self

    def predict(self, X):
        """Return ``X @ m + c`` for a (n_samples, n_features) input."""
        return np.dot(X, self.m) + self.c

# Preprocessing: represent each distinct 'owner' value by its integer category code
owner_as_category = cardekho_data['owner'].astype('category')
cardekho_data['owner_encoded'] = owner_as_category.cat.codes

# Two-column feature matrix (kept 2-D) and the raw selling price as the target
feature_columns = ['year', 'owner_encoded']
X = cardekho_data[feature_columns].values
y = cardekho_data['selling_price'].values

# Standardize each column to zero mean and unit variance
column_means = X.mean(axis=0)
column_stds = X.std(axis=0)
X_scaled = (X - column_means) / column_stds

# Fit the gradient-descent regressor on the standardized features
print("\n")
regressor = GradientDescentLinearRegression(n_iters=1000, learning_rate=0.01)
regressor.fit(X_scaled, y)

# Predict on the same rows that were used for training
y_pred = regressor.predict(X_scaled)

# RMSE between actual and predicted prices, then the learned parameters
# (coefficients apply to the standardized features)
residuals = y - y_pred
rmse = np.sqrt(np.mean(np.square(residuals)))
print(f"\nRoot Mean Squared Error (RMSE): {rmse}")
print(f"\nFinal Model: y = {regressor.m[0]} * Year_Bought + {regressor.m[1]} * Owner + {regressor.c}")

Dataset Preview:
                       name  year  selling_price  km_driven    fuel  \
0             Maruti 800 AC  2007          60000      70000  Petrol   
1  Maruti Wagon R LXI Minor  2007         135000      50000  Petrol   
2      Hyundai Verna 1.6 SX  2012         600000     100000  Diesel   
3    Datsun RediGO T Option  2017         250000      46000  Petrol   
4     Honda Amaze VX i-DTEC  2014         450000     141000  Diesel   

  seller_type transmission         owner  
0  Individual       Manual   First Owner  
1  Individual       Manual   First Owner  
2  Individual       Manual   First Owner  
3  Individual       Manual   First Owner  
4  Individual       Manual  Second Owner  


Iteration 1: m = [ 4788.92547941 -2404.63819964], c = 10082.546235023043
Iteration 101: m = [190955.39356072 -49282.63602103], c = 438607.31435627805
Iteration 201: m = [219014.56183647 -34434.06990938], c = 495438.078794422
Iteration 301: m = [225997.84847737 -28199.65327001], c = 502974.949535

16.	The car dataset captures the selling price of used cars with respect to features such as year_bought, km_driven, transmission, and owner.

Objectives:

i.	Understand the dataset and clean it up (if required).

ii.	Build a regression model to predict the selling price w.r.t. km driven and owner.

iii.	Also evaluate the model using the RMSE score.


In [4]:
import pandas as pd
import numpy as np

# Read the CarDekho listings CSV and print its first rows as a preview
file_path = 'CAR DETAILS FROM CAR DEKHO.csv'
cardekho_data = pd.read_csv(file_path)
print("Dataset Preview:", cardekho_data.head(), sep="\n")

# Define Linear Regression class with Gradient Descent
class GradientDescentLinearRegression:
    """Multi-feature linear regression trained with batch gradient descent.

    The model is ``y_hat = X @ m + c`` and each iteration takes one step down
    the gradient of the mean squared error.

    Parameters
    ----------
    learning_rate : float
        Step size applied to the coefficient and intercept gradients.
    n_iters : int
        Number of gradient-descent iterations run by ``fit``.

    Attributes
    ----------
    m : coefficient vector (one entry per feature); ``None`` until ``fit`` runs.
    c : intercept.
    """

    def __init__(self, learning_rate=0.01, n_iters=1000):
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.m = None
        self.c = 0

    def fit(self, X, y):
        """Fit the model from scratch on ``X`` (n_samples, n_features) and ``y``.

        Both inputs may be any array-like; they are converted to float arrays.
        Coefficients AND intercept are reset before training, so calling
        ``fit`` again gives the same result as fitting a fresh instance.
        Progress is printed every 100 iterations (1-based iteration number).

        Returns ``self``. Raises ``ValueError`` if ``X`` is not 2-D.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be 2-D with shape (n_samples, n_features)")

        n_samples, n_features = X.shape
        self.m = np.zeros(n_features)  # Initialize coefficients
        self.c = 0.0                   # Reset the intercept together with the coefficients

        for i in range(self.n_iters):
            residuals = y - (np.dot(X, self.m) + self.c)
            dm = (-2 / n_samples) * np.dot(X.T, residuals)
            dc = (-2 / n_samples) * np.sum(residuals)
            self.m -= self.learning_rate * dm
            self.c -= self.learning_rate * dc

            if i % 100 == 0:
                print(f"Iteration {i+1}: m = {self.m}, c = {self.c}")

        return self

    def predict(self, X):
        """Return ``X @ m + c`` for a (n_samples, n_features) input."""
        return np.dot(X, self.m) + self.c

# Preprocessing: represent each distinct 'owner' value by its integer category code
owner_as_category = cardekho_data['owner'].astype('category')
cardekho_data['owner_encoded'] = owner_as_category.cat.codes

# Two-column feature matrix (kept 2-D) and the raw selling price as the target
feature_columns = ['km_driven', 'owner_encoded']
X = cardekho_data[feature_columns].values
y = cardekho_data['selling_price'].values

# Standardize each column to zero mean and unit variance
column_means = X.mean(axis=0)
column_stds = X.std(axis=0)
X_scaled = (X - column_means) / column_stds

# Fit the gradient-descent regressor on the standardized features
print("\n")
regressor = GradientDescentLinearRegression(n_iters=1000, learning_rate=0.01)
regressor.fit(X_scaled, y)

# Predict on the same rows that were used for training
y_pred = regressor.predict(X_scaled)

# RMSE between actual and predicted prices, then the learned parameters
# (coefficients apply to the standardized features)
residuals = y - y_pred
rmse = np.sqrt(np.mean(np.square(residuals)))
print(f"\nRoot Mean Squared Error (RMSE): {rmse}")
print(f"\nFinal Model: y = {regressor.m[0]} * KM_Driven + {regressor.m[1]} * Owner + {regressor.c}")

Dataset Preview:
                       name  year  selling_price  km_driven    fuel  \
0             Maruti 800 AC  2007          60000      70000  Petrol   
1  Maruti Wagon R LXI Minor  2007         135000      50000  Petrol   
2      Hyundai Verna 1.6 SX  2012         600000     100000  Diesel   
3    Datsun RediGO T Option  2017         250000      46000  Petrol   
4     Honda Amaze VX i-DTEC  2014         450000     141000  Diesel   

  seller_type transmission         owner  
0  Individual       Manual   First Owner  
1  Individual       Manual   First Owner  
2  Individual       Manual   First Owner  
3  Individual       Manual   First Owner  
4  Individual       Manual  Second Owner  


Iteration 1: m = [-2224.71053616 -2404.63819964], c = 10082.546235023043
Iteration 101: m = [-78082.11040187 -87818.06662348], c = 438607.3143562754
Iteration 201: m = [-82743.22613368 -94798.85891619], c = 495438.078794418
Iteration 301: m = [-82881.88764111 -95500.60992408], c = 502974.9495353

17.	The car dataset captures the selling price of used cars with respect to features such as year_bought, km_driven, transmission, and owner.

Objectives:

i.	Understand the dataset and clean it up (if required).

ii.	Build a regression model to predict the selling price w.r.t. transmission and owner.

iii.	Also evaluate the model using the RMSE score.


In [7]:
import pandas as pd
import numpy as np

# Read the CarDekho listings CSV and print its first rows as a preview
file_path = 'CAR DETAILS FROM CAR DEKHO.csv'
cardekho_data = pd.read_csv(file_path)
print("Dataset Preview:", cardekho_data.head(), sep="\n")

# Define Linear Regression class with Gradient Descent
class GradientDescentLinearRegression:
    """Multi-feature linear regression trained with batch gradient descent.

    The model is ``y_hat = X @ m + c`` and each iteration takes one step down
    the gradient of the mean squared error.

    Parameters
    ----------
    learning_rate : float
        Step size applied to the coefficient and intercept gradients.
    n_iters : int
        Number of gradient-descent iterations run by ``fit``.

    Attributes
    ----------
    m : coefficient vector (one entry per feature); ``None`` until ``fit`` runs.
    c : intercept.
    """

    def __init__(self, learning_rate=0.01, n_iters=1000):
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.m = None
        self.c = 0

    def fit(self, X, y):
        """Fit the model from scratch on ``X`` (n_samples, n_features) and ``y``.

        Both inputs may be any array-like; they are converted to float arrays.
        Coefficients AND intercept are reset before training, so calling
        ``fit`` again gives the same result as fitting a fresh instance.
        Progress is printed every 100 iterations (1-based iteration number).

        Returns ``self``. Raises ``ValueError`` if ``X`` is not 2-D.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be 2-D with shape (n_samples, n_features)")

        n_samples, n_features = X.shape
        self.m = np.zeros(n_features)  # Initialize coefficients
        self.c = 0.0                   # Reset the intercept together with the coefficients

        for i in range(self.n_iters):
            residuals = y - (np.dot(X, self.m) + self.c)
            dm = (-2 / n_samples) * np.dot(X.T, residuals)
            dc = (-2 / n_samples) * np.sum(residuals)
            self.m -= self.learning_rate * dm
            self.c -= self.learning_rate * dc

            if i % 100 == 0:
                print(f"Iteration {i+1}: m = {self.m}, c = {self.c}")

        return self

    def predict(self, X):
        """Return ``X @ m + c`` for a (n_samples, n_features) input."""
        return np.dot(X, self.m) + self.c

# Preprocessing: represent the categorical 'owner' and 'transmission' columns
# by their integer category codes
owner_as_category = cardekho_data['owner'].astype('category')
cardekho_data['owner_encoded'] = owner_as_category.cat.codes
transmission_as_category = cardekho_data['transmission'].astype('category')
cardekho_data['transmission_encoded'] = transmission_as_category.cat.codes

# Two-column feature matrix (kept 2-D) and the raw selling price as the target
feature_columns = ['transmission_encoded', 'owner_encoded']
X = cardekho_data[feature_columns].values
y = cardekho_data['selling_price'].values

# Standardize each column to zero mean and unit variance
column_means = X.mean(axis=0)
column_stds = X.std(axis=0)
X_scaled = (X - column_means) / column_stds

# Fit the gradient-descent regressor on the standardized features
print("\n")
regressor = GradientDescentLinearRegression(n_iters=1000, learning_rate=0.01)
regressor.fit(X_scaled, y)

# Predict on the same rows that were used for training
y_pred = regressor.predict(X_scaled)

# RMSE between actual and predicted prices, then the learned parameters
# (coefficients apply to the standardized features)
residuals = y - y_pred
rmse = np.sqrt(np.mean(np.square(residuals)))
print(f"\nRoot Mean Squared Error (RMSE): {rmse}")
print(f"\nFinal Model: y = {regressor.m[0]} * Transmission + {regressor.m[1]} * Owner + {regressor.c}")

Dataset Preview:
                       name  year  selling_price  km_driven    fuel  \
0             Maruti 800 AC  2007          60000      70000  Petrol   
1  Maruti Wagon R LXI Minor  2007         135000      50000  Petrol   
2      Hyundai Verna 1.6 SX  2012         600000     100000  Diesel   
3    Datsun RediGO T Option  2017         250000      46000  Petrol   
4     Honda Amaze VX i-DTEC  2014         450000     141000  Diesel   

  seller_type transmission         owner  
0  Individual       Manual   First Owner  
1  Individual       Manual   First Owner  
2  Individual       Manual   First Owner  
3  Individual       Manual   First Owner  
4  Individual       Manual  Second Owner  


Iteration 1: m = [-6134.28343286 -2404.63819964], c = 10082.546235023043
Iteration 101: m = [-261759.5679536   -90259.88196094], c = 438607.3143562754
Iteration 201: m = [-294212.58345424  -96579.88476695], c = 495438.0787944181
Iteration 301: m = [-298436.31413591  -96732.95808092], c = 502974.