In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import pickle as pk

In [2]:
def get_brand_name(car_name):
    """Return the brand, i.e. the first whitespace-delimited word of a car name.

    Parameters
    ----------
    car_name : str
        Full car name, e.g. ``'Maruti Swift Dzire VDI'``.

    Returns
    -------
    str
        The first word of ``car_name``, or ``''`` when the name is empty or
        consists only of whitespace.
    """
    # split() with no argument skips leading whitespace and treats any run of
    # whitespace as one separator, so ' Maruti Swift' yields 'Maruti' instead
    # of the empty string that split(' ')[0] would produce.
    words = car_name.split()
    return words[0] if words else ''

In [3]:
def clean_data(value):
    """Parse the leading number out of a ``'<number> <unit>'`` string.

    Parameters
    ----------
    value : object
        A cell value. Strings are parsed; anything else is passed through.

    Returns
    -------
    float or object
        For strings: the first whitespace-delimited token converted to
        ``float``, or ``np.nan`` when the string is blank or that token is
        not a number. Non-string values are returned unchanged.
    """
    if not isinstance(value, str):
        return value

    # Tokenise on any whitespace so that leading spaces (e.g. ' 23.4 kmpl')
    # do not produce an empty first token and a spurious NaN.
    tokens = value.split()
    if not tokens:
        return np.nan

    try:
        return float(tokens[0])
    except ValueError:
        # First token is not numeric (e.g. a bare unit such as 'bhp').
        return np.nan

In [4]:
def clean_torque(value):
    """Parse the leading magnitude out of a torque description string.

    The text before the first unit marker (``N``/``n``, as in ``Nm``) and
    before the first ``@`` is converted to ``float``.

    Parameters
    ----------
    value : object
        A cell value. Strings are parsed; anything else is passed through.

    Returns
    -------
    float or object
        The parsed magnitude, ``np.nan`` when the leading text is not a
        number, or ``value`` itself when it is not a string.

    Notes
    -----
    NOTE(review): no unit conversion is performed. If the data contains
    torque expressed in a unit other than Nm (e.g. kgm), those magnitudes are
    returned as written -- confirm against the dataset.
    """
    if not isinstance(value, str):
        return value

    # Upper-case before splitting so 'Nm', 'nm' and 'NM' are all recognised
    # as the unit marker; previously a lowercase 'nm' fell through to NaN.
    magnitude = value.upper().split('N')[0].split('@')[0].strip()
    try:
        return float(magnitude)
    except ValueError:
        return np.nan

In [5]:
# Load the raw dataset; the path is relative to the notebook's working directory.
cars_data = pd.read_csv('Cardetails.csv')

In [6]:
# Discard every row that has at least one missing value in the raw data.
cars_data = cars_data.dropna()

In [7]:
# Keep only the first occurrence of each fully identical row.
cars_data = cars_data.drop_duplicates()

In [8]:
# Reduce the full car name to its brand (first word).
cars_data['name'] = cars_data['name'].apply(get_brand_name)

# These columns hold '<number> <unit>' strings; keep only the numeric part.
for unit_column in ('mileage', 'max_power', 'engine'):
    cars_data[unit_column] = cars_data[unit_column].apply(clean_data)

# Torque has its own text format and therefore its own parser.
cars_data['torque'] = cars_data['torque'].apply(clean_torque)

In [11]:
# Encode each brand as an integer code: 1..31 in the order listed below.
# 'Land' and 'Ashok' are first words only, because get_brand_name keeps just
# the first word of the car name.
brand_names = [
    'Maruti', 'Skoda', 'Honda', 'Hyundai', 'Toyota', 'Ford', 'Renault',
    'Mahindra', 'Tata', 'Chevrolet', 'Datsun', 'Jeep', 'Mercedes-Benz',
    'Mitsubishi', 'Audi', 'Volkswagen', 'BMW', 'Nissan', 'Lexus',
    'Jaguar', 'Land', 'MG', 'Volvo', 'Daewoo', 'Kia', 'Fiat', 'Force',
    'Ambassador', 'Ashok', 'Isuzu', 'Opel',
]
brand_codes = {brand: code for code, brand in enumerate(brand_names, start=1)}

# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place call operates on a temporary object
# under pandas Copy-on-Write (the default from pandas 3.0), which would leave
# cars_data unchanged.
cars_data['name'] = cars_data['name'].replace(brand_codes)

In [13]:
# Encode transmission as an integer code. The result is assigned back rather
# than using a chained replace(..., inplace=True), which does not update
# cars_data under pandas Copy-on-Write (the default from pandas 3.0).
cars_data['transmission'] = cars_data['transmission'].replace(
    {'Manual': 1, 'Automatic': 2}
)

In [14]:
# Encode seller type as an integer code. The result is assigned back rather
# than using a chained replace(..., inplace=True), which does not update
# cars_data under pandas Copy-on-Write (the default from pandas 3.0).
cars_data['seller_type'] = cars_data['seller_type'].replace(
    {'Individual': 1, 'Dealer': 2, 'Trustmark Dealer': 3}
)

In [15]:
# Encode fuel type as an integer code. The result is assigned back rather
# than using a chained replace(..., inplace=True), which does not update
# cars_data under pandas Copy-on-Write (the default from pandas 3.0).
cars_data['fuel'] = cars_data['fuel'].replace(
    {'Diesel': 1, 'Petrol': 2, 'LPG': 3, 'CNG': 4}
)

In [16]:
# Encode ownership history as an integer code. The result is assigned back
# rather than using a chained replace(..., inplace=True), which does not
# update cars_data under pandas Copy-on-Write (the default from pandas 3.0).
cars_data['owner'] = cars_data['owner'].replace(
    {
        'First Owner': 1,
        'Second Owner': 2,
        'Third Owner': 3,
        'Fourth & Above Owner': 4,
        'Test Drive Car': 5,
    }
)

In [17]:
# The cleaning helpers return NaN for values they cannot parse; remove the
# rows where that happened.
cars_data = cars_data.dropna()

In [18]:
# Target: the selling price column.
output_data = cars_data['selling_price']
# Features: every remaining column.
input_data = cars_data.drop('selling_price', axis=1)

In [19]:
# Hold out 20% of the rows for testing. random_state is fixed so that the
# split -- and therefore the fitted coefficients and any prediction shown
# below -- is the same on every Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    input_data, output_data, test_size=0.2, random_state=42
)

In [20]:
# Linear regression estimator created with scikit-learn's default settings.
model = LinearRegression()

In [21]:
# Fit the regression on the training portion of the split.
model.fit(x_train, y_train)

In [22]:
# Predictions for the held-out test features. The result is stored in
# `predict` but is not compared against y_test in the cells shown here.
predict = model.predict(x_test)

In [23]:
# Record the feature names in training order so inference inputs can be
# arranged in the same order before calling model.predict.
model_columns = list(x_train.columns)

In [24]:
# Show the feature order the fitted model was trained with.
print("Model expects these columns:", model_columns)

Model expects these columns: ['name', 'year', 'km_driven', 'fuel', 'seller_type', 'transmission', 'owner', 'mileage', 'engine', 'max_power', 'torque', 'seats']


In [25]:
# One hand-written example row, with categorical features given as the
# integer codes produced by the encoding cells (name=5 is 'Toyota', fuel=1 is
# 'Diesel', seller_type=1 is 'Individual', transmission=1 is 'Manual',
# owner=1 is 'First Owner').
sample_car = {
    'name': 5,
    'year': 2022,
    'km_driven': 12000,
    'fuel': 1,
    'seller_type': 1,
    'transmission': 1,
    'owner': 1,
    'mileage': 12.99,
    'engine': 2494.0,
    'max_power': 100.6,
    'torque': 200.0,
    'seats': 5.0,
}
input_data_model = pd.DataFrame([sample_car])

In [26]:
# Arrange the example row's columns in the order recorded at training time.
input_data_model = input_data_model.loc[:, model_columns]

In [27]:
# Predict the selling price for the single example row built above.
prediction = model.predict(input_data_model)

In [28]:
# model.predict returns an array, so this prints a one-element array.
print("Prediction:", prediction)

Prediction: [983743.19717024]


In [29]:
# Persist the fitted model. The `with` block closes the file handle (and
# flushes the data to disk) even if pickling raises; the previous bare
# open() call never closed it.
with open('model.pkl', 'wb') as model_file:
    pk.dump(model, model_file)