In [1]:
import pandas as pd

# Load the data
df = pd.read_csv("quikr_car.csv")

# Show first few rows
df.head()


Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel


In [2]:
df.info()
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        892 non-null    object
 1   company     892 non-null    object
 2   year        892 non-null    object
 3   Price       892 non-null    object
 4   kms_driven  840 non-null    object
 5   fuel_type   837 non-null    object
dtypes: object(6)
memory usage: 41.9+ KB


Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
count,892,892,892,892,840,837
unique,525,48,61,274,258,3
top,Honda City,Maruti,2015,Ask For Price,"45,000 kms",Petrol
freq,13,235,117,35,30,440


In [3]:
df = df.dropna()


In [6]:
# Remove 'Rs', ',', and convert to integer
df['price'] = df['price'].str.replace('Rs.', '', regex=False).str.replace(',', '', regex=False).str.strip()
df = df[df['price'] != 'Ask For Price']
df['price'] = df['price'].astype(int)


KeyError: 'price'

In [7]:
print(df.columns.tolist())


['name', 'company', 'year', 'Price', 'kms_driven', 'fuel_type']


In [8]:
df.columns = df.columns.str.strip()


In [9]:
print(df.columns.tolist())


['name', 'company', 'year', 'Price', 'kms_driven', 'fuel_type']


In [10]:
# Remove 'Rs', ',', and convert to integer
df['price'] = df['price'].str.replace('Rs.', '', regex=False).str.replace(',', '', regex=False).str.strip()
df = df[df['price'] != 'Ask For Price']
df['price'] = df['price'].astype(int)


KeyError: 'price'

In [11]:
df.columns = df.columns.str.strip()


In [12]:
df = df.dropna()


In [13]:
# Remove 'Rs.', ',', and convert to int
df['Price'] = df['Price'].str.replace('Rs.', '', regex=False)
df['Price'] = df['Price'].str.replace(',', '', regex=False)
df['Price'] = df['Price'].str.strip()

# Drop rows where price is not a number (e.g., "Ask for Price")
df = df[df['Price'].str.isnumeric()]

df['Price'] = df['Price'].astype(int)


AttributeError: Can only use .str accessor with string values!

In [14]:
# Convert to string first to avoid .str errors
df['Price'] = df['Price'].astype(str)

# Remove 'Rs.', commas, and leading/trailing spaces
df['Price'] = df['Price'].str.replace('Rs.', '', regex=False)
df['Price'] = df['Price'].str.replace(',', '', regex=False)
df['Price'] = df['Price'].str.strip()

# Remove rows like "Ask For Price"
df = df[df['Price'].str.isnumeric()]

# Convert cleaned price to int
df['Price'] = df['Price'].astype(int)


In [15]:
df['kms_driven'] = df['kms_driven'].astype(str)
df['kms_driven'] = df['kms_driven'].str.replace(' kms', '', regex=False)
df['kms_driven'] = df['kms_driven'].str.replace(',', '', regex=False)
df['kms_driven'] = df['kms_driven'].str.strip()

# Remove junk entries
df = df[df['kms_driven'].str.isnumeric()]
df['kms_driven'] = df['kms_driven'].astype(int)


In [16]:
print(df[['Price', 'kms_driven']].head())


    Price  kms_driven
0   80000       45000
1  425000          40
3  325000       28000
4  575000       36000
6  175000       41000


In [17]:
# For simplicity, encode 'fuel_type' and 'company' using one-hot encoding
df_encoded = pd.get_dummies(df, columns=['fuel_type', 'company'], drop_first=True)


In [18]:
# Features (X) and Target (y)
X = df_encoded.drop(['name', 'Price'], axis=1)
y = df_encoded['Price']


In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [20]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)


In [21]:
from sklearn.metrics import mean_absolute_error, r2_score

y_pred = model.predict(X_test)

print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))


Mean Absolute Error: 198286.4539147586
R² Score: 0.1877654982563528


In [22]:
sample_input = X_test.iloc[0:1]
predicted_price = model.predict(sample_input)
actual_price = y_test.iloc[0]

print("Predicted Price:", int(predicted_price[0]))
print("Actual Price:", actual_price)


Predicted Price: 71738
Actual Price: 135000


In [23]:
print("Available fuel types:", df['fuel_type'].unique())
print("Available companies:", df['company'].unique())


Available fuel types: ['Petrol' 'Diesel' 'LPG']
Available companies: ['Hyundai' 'Mahindra' 'Ford' 'Maruti' 'Skoda' 'Audi' 'Toyota' 'Renault'
 'Honda' 'Datsun' 'Mitsubishi' 'Tata' 'Volkswagen' 'Chevrolet' 'Mini'
 'BMW' 'Nissan' 'Hindustan' 'Fiat' 'Force' 'Mercedes' 'Land' 'Jaguar'
 'Jeep' 'Volvo']


In [24]:
# Collect user input
user_year = int(input("Enter Year of Manufacture: "))
user_kms = int(input("Enter Kilometers Driven: "))
user_fuel = input("Enter Fuel Type (e.g., Petrol, Diesel, CNG): ")
user_company = input("Enter Company (e.g., Maruti, Hyundai): ")


Enter Year of Manufacture: 2015
Enter Kilometers Driven: 50000
Enter Fuel Type (e.g., Petrol, Diesel, CNG): petrol
Enter Company (e.g., Maruti, Hyundai): maruti


In [25]:
# Create empty row with same columns as X
input_data = pd.DataFrame(columns=X.columns)

# Set numeric values
input_data.loc[0, 'year'] = user_year
input_data.loc[0, 'kms_driven'] = user_kms

# Set one-hot encoded columns
for col in X.columns:
    if f'fuel_type_{user_fuel}' == col:
        input_data.loc[0, col] = 1
    elif f'company_{user_company}' == col:
        input_data.loc[0, col] = 1
    elif col not in ['year', 'kms_driven']:
        input_data.loc[0, col] = 0  # Set rest to 0


In [26]:
predicted_price = model.predict(input_data)[0]
print(f"\nEstimated Car Price: ₹{int(predicted_price):,}")



Estimated Car Price: ₹1,574,015


In [27]:
import pickle

# Save model
with open('car_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Save the column names used during training
with open('columns.pkl', 'wb') as f:
    pickle.dump(X.columns.tolist(), f)
