In [48]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
import xgboost
import lightgbm
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn import metrics
from sklearn.metrics import mean_squared_error

In [3]:
df_banglore = pd.read_csv("../input/used-car-information/UsedCars_Banglore.csv",index_col = False)
df_chennai = pd.read_csv("../input/used-car-information/UsedCars_Chennai.csv",index_col = False)
df_delhi = pd.read_csv("../input/used-car-information/UsedCars_Delhi.csv",index_col = False)
df_hyderabad = pd.read_csv("../input/used-car-information/UsedCars_Hyderabad.csv",index_col = False)
df_kolkata = pd.read_csv("../input/used-car-information/UsedCars_Kolkata.csv",index_col = False)
df_mumbai = pd.read_csv("../input/used-car-information/UsedCars_Mumbai.csv",index_col = False)
df_pune = pd.read_csv("../input/used-car-information/UsedCars_Pune.csv",index_col = False)


In [4]:
df = pd.concat([df_banglore , df_chennai ,df_delhi , df_hyderabad ,df_kolkata , df_mumbai ,df_pune ])
df

In [5]:
df = df.drop(columns =  ["Unnamed: 0"] , axis = 1)

In [6]:
df.reset_index(inplace  =True , drop = True)
df.head()

In [7]:
df.shape

In [8]:
df.columns

In [9]:
df['full_model_name']

In [10]:
df = df.drop(columns=['full_model_name'] , axis = 1)

In [11]:
df.head()

In [12]:
print(df.describe())
print("*"*50)
print(df.info())

In [13]:
df.isnull().sum()

In [14]:
df.duplicated().sum()

In [15]:
df.drop_duplicates(keep='first' , inplace = True)
df.shape

In [16]:
df.head()

In [17]:
data = df[['year' , 'price' , 'distance_travelled(kms)']]
data.plot(kind = 'box' , figsize = (20,10) , subplots=True)
plt.show()

In [18]:
df = df[df['year']>2005]
df.shape

In [19]:
df.head()

In [20]:
df.reset_index(inplace=True , drop = True)

In [21]:
sns.histplot(data = df['year'])

In [22]:
sns.histplot(data = df['price'])

### **Which brands of cars are common in used cars?**

In [23]:
df['brand'].unique()

In [24]:
df['brand'].value_counts()

In [25]:
# check top 10 common car brands
plt.figure(figsize=(12,5))
df['brand'].value_counts()[:10].sort_values(ascending=True).plot(kind='barh')
plt.title("Top 10 Common Car Brands")
plt.show()

#### **'Maruti Suzuki'** and **'Hyundai'** are the most common car brands in used cars.

### **Which models of cars are common in used cars?**

In [26]:
df['model_name'].value_counts()

In [27]:
plt.figure(figsize=(12,5))
df['model_name'].value_counts()[:10].sort_values(ascending=True).plot(kind='barh')
plt.title("Top 10 Common Model Names")
plt.show()

#### **'Creta'** and **'City'** are the most common car models in used car.
- We also found that the top 3 models "Creta", "City", "Swift" equal the top 3 most common car brands "Hyundai", "Honda", "Maruti Suzuki" respectively.

### **Which fuel type is more common in used cars?**

In [28]:
df_pie=df['fuel_type'].value_counts()
df_pie

In [29]:
labels = ['Diesel', 'Petrol', 'CNG + 1', 'Petrol + 1', 'Hybrid']
colors = sns.color_palette('Paired')

plt.pie(df_pie , colors = colors , labels = labels , startangle=90)
plt.title("Fuel Types in Used Cars")
plt.show()

#### **"Diesel"** is the most common fuel type in used cars.

### **Does the car fuel type affect the selling price?**

In [30]:
plt.ticklabel_format(style = 'plain')
plt.ylabel("price")
plt.xlabel("fuel type")
plt.bar(df['fuel_type'] , df['price'])
plt.show()

#### Although **"Diesel"** and **"Petrol"** are the most common fuel type, They have a **higher price** for used cars than other fuel types.

### **Does the city have an effect on the selling price?**

In [31]:
plt.figure(figsize=(12,5))
plt.ticklabel_format(style = 'plain' )
plt.xticks(rotation = 45)
plt.bar(df['city'], sorted(df['price']))
plt.title('City by Selling Price')
plt.show()

#### **"Pune"** has the highest price for used cars among other cities.

In [32]:
current_year = 2021
age = []

for i in df['year']:
    age.append(current_year-int(i))

In [34]:
df['car_age'] = age

In [35]:
df.head()

In [36]:
thresh = 30000
temp = []
for i in df['distance_travelled(kms)']:
    if i>=thresh:
        temp.append(1)
    else:
        temp.append(0)
df['distance below 30k km'] = temp
df.head()

In [38]:
df.shape[0]

In [39]:
temp = []
for i in range(0,df.shape[0] , 1):
    if ((df['distance below 30k km'][i]==1) and (df['car_age'][i]<4)):
        temp.append(int(1))
    else:
        temp.append(int(0))
        
df['new and less used'] = temp
df.head()

In [57]:
X= df[['price','distance_travelled(kms)']]
y= df["new and less used"]

In [58]:
X_train ,X_test , y_train , y_test = train_test_split(X,y , test_size = 0.2 , random_state = 42)

In [59]:
print(X_train.shape , X_test.shape)

In [60]:
from sklearn.metrics import fbeta_score

def F2(reg , X_train , y_train , X_test , y_test):
    reg.fit(X_train , y_train)
    predictions = reg.predict(X_test)
    score = fbeta_score(y_test , predictions ,beta = 0.5 , average = 'binary')
    return score

In [61]:
cat = CatBoostClassifier(silent=True)
lgb = lightgbm.LGBMClassifier()
xgb = xgboost.XGBClassifier()
rfc = RandomForestClassifier()
models_list = [cat , lgb ,xgb , rfc]

In [62]:
X_train

In [63]:
for model in models_list:
    print(f"score of {model} is {F2(model , X_train , y_train , X_test , y_test)} \n")

In [64]:
model = cat
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = model.score(X_test, y_test)
print(accuracy)

###### After building a calssification model based on the **car price** and  **kms driven** using **Cat Boost Classifier**, The model accuracy was **88%**.