## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import *
from xgboost import XGBRegressor

## Loading Data

In [3]:
date_converter = {'Prod. year':pd.to_datetime}

train=pd.read_csv("../input/mathcohack/train.csv",converters=date_converter)
test=pd.read_csv("../input/mathcohack/test.csv",converters=date_converter)

### Viewing the first 5 rows of train

In [4]:
train.head()

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,45654403,13328,1399,LEXUS,RX 450,2010-01-01,Jeep,Yes,Hybrid,3.5,186005 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12
1,44731507,16621,1018,CHEVROLET,Equinox,2011-01-01,Jeep,No,Petrol,3.0,192000 km,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8
2,45774419,8467,-,HONDA,FIT,2006-01-01,Hatchback,No,Petrol,1.3,200000 km,4.0,Variator,Front,04-May,Right-hand drive,Black,2
3,45769185,3607,862,FORD,Escape,2011-01-01,Jeep,Yes,Hybrid,2.5,168966 km,4.0,Automatic,4x4,04-May,Left wheel,White,0
4,45809263,11726,446,HONDA,FIT,2014-01-01,Hatchback,Yes,Petrol,1.3,91901 km,4.0,Automatic,Front,04-May,Left wheel,Silver,4


## Preprocessing

In [6]:
X=train.copy()
X['Current Year'] = pd.to_datetime('2021')
X['Age'] = X['Current Year'].dt.year-X['Prod. year'].dt.year
X.drop(["Price","Levy","ID","Current Year","Prod. year"],axis=1,inplace=True)

In [7]:
X.replace("სხვა","Other",inplace=True)

In [8]:
X[["Engine volume","Turbo"]]=X["Engine volume"].str.split(expand=True)

In [9]:
X["Leather interior"]=X["Leather interior"].map({"Yes":1,"No": 0})

In [10]:
X["Turbo"]=X["Turbo"].map({None:0,"Turbo":1})

In [11]:
#X["Leather interior"]=X["Leather interior"].map({"Yes":1,"No": 0})
#X["Turbo"]=X["Turbo"].map({"None":0,"Turbo":1})
X["Wheel"]=X["Wheel"].map({"Left wheel":0,"Right-hand drive":1})
X["Doors"]=X["Doors"].map({"04-May":0,"02-Mar":1,">5":2})
X["Drive wheels"]=X["Drive wheels"].map({"4x4":0,"Front":1,"Front":2})
X["Gear box type"]=X["Gear box type"].map({"Automatic":0,"Tiptronic":1,"Variator":2,"Manual":3})
X["Fuel type"]=X["Fuel type"].map({"Hybrid":0,"Petrol":1,"Diesel":2,"CNG":3,"Plug-in Hybrid":4,"LPG":5,"Hydrogen":6})
X["Category"]=X["Category"].map({"Jeep":0,"Hatchback":1,"Sedan":2,"Microbus":3,"Goods wagon":4,"Universal":5,"Coupe":6,"Minivan":7,"Cabriolet":8,"Limousine":9,"Pickup":10})


In [12]:
X["Drive wheels"].fillna(1, inplace=True)

In [13]:
X["Mileage"]=X["Mileage"].str.split(expand=True)[0].astype("int")

In [14]:
X[["Drive wheels","Cylinders"]]=X[["Drive wheels","Cylinders"]].astype("int")

In [15]:
X["Engine volume"]=X["Engine volume"].astype("float")

In [16]:
h=X.copy()
h.drop(["Manufacturer","Model","Color"], axis=1, inplace=True)

In [17]:
y=train.Price

### Spliting train data


In [18]:
x_train,x_test,y_train,y_test=train_test_split(h,y,train_size=0.8)

### Training XGB Model

In [19]:
xgb=XGBRegressor(max_depth=100,
    learning_rate=0.1,
    n_estimators=1000,
    verbosity=0,
    silent=None,
    objective='reg:linear',
    booster='gbtree',
    n_jobs=-1,
    nthread=None,
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    subsample=0.8,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=1,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    base_score=0.5,
    random_state=0,
    seed=None,)
xgb.fit(x_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=100,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=-1, nthread=4, num_parallel_tree=1,
             objective='reg:linear', random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, seed=0, silent=None, subsample=0.8,
             tree_method='exact', validate_parameters=1, verbosity=0)

### Predicting

In [20]:
p=xgb.predict(x_test)

### Evalution using RMSLE metric

In [21]:
np.sqrt(sklearn.metrics.mean_squared_log_error(y_test, abs(p)))

1.0349025842662645

## Test


### Preprocession the test data

In [22]:
R=test.copy()
R['Current Year'] = pd.to_datetime('2021')
R['Age'] = R['Current Year'].dt.year-R['Prod. year'].dt.year
R.drop(["Price","Levy","ID","Current Year","Prod. year"],axis=1,inplace=True)

In [23]:
R.replace("სხვა","Other",inplace=True)

In [24]:
R[["Engine volume","Turbo"]]=R["Engine volume"].str.split(expand=True)
R["Leather interior"]=R["Leather interior"].map({"Yes":1,"No": 0})
R["Turbo"]=R["Turbo"].map({None:0,"Turbo":1})

In [25]:
R["Engine volume"]=R["Engine volume"].astype("float")

In [27]:
R["Category"]=R["Category"].map({"Jeep":0,"Hatchback":1,"Sedan":2,"Microbus":3,"Goods wagon":4,"Universal":5,"Coupe":6,"Minivan":7,"Cabriolet":8,"Limousine":9,"Pickup":10})

In [28]:
R["Fuel type"]=R["Fuel type"].map({"Hybrid":0,"Petrol":1,"Diesel":2,"CNG":3,"Plug-in Hybrid":4,"LPG":5,"Hydrogen":6})


In [29]:
R["Gear box type"]=R["Gear box type"].map({"Automatic":0,"Tiptronic":1,"Variator":2,"Manual":3})


In [30]:
R["Drive wheels"]=R["Drive wheels"].map({"4x4":0,"Front":1,"Front":2})


In [31]:
R["Doors"]=test["Doors"].map({"04-May":0,"02-Mar":1,">5":2})


In [32]:
R["Wheel"]=R["Wheel"].map({"Left wheel":0,"Right-hand drive":1})


In [33]:
R["Mileage"]=R["Mileage"].str.split(expand=True)[0].astype("int")

In [34]:
R.head()

Unnamed: 0,Manufacturer,Model,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags,Age,Turbo
0,VOLKSWAGEN,Golf,1,0,2,2.0,0,4,3,2.0,1,0,Grey,10,9,1
1,HYUNDAI,Sonata,2,1,1,2.4,26000,4,1,2.0,0,0,Grey,10,9,0
2,NISSAN,Tiida,2,0,1,1.5,168000,4,0,2.0,0,1,Sky blue,8,16,0
3,VOLVO,XC90,0,1,1,3.2,143000,6,0,0.0,0,0,Blue,12,9,0
4,OPEL,Astra,1,0,1,1.6,200000,4,3,2.0,0,0,Black,0,28,0


In [35]:
R["Drive wheels"].fillna(1, inplace=True)

In [36]:
R["Engine volume"]=R["Engine volume"].astype("float")

In [37]:
R.drop(["Manufacturer","Model","Color"], axis=1, inplace=True)

### Predicting on test data using using the xgb model

In [38]:
l=xgb.predict(R)

In [39]:
prediction=pd.DataFrame(abs(l),columns=['Price'])

In [41]:
prediction.to_csv("solh.csv")