In [1]:
# Description
'''
・Drop rows with missing price values
・Create the following concat variables
    drive*fuel
    drive*condition
    drive*size
    fuel*condition
    fuel*size
    condition*size
・Conver manufacturer to lower case 
・Set unrealistic years to average year
・Treat extreme values in odometer to be outliers (1e10)
・Use the target encoding for all categorical variables
'''

'\n・Create the following concat variables\n drive*fuel\n drive*condition\n drive*size\n fuel*condition\n fuel*size\n condition*size\n\n・Use the target encoding for all categorical variables\n・Conver manufacturer to lower case \n'

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.preprocessing as preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgbm
from sklearn.metrics import mean_absolute_percentage_error

In [15]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [18]:
# drop rows with missing price (accidentally shifted?)
train_df = train_df.dropna(subset=['price'])

# create new feature columns
train_df['drive_fuel'] = train_df['drive'] + '_' + train_df['fuel']
train_df['drive_condition'] = train_df['drive'] + '_' + train_df['condition']
train_df['drive_size'] = train_df['drive'] + '_' + train_df['size']
train_df['fuel_condition'] = train_df['fuel'] + '_' + train_df['condition']
train_df['fuel_size'] = train_df['fuel'] + '_' + train_df['size']
train_df['condition_size'] = train_df['condition'] + '_' + train_df['size']

# Make manufacturer lowercase
train_df['manufacturer'] = train_df['manufacturer'].str.lower()

# change unrealistic year values to nan
index_to_interpolate = train_df[train_df['year'] > 2023].index
train_df.loc[index_to_interpolate, 'year'] = train_df.loc[index_to_interpolate, 'year'].fillna('')

# change odometer = -1 to median odometer
index_to_interpolate = train_df[train_df['odometer'] == -1].index
median_odometer = train_df.loc[train_df['odometer'] != -1, 'odometer'].median()
train_df.loc[index_to_interpolate, 'odometer'] = train_df.loc[index_to_interpolate, 'odometer'].fillna(median_odometer)

# change -1 > odometer, odometer > 1e6 to 1e10
index_to_interpolate = train_df[(train_df['odometer'] < -1e2) | (train_df['odometer'] > 1e6)].index
train_df.loc[index_to_interpolate, 'odometer'] = train_df.loc[index_to_interpolate, 'odometer'].fillna(1e10)

print(train_df[train_df['odometer'] == -1])

train_df.to_csv('model1.csv', index=False)

          id              region  year manufacturer  condition    cylinders  \
44        44           rochester  2008         jeep       fair  6 cylinders   
211      211       inland empire  1973    chevrolet       good  8 cylinders   
276      276           baltimore  2010          bmw       good  6 cylinders   
583      583        rhode island  2010         jeep  excellent  6 cylinders   
611      611              merced  2017       subaru   like new  4 cylinders   
...      ...                 ...   ...          ...        ...          ...   
27036  27036     medford-ashland  2006         ford  excellent  6 cylinders   
27138  27138              denver  2011    chevrolet  excellent  6 cylinders   
27207  27207         tallahassee  2018          bmw   like new  4 cylinders   
27389  27389       orange county  2011         ford   like new  6 cylinders   
27498  27498  huntington-ashland  2010    chevrolet       fair  4 cylinders   

      fuel  odometer title_status transmission  ...