# Data reading and cleaning (1 block)

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.0.3-cp37-none-manylinux1_x86_64.whl (76.3 MB)
[K     |████████████████████████████████| 76.3 MB 1.2 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.3


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error, mean_squared_log_error
from catboost import CatBoostRegressor
from scipy.stats import zscore

# Use fully qualified option keys: the short forms 'max_rows' / 'max_columns' match
# more than one option on newer pandas versions and then raise an OptionError.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

def metrics_print(actual, predicted, data_set):
    """Print a label followed by RMSLE, MAE, MAPE and R^2 for a set of predictions.

    actual    -- ground-truth target values
    predicted -- model predictions, same length as ``actual``
    data_set  -- label printed first (converted with an f-string)
    """
    print(f'{data_set}')
    # Each metric is evaluated right before it is printed, in this order.
    scorers = (
        ('RMSLE', lambda a, p: np.sqrt(mean_squared_log_error(a, p))),
        ('MAE', mean_absolute_error),
        ('MAPE', mean_absolute_percentage_error),
        ('r2_score', r2_score),
    )
    for label, scorer in scorers:
        print(label, scorer(actual, predicted))
    
# weather data: read the train and test weather files, strip whitespace from the
# column names and stack both files into a single frame (df1)
df1 = pd.read_csv('train_weather.csv', encoding='utf8').rename(columns=str.strip)
print(df1.shape)

# the test file names the event column 'EVENTS'; align it with the train file
df0 = (
    pd.read_csv('test_weather.csv', encoding='utf8')
    .rename(columns=str.strip)
    .rename(columns={'EVENTS': 'EVENT'})
)
print(df0.shape)

df1 = pd.concat([df1, df0])

# main data: train rows are flagged with is_train = 1
df = (
    pd.read_csv('train.csv', encoding='utf8')
    .rename(columns=str.strip)
    .fillna('')
    .assign(is_train=1)
)

# test rows go through the same treatment and are flagged with is_train = 0
df0 = (
    pd.read_csv('test.csv', encoding='utf8')
    .rename(columns=str.strip)
    .fillna('')
    .assign(is_train=0)
)

# single dataset holding both train and test
df = pd.concat([df, df0])

# conversion to float
def conv2float(x):
    """Convert ``x`` to float.

    Values that cannot be parsed are printed (so they can be inspected) and
    returned as NaN, which keeps the resulting column numeric. Only conversion
    errors are caught; anything else (e.g. KeyboardInterrupt) propagates.
    """
    try:
        return float(x)
    except (TypeError, ValueError):
        print(x)
        return np.nan

# cleaning: weather readings may use a decimal comma; normalise to a dot, drop rows
# whose value contains more than one dot, then convert the column to float
feats_agg = ['AIR_TEMP', 'TRACK_TEMP','HUMIDITY', 'PRESSURE', 'WIND_SPEED', 'WIND_DIRECTION', 'RAIN']
for col in feats_agg:
    as_text = df1[col].map(str).str.replace(',', '.', regex=False)
    df1[col] = as_text
    # at most one '.' means at most two pieces when splitting on '.'
    df1 = df1[as_text.str.count(r'\.') < 2]
    df1[col] = df1[col].map(conv2float)

# weather analysis
# Parse the timestamp column; the per-day race numbering below relies on it.
df1['TIME_UTC_STR'] = pd.to_datetime(df1['TIME_UTC_STR'])

# NOTE: an exploratory groupby(['LOCATION', 'EVENT']) mean/min/max/std aggregation used
# to be computed here, but its result (df2) and its helper lists (feats_gr, feats_agg)
# were all reassigned further down before being read, so it has been removed.

# creating race_number: number the distinct days of each (LOCATION, EVENT) pair 1, 2, ...
df1 = df1.sort_values(['LOCATION', 'EVENT', 'TIME_UTC_STR'])
# the first 10 characters of the timestamp's string form are used as the day key
df1['day'] = df1['TIME_UTC_STR'].astype(str).str.slice(0, 10)

feats = ['LOCATION', 'EVENT', 'day']
# one row per (LOCATION, EVENT, day); the size column (labelled 0) is not needed
df2 = df1.groupby(feats).size().reset_index().drop(columns=0)
df2['race_number'] = df2.groupby(feats[:2]).cumcount() + 1

# join with initial table: attach race_number to every weather reading
# (keys are spelled out; they are the columns df1 and df2 have in common)
df3 = df1.merge(df2, on=['LOCATION', 'EVENT', 'day'])

# group by feats_gr and calc mean, std of feats_agg
feats_gr  = ['LOCATION','EVENT','race_number','day']
feats_agg = ['AIR_TEMP', 'TRACK_TEMP','HUMIDITY', 'PRESSURE', 'WIND_SPEED', 'WIND_DIRECTION', 'RAIN']
# String aggregator names keep the pandas implementations (sample std) and avoid the
# deprecation warning raised when NumPy callables are passed to agg().
df4 = df3.groupby(feats_gr)[feats_agg].agg(['mean', 'std'])
# flatten the (feature, statistic) column MultiIndex into 'FEATURE_stat' names
df4.columns = ['_'.join(c) for c in df4.columns]
df4 = df4.reset_index()

# and then we will merge it with main data
# NOTE(review): df4 starts with four key columns, so this slice also contains 'day' - confirm intended
feats_weather = list(df4.columns[3:])

# drop columns that are not used further
df = df.drop(columns=['DRIVER_NUMBER', 'GROUP'])

# 'driver' = TEAM + '_' + DRIVER_NAME; pop() removes each source column as it is read
df['driver'] = df.pop('TEAM') + '_' + df.pop('DRIVER_NAME')


# new feature: race_number = ordinal of each repeated appearance of the same
# (LOCATION, EVENT, driver, LAP_NUMBER) combination
feats = ['LOCATION', 'EVENT', 'driver', 'LAP_NUMBER']
df['race_number'] = df.groupby(feats).cumcount() + 1

# number of unique NUMBER values per driver
# (author's finding: each driver has a single NUMBER that does not change between laps)
pd.set_option('display.float_format', '{:,.0f}'.format)
feats = feats[:-1]
# exploratory expression; its value is not displayed when it is not the cell's last line
df.groupby(feats)['NUMBER'].nunique().unstack().max(axis=1).unstack().fillna(0)

feats_gr = ['LOCATION', 'EVENT', 'driver', 'race_number']
feats_lap = ['LAP_NUMBER']
feats_time = ['S1', 'S2', 'S3', 'ELAPSED', 'HOUR', 'S1_LARGE', 'S2_LARGE', 'S3_LARGE', 'PIT_TIME']

# turn every time-like column into whitespace-trimmed text
for col in feats_time:
    df[col] = df[col].map(str).str.strip()
# exploratory peek; not displayed unless it is the last expression of a cell
df[feats_time].head(5)


def str2time(x):
    """Convert a colon-separated time string to seconds.

    Accepts 'S', 'M:S' or 'H:M:S' (each field may be fractional). An empty or
    whitespace-only string yields NaN.

    Each field, counted from the right, is weighted by 60**position (1, 60,
    3600, ...). The previous 60*position weighting was only correct up to the
    minutes field and under-counted hours.
    """
    x = x.strip()
    if x == '':
        return np.nan
    fields = x.split(':')
    return sum(float(t) * 60 ** i for i, t in enumerate(reversed(fields)))

# convert every time-like text column to seconds
for col in feats_time:
    df[col] = df[col].apply(str2time)
# exploratory null count; not displayed unless it is the last expression of a cell
df[feats_time].isnull().sum()

# the *_LARGE variants of the sector times are not used further
df = df.drop(columns=[f'{sector}_LARGE' for sector in ('S1', 'S2', 'S3')])
    
# check 2: within a group, the running total of S1 + S2 + S3 should equal ELAPSED
# (author's finding: true until the first null appears in S1, S2 or S3)
feats_gr = ['LOCATION', 'EVENT', 'driver', 'race_number']
df = df.sort_values(feats_gr + ['LAP_NUMBER'])

per_run = df.groupby(feats_gr)
df['S'] = per_run['S1'].cumsum() + per_run['S2'].cumsum() + per_run['S3'].cumsum()
df['delta'] = df['S'] - df['ELAPSED']

# rows where the two disagree by more than one second / rows from the first laps
f1 = df['delta'].abs() > 1
f2 = df['LAP_NUMBER'] < 8

feats_time = ['S1', 'S2', 'S3', 'ELAPSED', 'HOUR', 'PIT_TIME', 'S', 'delta']
# exploratory peek at the mismatches; not displayed unless it ends a cell
df[f1][feats_gr + feats_time].head(5)


# check 3: s1+s2+s3 = diff of hour
del df['HOUR']

# create feature S (sum of the three sector times) and S_PIT (S minus pit time)
df['S'] = df['S1'] + df['S2'] + df['S3']
df['S_PIT'] = df['S'] - df['PIT_TIME'].fillna(0)

# binary flag: finish line crossed in the pit (source value 'B')
df['pit_b'] = (df['CROSSING_FINISH_LINE_IN_PIT'] == 'B').astype(int)
del df['CROSSING_FINISH_LINE_IN_PIT']

# POWER becomes an ordinal code: blank -> 0, 235 -> 1, 250 -> 2
df['POWER'] = df['POWER'].replace({'':0, 235.0:1, 250.0:2})

feats_real   = ['KPH']
feats_target = ['LAP_TIME']

# Blank KPH becomes NaN. The builtin float replaces the np.float alias, which was
# removed from NumPy (the two are the same type).
df['KPH'] = df['KPH'].replace('', np.nan).astype(float)

# attach the per-race weather aggregates
feats_join = ['LOCATION', 'EVENT', 'race_number']
df = df.merge(df4, on = feats_join, how = 'left')

# DATA FOR TRAINING CREATED

# DATA FOR TRAINING CREATED

(914, 11)
(167, 11)


# Model training and submission creation (1 block)

In [None]:
# one general-purpose encoder (le) plus one fitted encoder per key column
le, le1, le2, le3, le4, le5 = (LabelEncoder() for _ in range(6))
for encoder, column in (
    (le1, 'LOCATION'),
    (le2, 'EVENT'),
    (le3, 'driver'),
    (le4, 'race_number'),
    (le5, 'LAP_NUMBER'),
):
    encoder.fit(df[column])


def _encoded(encoder, column):
    """Integer label of every row of df[column], as a string Series on a fresh 0..n-1 index."""
    return pd.Series(encoder.transform(df[column])).astype(str)


# race_id = LOCATION + EVENT + race_number (encoded, joined with '_')
df['race_id'] = _encoded(le1, 'LOCATION') + '_' + _encoded(le2, 'EVENT') + '_' + _encoded(le4, 'race_number')

# race_driver_id = race_id + driver (encoded)
df['race_driver_id'] = df['race_id'] + '_' + _encoded(le3, 'driver')


# Finalising the features in the specific order
final_columns = [
    'race_id', 'race_driver_id', 'driver', 'race_number', 'LOCATION', 'EVENT', 'NUMBER',
    'LAP_NUMBER', 'LAP_IMPROVEMENT',
    'S1', 'S1_IMPROVEMENT', 'S2', 'S2_IMPROVEMENT', 'S3', 'S3_IMPROVEMENT',
    'KPH', 'ELAPSED', 'PIT_TIME', 'POWER', 'is_train',
    'S', 'delta', 'S_PIT', 'LAP_TIME', 'pit_b',
]
df = df[final_columns]
print(df.shape)


print(df.shape)
# LAPS_in_RACE: highest lap number seen in each race
laps_per_race = df.groupby('race_id')['LAP_NUMBER'].max().rename('LAPS_in_RACE').reset_index()
df = df.merge(laps_per_race, on='race_id', how='left')

print(df.shape)

# filling NaNs in PIT_TIME with 0
df['PIT_TIME'] = df['PIT_TIME'].fillna(0)

# drivers_in_RACE: number of distinct drivers in each race
drivers_per_race = df.groupby('race_id')['driver'].nunique().rename('drivers_in_RACE').reset_index()
df = df.merge(drivers_per_race, on='race_id', how='left')

print(df.shape)

# drivers_in_LAP: number of distinct drivers in each lap of each race
lap_key = ['race_id', 'LAP_NUMBER']
drivers_per_lap = df.groupby(lap_key)['driver'].nunique().rename('drivers_in_LAP').reset_index()
df = df.merge(drivers_per_lap, on=lap_key, how='left')

# drivers_droped: absolute row-to-row change of drivers_in_LAP within a race, rows
# ordered by lap (first row of a race gets 0); assigned back by index
lap_ordered = df.sort_values(lap_key)
df['drivers_droped'] = lap_ordered.groupby('race_id')['drivers_in_LAP'].diff().fillna(0).abs()

df = df.drop(columns=['delta'])

print(df.shape)

# Report, separately for test (is_train == 0) and train (is_train == 1), the shape
# of the sub-frame that has a missing value in each sector time and in KPH.
for label, flag in (('Test', 0), ('Train', 1)):
    print(label)
    in_subset = df['is_train'] == flag
    for col in ('S1', 'S2', 'S3', 'KPH'):
        print(f'{col} NaNs', df[(df[col].isna()) & in_subset].shape)

print(df.shape)

# Hand-picked patches for specific rows. The ids are the label-encoded ids built
# earlier, so they are tied to the encoding of this particular dataset.
lap3 = df['LAP_NUMBER'] == 3
lap4 = df['LAP_NUMBER'] == 4

df.loc[(df['race_id'] == '5_3_0') & lap3 & df['KPH'].isna(), 'KPH'] = 50
# NOTE(review): unlike the line above, this one has no isna() condition and so
# overwrites KPH for every row of that race/lap - confirm this is intended
df.loc[(df['race_id'] == '5_5_0') & lap4, 'KPH'] = 100

df.loc[(df['race_driver_id'] == '5_3_0_8') & lap3, 'S3'] = 18
df.loc[(df['race_driver_id'] == '5_5_0_24') & df['S3'].isna(), 'S3'] = 23.

# recompute the sums now that some sector times were patched
df['S'] = df['S1'] + df['S2'] + df['S3']
df['S_PIT'] = df['S'] - df['PIT_TIME']

# rows still lacking a sector time are dropped; after that no NaN may remain anywhere
df = df.dropna(subset=['S1', 'S2', 'S3'])
assert not df.isna().any().any()


# creating features: KPH * sector time per sector, plus their total
sectors = ['S1', 'S2', 'S3']
for sector in sectors:
    df[f'{sector}_dist'] = df['KPH'] * df[sector]
df['S_dist'] = df['S1_dist'] + df['S2_dist'] + df['S3_dist']

# median of each *_dist column over all rows of the same race and lap
lap_key = ['race_id', 'LAP_NUMBER']
for prefix in sectors + ['S']:
    source = f'{prefix}_dist'
    temp = (
        df.groupby(lap_key)[source]
        .median()
        .reset_index()
        .rename(columns={source: f'{source}_race_id_LAP_median'})
    )
    df = df.merge(temp, on=lap_key, how='left')

# creating features: lap-median distance divided by the row's own KPH
for prefix in ['S'] + sectors:
    df[f'estimated_time_{prefix}'] = df[f'{prefix}_dist_race_id_LAP_median'] / df['KPH']

# defining the submission Locations and events
test_LOCATION = ['Location 6', 'Location 7', 'Location 8']
test_EVENT = ['Qualifying Group 1', 'Qualifying Group 2', 'Qualifying Group 3', 'Qualifying Group 4']

# creating features: rank of the row inside its race/lap after ordering by race_id, S
by_time = df.sort_values(['race_id', 'S'])
df['place_in_LAP'] = by_time.groupby(['race_id', 'LAP_NUMBER']).cumcount() + 1

print(df.shape)


def _sample_zscore(values):
    """z-score of one group's values using the sample standard deviation (ddof=1)."""
    return zscore(values, ddof=1)


# creating features: per race_driver_id z-score of each sector time; values that
# come out as NaN are replaced by the column median
for sector in ('S1', 'S2', 'S3'):
    z_col = f'{sector}_zscore'
    df[z_col] = df.groupby(['race_driver_id'])[sector].transform(_sample_zscore)
    df[z_col] = df[z_col].fillna(df[z_col].median())

z_cols = ['S1_zscore', 'S2_zscore', 'S3_zscore']
df['min_zscore'] = df[z_cols].min(axis=1)
df['max_zscore'] = df[z_cols].max(axis=1)

# estimated lap time * lap number, and its relative gap to ELAPSED
df['estimated_ELAPSED'] = df['estimated_time_S'] * df['LAP_NUMBER']
df['ELAPSED_RATIO'] = (df['estimated_ELAPSED'] - df['ELAPSED']).abs() / df['ELAPSED']

# split the combined frame back into its test and train parts
df_test = df[df['is_train'] == 0].copy()
df_train = df[df['is_train'] == 1].copy()

df_train['LAP_TIME'] = df_train['LAP_TIME'].astype(float)

# creating filters
f_test_loc = df_train['LOCATION'].isin(test_LOCATION)
f_test_event = df_train['EVENT'].isin(test_EVENT)

# training only on the submission locations, excluding the submission events.
# .copy() makes this an independent frame, so the column assignments below do not
# trigger SettingWithCopyWarning.
df_model_2 = df_train[f_test_loc & ~f_test_event].copy()

# !!!!!!!!!!!!!!!!!! training only on this data
# Location 6,  Free Practice 1
# Location 6,  Free Practice 2
# Location 6,  Free Practice 3
# Location 7,  Free Practice 1
# Location 7,  Free Practice 2
# Location 8,  Free Practice 1
# Location 8,  Free Practice 2

# Encoding categorical features.
# The encoder is fitted once per column on the union of training and test values, so
# a given category maps to the same integer in both frames. Fitting separately on
# each frame would assign unrelated codes to the same driver/location.
for x in ['driver', 'LOCATION', 'EVENT', 'race_id', 'race_driver_id']:
    le.fit(pd.concat([df_model_2[x], df_test[x]]))
    df_model_2[x] = le.transform(df_model_2[x])
    df_test[x] = le.transform(df_test[x])

print('df--------', df.shape)
print('df_test---', df_test.shape[0])
print('df_train--', df_train.shape[0])
print('df_model_2', df_model_2.shape[0])

# available features
feat0 = [
    'race_id', 'race_driver_id', 'driver', 'race_number', 'LOCATION', 'EVENT', 'NUMBER',
    'LAP_NUMBER', 'LAP_IMPROVEMENT',
    'S1', 'S1_IMPROVEMENT', 'S2', 'S2_IMPROVEMENT', 'S3', 'S3_IMPROVEMENT',
    'KPH', 'ELAPSED', 'PIT_TIME', 'POWER', 'is_train', 'S', 'S_PIT', 'pit_b',
    'LAPS_in_RACE', 'drivers_in_RACE', 'drivers_in_LAP', 'drivers_droped',
    'S1_dist', 'S2_dist', 'S3_dist', 'S_dist',
    'S1_dist_race_id_LAP_median', 'S2_dist_race_id_LAP_median',
    'S3_dist_race_id_LAP_median', 'S_dist_race_id_LAP_median',
    'estimated_time_S', 'estimated_time_S1', 'estimated_time_S2', 'estimated_time_S3',
    'place_in_LAP',
    'S1_zscore', 'S2_zscore', 'S3_zscore', 'min_zscore', 'max_zscore',
]

# keep only rows with a strictly positive target
df_model_2 = df_model_2.loc[df_model_2['LAP_TIME'].gt(0)]

# defining list of features (same list object as feat0)
feat = feat0

# train / validation split: 80 % / 20 %, fixed seed
x, xv, y, yv = train_test_split(
    df_model_2[feat],
    df_model_2['LAP_TIME'],
    test_size=0.2,
    random_state=4,
)
print(x.shape, xv.shape)

# log1p-transformed targets
y_log, yv_log = np.log1p(y), np.log1p(yv)

# training 2 models on the log1p-transformed target
# random_state is fixed (same seed as the train/validation split) so that the random
# forest, and therefore the submission, is reproducible between runs
model2 = RandomForestRegressor(n_estimators=2000, max_depth=5, random_state=4)
model3 = CatBoostRegressor(verbose=False)

model2.fit(x, y_log)
model3.fit(x, y_log)

# Predictions are mapped back with expm1 and the two models are averaged.
# The expression is wrapped in parentheses so it is a single valid statement; the
# previous form ended the line right after '=' and was a SyntaxError.
pred_test = (
    np.expm1(model2.predict(df_test[feat]))
    + np.expm1(model3.predict(df_test[feat]))
) / 2

# creating submission: a single LAP_TIME column, no index
df_test['LAP_TIME'] = pred_test

df_test[['LAP_TIME']].to_csv('my_submission_file.csv', index=False)

(10696, 25)
(10696, 25)
(10696, 26)
(10696, 27)
(10696, 28)
Test
S1 NaNs (0, 28)
S2 NaNs (0, 28)
S3 NaNs (2, 28)
KPH NaNs (2, 28)
Train
S1 NaNs (4, 28)
S2 NaNs (8, 28)
S3 NaNs (34, 28)
KPH NaNs (30, 28)
(10696, 28)
(10660, 41)


  keepdims=keepdims)
  ret, rcount, out=ret, casting='unsafe', subok=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


df-------- (10660, 48)
df_test--- 420
df_train-- 10240
df_model_2 4075
(3164, 45) (792, 45)
