In [None]:
#import dependencies
import numpy as np 
import pandas as pd
import os
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt

!pip install fancyimpute
from fancyimpute import IterativeImputer

import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns',None)
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

In [None]:
# Load the raw weather CSV from the Kaggle input directory.
weatherdata = pd.read_csv('/kaggle/input/formulaaihackathon2022/weather.csv')

# Work on a copy so `weatherdata` keeps the untouched rows for the inspection cells further down.
df = weatherdata.copy()

# EDA and Preprocessing

In [None]:
# #time stamp to time
# df['TIMESTAMP'] = df['TIMESTAMP'].apply(lambda x:datetime.fromtimestamp(x))

In [None]:
# Columns removed up front (selected by the author on variance / outlier grounds).
irrelavant_features = [
    'M_PACKET_FORMAT', 'M_GAME_MAJOR_VERSION', 'M_GAME_MINOR_VERSION',
    'M_PACKET_VERSION', 'M_PACKET_ID', 'M_FRAME_IDENTIFIER',
    'M_SECONDARY_PLAYER_CAR_INDEX', 'M_BRAKING_ASSIST',
    'M_SESSION_LINK_IDENTIFIER', 'M_PIT_RELEASE_ASSIST', 'M_ZONE_START',
    'M_ZONE_FLAG', 'M_GAME_PAUSED', 'M_SLI_PRO_NATIVE_SUPPORT',
    'M_SAFETY_CAR_STATUS', 'M_ERSASSIST', 'M_FORMULA', 'M_PIT_ASSIST',
    'M_GEARBOX_ASSIST', 'M_SPECTATOR_CAR_INDEX', 'M_DYNAMIC_RACING_LINE_TYPE',
    'M_PIT_SPEED_LIMIT', 'M_NETWORK_GAME', 'M_STEERING_ASSIST',
    'M_IS_SPECTATING', 'M_DYNAMIC_RACING_LINE', 'M_DRSASSIST',
    'M_NUM_MARSHAL_ZONES', 'GAMEHOST', 'Unnamed: 58',
]

# Rebind `df` to the trimmed frame (same result as an in-place drop).
df = df.drop(columns=irrelavant_features)

# Split the remaining columns by cardinality: more than 50 distinct values is
# treated as continuous, everything else as discrete.
unique_counts = df.nunique()
cont_feats = [name for name in df.columns if unique_counts[name] > 50]
cat_feats = [name for name in df.columns if unique_counts[name] <= 50]

In [None]:
def missing_values_table(df):
    """Summarise zero and missing values for each column of ``df``.

    Prints a two-line overview (frame shape and the number of columns that
    contain missing values) and returns a table restricted to those columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that has at least one missing value,
        sorted by ``Missing_Values%`` in descending order and rounded to one
        decimal place. Columns: ``Zero_Values``, ``Missing_Values``,
        ``Missing_Values%``, ``Sum_Zero_Plus_Missing_Values``,
        ``%_Sum_Zero_Plus_Missing_Values`` and ``Data Type``.
    """
    n_rows = len(df)

    # Per-column counts of exact zeros and of nulls, plus the null percentage.
    zero_counts = df.eq(0.00).astype(int).sum(axis=0)
    null_counts = df.isnull().sum()
    null_share = 100 * null_counts / n_rows

    summary = pd.concat(
        [zero_counts, null_counts, null_share],
        axis=1,
        keys=['Zero_Values', 'Missing_Values', 'Missing_Values%'],
    )
    combined = summary['Zero_Values'] + summary['Missing_Values']
    summary['Sum_Zero_Plus_Missing_Values'] = combined
    summary['%_Sum_Zero_Plus_Missing_Values'] = 100 * combined / n_rows
    summary['Data Type'] = df.dtypes

    # Keep only columns that actually have missing values, worst first.
    has_missing = summary['Missing_Values'] != 0
    summary = (
        summary[has_missing]
        .sort_values('Missing_Values%', ascending=False)
        .round(1)
    )

    print(f"Your selected dataframe has {df.shape[1]} columns and {n_rows} Rows.\n"
          f"There are {summary.shape[0]} columns that have missing values.")
    return summary

# Missing-value summary of the trimmed frame; its index holds the names of the affected columns.
missing_values_df = missing_values_table(df)

In [None]:
# df1 = df.copy()
# corr = df.corr()
# plt.figure(figsize=(15,15))
# sns.heatmap(corr,annot=True)

# One distribution panel per column that has missing values.
# The grid is sized from the number of such columns (three per row, at least
# three rows) so the loop can never index past the available axes. With nine
# or fewer columns the layout and figure size match the former fixed 3x3 grid.
missing_cols = list(missing_values_df.index.values)
n_grid_cols = 3
n_grid_rows = max(3, -(-len(missing_cols) // n_grid_cols))  # ceiling division

f, ax = plt.subplots(n_grid_rows, n_grid_cols, figsize=(40, 20 * n_grid_rows / 3))
axx = ax.flatten()

# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot(..., kde=True, stat="density") once the seaborn version is confirmed.
for index, col in enumerate(missing_cols):
    sns.distplot(df[col], ax=axx[index])

In [None]:
# for feat in missing_values_df.index.values:
#     plt.figure(figsize=(7,4))
#     sns.distplot(df[feat])
#     plt.show()

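Since 27.3% of the values are missing in 8 features, Multiple Imputation by Chained Equations (MICE) is a suitable way to impute them. Note that the cell below currently forward-fills the gaps instead; the MICE code is kept commented out.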

In [None]:
# Fill null values by carrying the last observed value forward, in the current
# row order (the alternative considered was an aggregated mean by date and time).
# `DataFrame.ffill()` gives the same result as `fillna(method='ffill')`, whose
# `method=` argument is deprecated in recent pandas releases.
# NOTE(review): rows that precede a column's first valid value stay NaN after a
# forward fill — confirm that is acceptable downstream.
df = df.ffill()

# mice_imputer = IterativeImputer()
# #imputing the missing values with mice imputer
# imputedf = mice_imputer.fit_transform(df)
# #create dataframe of imputedf
# imputedf = pd.DataFrame(imputedf,columns = df.columns)

#make a copy of imputedf
# df = imputedf.copy()

In [None]:
# Snapshot of `df` taken before the timestamp-derived columns are added in the next cell.
df1 = df.copy()

In [None]:
# Convert the numeric TIMESTAMP column to datetimes. The dtype guard makes the
# cell safe to re-run: values that are already datetimes are not converted again.
# NOTE(review): datetime.fromtimestamp uses the machine's local timezone —
# confirm the notebook is always executed in the same timezone.
if not pd.api.types.is_datetime64_any_dtype(df['TIMESTAMP']):
    df['TIMESTAMP'] = df['TIMESTAMP'].apply(datetime.fromtimestamp)

# Calendar parts used as grouping keys below.
df['date'] = df['TIMESTAMP'].dt.date
df['hour'] = df['TIMESTAMP'].dt.hour
df['min'] = df['TIMESTAMP'].dt.minute
df['sec'] = df['TIMESTAMP'].dt.second

# Order rows chronologically.
df = df.sort_values(by=['TIMESTAMP'])

# Collapse to one row per session and second by averaging the numeric columns.
# numeric_only=True is explicit so the datetime TIMESTAMP column is excluded
# regardless of the pandas version's default.
df_agg = (
    df.groupby(['M_SESSION_UID', 'date', 'hour', 'min', 'sec'])
    .mean(numeric_only=True)
    .reset_index()
)

In [None]:
# M_WEATHER in df_agg is a per-group mean. A value of exactly 0.5 is blanked
# out and then replaced by the previous row's value.
# (Presumably 0.5 arises when the averaged rows disagreed — verify. Other
# fractional means, if any, are left untouched.)
# A single .loc assignment replaces the chained indexing, which pandas does not
# guarantee to write through to df_agg.
weather_is_half = df_agg['M_WEATHER'] == 0.5
df_agg.loc[weather_is_half, 'M_WEATHER'] = np.nan

# Same result as fillna(method='ffill'), without the deprecated argument.
df_agg = df_agg.ffill()

In [None]:
# df[df['M_WEATHER_FORECAST_SAMPLES_M_SESSION_TYPE'].isnull()].tail(50)
# df[df['M_WEATHER_FORECAST_SAMPLES_M_SESSION_TYPE'].notnull()].tail(50)

### Random Forest

In [None]:
# Model M_WEATHER from every other aggregated column.
target = 'M_WEATHER'

# 'date' is dropped from the aggregated frame before building the feature matrix.
df_agg = df_agg.drop(columns=['date'])

y = df_agg[target]
X = df_agg.drop(columns=target)

# shuffle=False keeps the existing row order when splitting.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=False, random_state=42
)

# Random forest with 100 trees and a fixed seed; the fitted estimator is shown
# as the cell output.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
rf

In [None]:
#predict for train and validation data
# Class predictions of the fitted forest for the train and test partitions.
pred_train, pred_test = (rf.predict(part) for part in (X_train, X_test))

In [None]:
def metrics_print(actual, predicted, data_set):
    """Print accuracy and weighted recall, precision and F1 for one data split.

    Parameters
    ----------
    actual : array-like
        True labels.
    predicted : array-like
        Predicted labels, aligned with ``actual``.
    data_set : str
        Heading printed before the scores (e.g. ``'train'``).
    """
    print(f'{data_set}')
    print('ACCURACY', accuracy_score(actual, predicted))

    # The remaining scores all use weighted averaging across classes.
    weighted_scorers = (
        ('RECALL', recall_score),
        ('PRECISION', precision_score),
        ('F1-SCORE', f1_score),
    )
    for label, scorer in weighted_scorers:
        print(label, scorer(actual, predicted, average='weighted'))

In [None]:
# Report the same set of scores for the train split, then the test split.
for split_name, truth, guess in (
    ('train', y_train, pred_train),
    ('test', y_test, pred_test),
):
    metrics_print(truth, guess, split_name)

In [None]:
# Confusion matrix on the training partition (true labels first, predictions second).
confusion_matrix(y_train,pred_train)

In [None]:
# Confusion matrix on the held-out test partition (true labels first, predictions second).
confusion_matrix(y_test,pred_test)

In [None]:
# cat_cols = [col for col in list(train.select_dtypes('object').columns) if col not in [target]]
# num_cols = [col for col in list(train.select_dtypes('float').columns)]

In [None]:
# f, ax = plt.subplots(2, 5, figsize = (40 , 15))
# axx = ax.flatten()

# for index, col in enumerate(cat_cols):
#     train[ col].value_counts().plot.pie(autopct = '%1.1f%%', ax = axx[index])

## Feature Engineering

In [None]:
# Mean track and air temperature per session id and hour, merged back onto
# df_agg as two extra columns.
feats_gr  = ['M_SESSION_UID', 'hour']
feats_agg = ['M_TRACK_TEMPERATURE', 'M_AIR_TEMPERATURE']

# The string alias 'mean' gives the same values and the same 'mean' column
# label as passing np.mean, without pandas' deprecation warning for callables.
dftemp = df_agg.groupby(feats_gr)[feats_agg].agg(['mean'])

# Flatten the (feature, 'mean') column pairs into single names such as
# 'M_TRACK_TEMPERATURE_SessionID_Hour_mean'.
cols = dftemp.columns
new_cols = ['_SessionID_Hour_'.join(c) for c in cols]
dftemp.columns = new_cols
dftemp = dftemp.reset_index()

# Join the per-session/hour means onto df_agg. Columns left by a previous run
# of this cell are dropped first so a re-run does not create _x/_y duplicates.
df_agg = (
    df_agg.drop(columns=new_cols, errors='ignore')
    .merge(dftemp, on=feats_gr, how='left')
)




# for feat in df_agg.columns:
#     print(feat , df_agg[feat].nunique())
# #     plt.figure(figsize=(10,4))
# #     sns.distplot(df_agg[feat])
# #     plt.show()
# #     print(df_agg[feat].value_counts(),'/n')

# Features earmarked for removal on variance grounds.
# NOTE(review): this list is only defined here; no visible cell applies it to a frame.
dropfeats = ['M_PIT_STOP_WINDOW_IDEAL_LAP','M_FORECAST_ACCURACY']

In [None]:
# Session-related column names.
sess_feats = ['M_SESSION_UID','M_SESSION_TYPE','M_SESSION_TIME','M_SESSION_TIME_LEFT','M_SESSION_DURATION']

In [None]:
# Side-by-side view of the two time columns in the raw data.
weatherdata[['M_TIME_OFFSET','M_SESSION_TIME' ]]

In [None]:
# Minimum and maximum session time / time left per session.
# The columns are selected with a list (tuple-style selection on a groupby is
# no longer accepted by pandas 2), and both aggregations are passed as one list
# so 'max' is actually computed rather than handed to 'min' as an argument.
df_agg.groupby(['M_SESSION_UID'])[['M_SESSION_TIME', 'M_SESSION_TIME_LEFT']].agg(['min', 'max'])

In [None]:
# weatherdata.sort_values(by=['M_SESSION_UID','M_SESSION_TIME']).groupby(['M_SESSION_UID','M_SESSION_TIME']).mean().head()

In [None]:
# df_agg.head(50)


In [None]:
# All rows of `df` that belong to the session of the first aggregated row.
# .iloc[0] picks that row by position, so the lookup does not depend on the
# index of df_agg containing the label 0.
df[df['M_SESSION_UID'] == df_agg['M_SESSION_UID'].iloc[0]]

In [None]:
# Raw rows ordered by session id, then by session time within each session.
temp1 = weatherdata.sort_values(['M_SESSION_UID','M_SESSION_TIME'])

In [None]:
# Among the rows whose M_WEATHER is non-zero, read the session id stored at row
# label 3089774, then keep every raw row of that session.
weather_nonzero = weatherdata['M_WEATHER'] != 0
picked_session_uid = weatherdata['M_SESSION_UID'][weather_nonzero][3089774]
temp = weatherdata[weatherdata['M_SESSION_UID'] == picked_session_uid]

In [None]:
# Session id stored at row label 3089774 among the rows with non-zero M_WEATHER.
weatherdata['M_SESSION_UID'][(weatherdata['M_WEATHER']!=0)][3089774]

In [None]:
# temp.sort_values(['M_SESSION_UID','M_SESSION_TIME']).head(50)

In [None]:
# Average the numeric columns per (session, session time, time offset, weather).
# numeric_only=True is required on pandas >= 2: `df` carries non-numeric
# columns (such as the 'date' objects and the datetime 'TIMESTAMP') that a
# plain mean() would otherwise try to aggregate.
temp_agg = (
    df.groupby(['M_SESSION_UID', 'M_SESSION_TIME', 'M_TIME_OFFSET', 'M_WEATHER'])
    .mean(numeric_only=True)
    .reset_index()
)

In [None]:
# Frequency of each M_FORECAST_ACCURACY value in the raw data.
weatherdata['M_FORECAST_ACCURACY'].value_counts()

In [None]:
# Relative frequency of each (averaged) M_FORECAST_ACCURACY value in temp_agg.
temp_agg['M_FORECAST_ACCURACY'].value_counts(normalize=True)

In [None]:
# M_WEATHER frequencies restricted to rows whose M_FORECAST_ACCURACY equals 1.
accuracy_is_one = temp_agg['M_FORECAST_ACCURACY'] == 1
temp_agg.loc[accuracy_is_one, 'M_WEATHER'].value_counts()

In [None]:
# Frequency of each M_WEATHER value across all rows of temp_agg.
temp_agg['M_WEATHER'].value_counts()

In [None]:
# Preview the first 50 aggregated rows.
temp_agg.head(50)

In [None]:
# Column names of the raw data, for reference.
weatherdata.columns

In [None]:
# weatherdata.groupby(['M_SESSION_UID','M_TIME_OFFSET']).count().head(50)

In [None]:
# weatherdata.groupby(['M_SESSION_UID','TIMESTAMP']).median().reset_index()['M_TIME_OFFSET'].value_counts()

In [None]:
# Same 50-row preview of temp_agg as shown a few cells earlier.
temp_agg.head(50)