In [None]:
pip install feature_engine --quiet

In [None]:
# Import Lib
import random
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import tensorflow as tf 
import os 


SEED = 22022022

def random_seed(SEED):
    """Seed every random number generator this notebook imports.

    Seeds Python's `random`, NumPy, PyTorch (CPU and all CUDA devices) and
    TensorFlow, and switches cuDNN to deterministic mode.

    Parameters
    ----------
    SEED : int
        Seed value applied to every library.
    """
    # PYTHONHASHSEED is read when an interpreter starts, so this only affects
    # child processes spawned later, not the hashing of the running kernel.
    os.environ['PYTHONHASHSEED'] = str(SEED)
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run; disable
    # it so the deterministic flag above is not undermined.
    torch.backends.cudnn.benchmark = False
    tf.random.set_seed(SEED)

random_seed(SEED)

In [None]:
# Load every competition file from the shared input directory.
input_dir = '../input/predict-accident-risk-score-for-unique-postcode'

Train = pd.read_csv(f'{input_dir}/train.csv')
Population = pd.read_csv(f'{input_dir}/population.csv')
Road_Network = pd.read_csv(f'{input_dir}/roads_network.csv')
Test = pd.read_csv(f'{input_dir}/test.csv')
SampleSubmission = pd.read_csv(f'{input_dir}/sample_submission.csv')

# Exploring dataset properties:

In [None]:
# Column names, dtypes and non-null counts of the raw training frame.
Train.info()

In [None]:
# List the columns that contain at least one missing value, Train first, then Test.
for frame in (Train, Test):
    print(frame.columns[frame.isnull().any()])

In [None]:
# Per-column count of missing values, Train first, then Test.
for frame in (Train, Test):
    print(frame.isnull().sum())

# Shuffle the DataFrame

In [None]:
from sklearn.utils import shuffle

# Shuffle the rows reproducibly and renumber them 0..n-1.
Train = shuffle(Train, random_state=SEED).reset_index(drop=True)

In [None]:
# Peek at the first two rows after shuffling.
Train.head(2)

In [None]:
#Missing Value Imputation
# Missing values in these two columns become their own 'Missing' category.
# The result is assigned back to the frame: calling fillna(..., inplace=True)
# on a column selection is chained in-place mutation, which raises a
# FutureWarning in pandas 2.x and leaves the frame unchanged under
# Copy-on-Write.
for frame in (Train, Test):
    for col in ('Road_Surface_Conditions', 'Special_Conditions_at_Site'):
        frame[col] = frame[col].fillna('Missing')

# Both lists should now be empty.
print(Train.columns[Train.isnull().any()])
print(Test.columns[Test.isnull().any()])

# Target Distribution

In [None]:
# Histogram of the target column.
sns.histplot(data=Train, x='Number_of_Casualties')

In [None]:
# The target is a count, so it is expected to follow a Poisson distribution.
# After comparing baseline models trained with the MSE and Poisson objectives,
# Poisson regression was chosen based on the baseline model score.

# Feature Transformation & Engineering

In [None]:
# postcode_1 is the postal area code: the leading run of non-digit characters,
# i.e. everything before the first digit from the left of the postcode.
# A single anchored regex extraction replaces the earlier
# replace-digits-with-'@' / split-on-'@' / drop-the-remainder sequence. That
# sequence depended on str.replace defaulting to regex=True and on passing
# `n` to str.split positionally, neither of which holds from pandas 2.0 on.
Train['postcode_1'] = Train['postcode'].str.extract(r'^(\D*)', expand=False)
Test['postcode_1'] = Test['postcode'].str.extract(r'^(\D*)', expand=False)

In [None]:
# Number of distinct values (a missing value counts as one) in the full
# postcode and in the derived postal area code.
for column in ('postcode', 'postcode_1'):
    print(Train[column].nunique(dropna=False))

In [None]:
# postcode_1 values that occur in Test but never in Train.
train_cols, test_cols = (
    pd.Index(frame['postcode_1'].unique()) for frame in (Train, Test)
)
print(test_cols.difference(train_cols))

In [None]:
# Parse Date and derive calendar features, identically for Train and Test.
# NOTE(review): no explicit format/dayfirst is passed to to_datetime; confirm
# the day/month order of the raw strings is parsed as intended.
for frame in (Train, Test):
    frame['Date'] = pd.to_datetime(frame['Date'])
    date_parts = frame['Date'].dt
    frame['qtr'] = date_parts.quarter
    frame['month'] = date_parts.month
    frame['dayofweek'] = date_parts.dayofweek

In [None]:
# Turn the Time string into a float by swapping ':' for '.'.
# The part after the point is whatever followed the ':' in the raw string,
# copied over unchanged.
for frame in (Train, Test):
    dotted = frame['Time'].str.replace(':', '.')
    frame['Time'] = dotted.astype('float')

In [None]:
# Names of every object-dtype column currently in Train.
cat_col = Train.select_dtypes(include=['object']).columns

In [None]:
# For each categorical column, print the values present in Test but absent from Train.
for col in cat_col:
    print(col)
    train_cols, test_cols = (
        pd.Index(frame[col].unique()) for frame in (Train, Test)
    )
    print(test_cols.difference(train_cols))

In [None]:
# Rare-label encoding of the postcode column: infrequent postcodes are
# replaced by the single label 'Rare'.

from feature_engine.encoding import RareLabelEncoder

encoder = RareLabelEncoder(
    variables='postcode',
    replace_with='Rare',
    tol=3e-06,
    n_categories=2,
)

# The label frequencies are learned from Train only; Test reuses that mapping.
Train = encoder.fit_transform(Train)
Test = encoder.transform(Test)

In [None]:
# postcode values that remain in Test but not in Train after rare-label encoding.
train_cols, test_cols = (
    pd.Index(frame['postcode'].unique()) for frame in (Train, Test)
)
print(test_cols.difference(train_cols))

In [None]:
# Count Aggregate function
def agg_function(featuer,agg_featuer,Train,Test,aggs=('count',)):
    """Add group-level statistics computed on Train to both Train and Test.

    Train is grouped by `featuer`, `agg_featuer` is aggregated with every
    statistic in `aggs`, and the result is left-merged onto both frames.
    New columns are named ``<featuer><agg_featuer>_<stat>``.

    Parameters
    ----------
    featuer : str
        Column to group by; must exist in both frames.
    agg_featuer : str
        Train column to aggregate.
    Train, Test : pandas.DataFrame
        Input frames; they are not modified.
    aggs : str or sequence of str, default ('count',)
        Statistic name(s) passed to ``DataFrame.agg``. The default keeps the
        behaviour this cell had before the parameter existed.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        New Train and Test frames. Rows whose key has no group in Train get
        NaN in the added columns.
    """
    # Accept a bare string so that '_'.join below never splits it into characters.
    stats = [aggs] if isinstance(aggs, str) else list(aggs)
    agg_featuers = Train.groupby([featuer]).agg({agg_featuer: stats})
    # Flatten the (column, stat) pairs into '<featuer><column>_<stat>'.
    agg_featuers.columns = [featuer + '_'.join(c).strip('_') for c in agg_featuers.columns]
    Train = Train.merge(agg_featuers, on = [featuer], how='left')
    Test = Test.merge(agg_featuers, on = [featuer], how='left')
    return (Train,Test)

# Add, for every object-dtype column, the aggregate of 'country' per category
# (named '<column>country_count' by the count version of agg_function above).
# The column list is taken once, before Train is reassigned inside the loop.
object_columns = Train.select_dtypes(include='object').columns
for col in object_columns:
    Train, Test = agg_function(col, 'country', Train, Test)

In [None]:
# Mean Aggregate function

def agg_function(featuer,agg_featuer,Train,Test,aggs=('mean',)):
    """Add group-level statistics computed on Train to both Train and Test.

    Train is grouped by `featuer`, `agg_featuer` is aggregated with every
    statistic in `aggs`, and the result is left-merged onto both frames.
    New columns are named ``<featuer><agg_featuer>_<stat>``.

    Parameters
    ----------
    featuer : str
        Column to group by; must exist in both frames.
    agg_featuer : str
        Train column to aggregate.
    Train, Test : pandas.DataFrame
        Input frames; they are not modified.
    aggs : str or sequence of str, default ('mean',)
        Statistic name(s) passed to ``DataFrame.agg``. The default keeps the
        behaviour this cell had before the parameter existed.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        New Train and Test frames. Rows whose key has no group in Train get
        NaN in the added columns.
    """
    # NOTE(review): this redefinition replaces any earlier agg_function in the
    # kernel. When `agg_featuer` is the target, every Train row's feature is
    # computed from labels that include its own.
    # Accept a bare string so that '_'.join below never splits it into characters.
    stats = [aggs] if isinstance(aggs, str) else list(aggs)
    agg_featuers = Train.groupby([featuer]).agg({agg_featuer: stats})
    # Flatten the (column, stat) pairs into '<featuer><column>_<stat>'.
    agg_featuers.columns = [featuer + '_'.join(c).strip('_') for c in agg_featuers.columns]
    Train = Train.merge(agg_featuers, on = [featuer], how='left')
    Test = Test.merge(agg_featuers, on = [featuer], how='left')
    return (Train,Test)

# (group column, aggregated column) pairs, processed in this order with the
# mean version of agg_function defined above.
mean_agg_pairs = [
    ('postcode_1', 'Number_of_Casualties'),
    ('postcode_1', 'Local_Authority_(District)'),
    ('Local_Authority_(Highway)', 'Local_Authority_(District)'),
    ('postcode', 'Local_Authority_(District)'),
]
for group_col, value_col in mean_agg_pairs:
    Train, Test = agg_function(group_col, value_col, Train, Test)

In [None]:
# Columns that contain missing values after the aggregate merges.
for frame in (Train, Test):
    print(frame.columns[frame.isnull().any()])

In [None]:
# Round Time to a whole number and turn it into a string category.
# Missing values have to be labelled as part of the string conversion:
# astype('str') turns NaN into the literal 'nan', so a fillna('Missing')
# issued afterwards never finds anything to fill.
for frame in (Train, Test):
    rounded = frame['Time'].round(0)
    frame['Time'] = rounded.astype('str').where(rounded.notna(), 'Missing')

In [None]:
# Count & Mean Aggregate function

def agg_function(featuer,agg_featuer,Train,Test,aggs=('count','mean')):
    """Add group-level statistics computed on Train to both Train and Test.

    Train is grouped by `featuer`, `agg_featuer` is aggregated with every
    statistic in `aggs`, and the result is left-merged onto both frames.
    New columns are named ``<featuer><agg_featuer>_<stat>``.

    Parameters
    ----------
    featuer : str
        Column to group by; must exist in both frames.
    agg_featuer : str
        Train column to aggregate.
    Train, Test : pandas.DataFrame
        Input frames; they are not modified.
    aggs : str or sequence of str, default ('count', 'mean')
        Statistic name(s) passed to ``DataFrame.agg``. The default keeps the
        behaviour this cell had before the parameter existed.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        New Train and Test frames. Rows whose key has no group in Train get
        NaN in the added columns.
    """
    # NOTE(review): this redefinition replaces any earlier agg_function in the
    # kernel. When `agg_featuer` is the target, every Train row's feature is
    # computed from labels that include its own.
    # Accept a bare string so that '_'.join below never splits it into characters.
    stats = [aggs] if isinstance(aggs, str) else list(aggs)
    agg_featuers = Train.groupby([featuer]).agg({agg_featuer: stats})
    # Flatten the (column, stat) pairs into '<featuer><column>_<stat>'.
    agg_featuers.columns = [featuer + '_'.join(c).strip('_') for c in agg_featuers.columns]
    Train = Train.merge(agg_featuers, on = [featuer], how='left')
    Test = Test.merge(agg_featuers, on = [featuer], how='left')
    return (Train,Test)

In [None]:
# (group column, aggregated column) pairs, processed in this order with the
# count & mean version of agg_function.
count_mean_pairs = [
    ('Time', 'Number_of_Casualties'),
    ('Day_of_Week', 'Number_of_Casualties'),
    ('month', 'Number_of_Casualties'),
    ('qtr', 'Number_of_Casualties'),
    ('1st_Road_Class', 'Number_of_Casualties'),
    ('Number_of_Vehicles', 'Speed_limit'),
]
for group_col, value_col in count_mean_pairs:
    Train, Test = agg_function(group_col, value_col, Train, Test)

In [None]:
# Columns that contain missing values after the count & mean merges.
for frame in (Train, Test):
    print(frame.columns[frame.isnull().any()])

In [None]:
# Column names, dtypes and non-null counts after feature engineering.
Train.info()

In [None]:
# The column list is empty, so nothing is removed from either frame.
for frame in (Train, Test):
    frame.drop(columns=[], inplace=True)

In [None]:
# Keep the target as its own Series before it is dropped from Train.
Y = Train.loc[:, 'Number_of_Casualties']

In [None]:
# Drop identifiers, raw/categorical columns and the target; the same list is
# removed from both frames so their columns stay identical.
drop_cols = [
    'Accident_ID', 'Date', 'country', '2nd_Road_Number', 'postcode_1',
    'Time', 'Number_of_Casualties', 'postcode', 'Local_Authority_(Highway)',
    'Road_Type', 'Pedestrian_Crossing-Human_Control',
    'Pedestrian_Crossing-Physical_Facilities', 'Light_Conditions',
    'Weather_Conditions', 'Road_Surface_Conditions',
    'Special_Conditions_at_Site', 'Carriageway_Hazards',
    'Did_Police_Officer_Attend_Scene_of_Accident', 'state',
]

for frame in (Train, Test):
    frame.drop(columns=drop_cols, inplace=True)

In [None]:
# Correlation heatmap to check the correlation among the input & target features, and histograms to compare the distribution of each feature in Train & Test
# and identify features whose Train & Test distributions differ.

In [None]:
# Plot the correlation heatmap.
# The target column was dropped from Train in an earlier cell, so it is
# re-attached from Y (aligned on the index) for this plot only. Without it the
# heatmap would show feature-to-feature correlations but none with the target.
plt.figure(figsize=(30,20))
sns.heatmap(Train.assign(Number_of_Casualties=Y).corr(),annot=True)

In [None]:
# One figure per feature, overlaying the Train and Test histograms so that
# distribution shifts between the two sets are visible.
for col in Train.columns:
    fig, ax1 = plt.subplots(1, 1, figsize=(20, 10))
    # Missing values are dropped: hist cannot derive a bin range from NaN.
    ax1.hist(Train[col].dropna(), bins=100, alpha=0.5, label='Train')
    ax1.hist(Test[col].dropna(), bins=100, alpha=0.5, label='Test')
    ax1.set(xlabel=col, ylabel="Count")
    # The legend tells the two overlaid series apart.
    ax1.legend()
    plt.show()
    # Release the figure once rendered so the loop does not pile up open figures.
    plt.close(fig)

In [None]:
# Final check for columns that still contain missing values.
for frame in (Train, Test):
    print(frame.columns[frame.isnull().any()])

# Mean imputation is left disabled:
#Train.fillna(Train.mean(),inplace=True)
#Test.fillna(Test.mean(),inplace=True)

In [None]:
# Peek at one row of the final feature frame.
Train.head(1)

In [None]:
# Standardise every feature. The scaler is fitted on Train only, and Test is
# transformed with Train's statistics.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(Train)
Train_scale, Test_scale = (
    pd.DataFrame(scaler.transform(frame), columns=frame.columns)
    for frame in (Train, Test)
)

In [None]:
#Baseline Model Score Check

In [None]:
from sklearn.model_selection import cross_val_score,cross_val_predict,cross_validate

def cross_valid(model,X,Y,cv=5):
    """Cross-validate `model` on (X, Y), scored by negative mean Poisson deviance.

    Returns the result of scikit-learn's ``cross_validate`` unchanged; train
    scores are requested in addition to the test scores.
    """
    return cross_validate(
        estimator=model,
        X=X,
        y=Y,
        cv=cv,
        scoring="neg_mean_poisson_deviance",
        return_train_score=True,
    )

In [None]:
from xgboost import XGBRegressor,XGBRFRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor,StackingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import PoissonRegressor,TweedieRegressor,GammaRegressor


# Baseline candidates, all trained with a Poisson objective.
Reg_Models = [
    # Disabled candidates, kept for reference:
    #   PoissonRegressor()
    #   XGBRegressor(objective='count:poisson', use_label_encoder=False,
    #                random_state=SEED, n_jobs=-1)
    CatBoostRegressor(objective='Poisson', silent=True, random_state=SEED),
    LGBMRegressor(objective='poisson', random_state=SEED, n_jobs=-1),
]

for model in Reg_Models:
    error = cross_valid(model, Train_scale, Y, cv=5)
    train_mean = error['train_score'].mean()
    test_mean = error['test_score'].mean()
    print(error['test_score'])
    print(model, train_mean, test_mean)
    # Gap between the mean train and mean test score.
    print(train_mean - test_mean)

# Feature Selection

In [None]:
from sklearn.model_selection import StratifiedKFold,KFold,train_test_split
from catboost import CatBoostRegressor, Pool, EShapCalcType, EFeaturesSelectionAlgorithm

# Hold out part of Train as the evaluation set for feature selection.
X_train, X_test ,y_train, y_test = train_test_split(Train_scale,Y,random_state=SEED)
feature_names = list(Train.columns)
train_pool = Pool(X_train, y_train, feature_names=feature_names)
test_pool = Pool(X_test, y_test, feature_names=feature_names)

##########################
# Recursive feature elimination driven by SHAP values, keeping 27 features.
# The candidate range is derived from the number of features rather than being
# written out as a fixed index span, so it stays correct when features are
# added or removed upstream.
model = CatBoostRegressor(objective='Poisson',silent=True,random_state=SEED)
summary = model.select_features(
    train_pool,
    eval_set=test_pool,
    features_for_select=f'0-{len(feature_names) - 1}',
    num_features_to_select=27,
    steps=1,
    algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
    shap_calc_type=EShapCalcType.Exact,
    train_final_model=True,
    logging_level='Silent',
    plot=True
)

In [None]:
# Show the result returned by select_features.
summary

**Hyperparameter tuning did not result in any score improvement, hence used the baseline parameters instead.**

**Tried GroupKFold (grouped on postal code), StratifiedKFold (stratified on the target, Number_of_Casualties) and KFold as cross-validation techniques. GroupKFold gave a higher score, but that did not translate into a better result on the leaderboard, so KFold was used in the end.**

In [None]:
# Selected feature list: the 27 columns used to train the final model.
# Raw columns keep their original names. Engineered columns follow the
# '<group column><aggregated column>_<stat>' pattern produced by agg_function,
# e.g. 'postcodecountry_count' is the count of 'country' per 'postcode'.
# NOTE(review): presumably copied from the feature-selection summary; confirm
# the two stay in sync if the selection is re-run.
sel = ['Police_Force',
  'Number_of_Vehicles',
  'Local_Authority_(District)',
  '1st_Road_Class',
  '1st_Road_Number',
  'Speed_limit',
  '2nd_Road_Class',
  'Urban_or_Rural_Area',
  'qtr',
  'Local_Authority_(Highway)country_count',
  'Road_Typecountry_count',
  'Pedestrian_Crossing-Human_Controlcountry_count',
  'Light_Conditionscountry_count',
  'Road_Surface_Conditionscountry_count',
  'Special_Conditions_at_Sitecountry_count',
  'Carriageway_Hazardscountry_count',
  'Did_Police_Officer_Attend_Scene_of_Accidentcountry_count',
  'statecountry_count',
  'postcodecountry_count',
  'postcode_1Number_of_Casualties_mean',
  'Local_Authority_(Highway)Local_Authority_(District)_mean',
  'TimeNumber_of_Casualties_mean',
  'Day_of_WeekNumber_of_Casualties_mean',
  'monthNumber_of_Casualties_mean',
  '1st_Road_ClassNumber_of_Casualties_mean',
  'Number_of_VehiclesSpeed_limit_count',
  'Number_of_VehiclesSpeed_limit_mean']

In [None]:
#Final Model Traning 
from sklearn.model_selection import cross_validate

# Final model: CatBoost with a Poisson objective, trained on the selected
# features with 3-fold cross-validation. The fitted estimator of every fold is
# kept and scores are reported as negative MSE.
clf = CatBoostRegressor(objective='Poisson', silent=True, random_state=SEED)

output = cross_validate(
    estimator=clf,
    X=Train_scale[sel],
    y=Y,
    cv=3,
    scoring='neg_mean_squared_error',
    return_train_score=True,
    return_estimator=True,
)


In [None]:
# Mean score across folds: train first, then test.
for split in ('train_score', 'test_score'):
    print(output[split].mean())

In [None]:
# One importance vector per cross-validation estimator.
feature_importances_avg = [
    estimator.feature_importances_ for estimator in output['estimator']
]

# Average the vectors across folds once and rank features from most to least
# important. The earlier duplicate np.mean assignment, which was overwritten
# straight away, and the unused enumerate index are gone.
feature_importances = pd.DataFrame(
    np.mean(feature_importances_avg, axis=0),
    index=Train_scale[sel].columns,
    columns=['importance'],
).sort_values('importance', ascending=False)

feature_importances.head(50)

In [None]:
#Test Set Predection 

# Predict the test set with every fold's estimator and average the predictions.
test_features = Test_scale[sel]

sub_file = []
for idx, estimator in enumerate(output['estimator']):
    print(idx)
    sub_file.append(estimator.predict(test_features))

final = np.mean(sub_file, axis=0)

In [None]:
#Prepare Final Submission file :
# The raw test file is read again because Test['postcode'] was rare-label
# encoded earlier; the original postcodes are needed to group the predictions.
Test_sub = pd.read_csv('../input/predict-accident-risk-score-for-unique-postcode/test.csv')
Test_sub['Accident_risk_index'] = final

# Risk index per postcode = mean of the row-level predictions for that postcode.
Target_mapper = Test_sub.groupby(['postcode'])['Accident_risk_index'].mean()
SampleSubmission['Accident_risk_index'] = SampleSubmission['postcode'].map(Target_mapper)

# A postcode absent from the test file maps to NaN; report it instead of
# silently writing gaps into the submission.
unmapped = int(SampleSubmission['Accident_risk_index'].isna().sum())
if unmapped:
    print(f'WARNING: {unmapped} submission postcodes have no prediction (NaN).')

# Sanity check, printed explicitly: a bare expression in the middle of a cell
# displays nothing.
print(SampleSubmission['Accident_risk_index'].std())
print(SampleSubmission.tail())

SampleSubmission.to_csv('26M.csv',index=False)

In [None]:
# Thank you!