# Simple Starter Notebook for;
## UmojaHack Africa 2021 #2: Sendy - Delivery Rider Response Challenge by UmojaHack Africa

Can you predict who is the best delivery rider for an order placed via logistics company Sendy?

![Umoja Hack](https://zindpublic.blob.core.windows.net/public/uploads/competition/image/152/thumb_c5ec4e2a-e000-4176-a93c-dd1143c2b60f.png)

The objective of this challenge is to create a machine learning model that will predict whether a rider will accept, decline or ignore an order sent to them.

In [298]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

In [299]:
# Load files
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
riders = pd.read_csv('Riders.csv')
ss = pd.read_csv('SampleSubmission.csv')

In [300]:
# check data shapes
train.shape, test.shape, riders.shape, ss.shape

((179867, 21), (76791, 20), (2632, 4), (76791, 2))

In [301]:
# Preview train
train.head()

Unnamed: 0,ID,order_id,dispatch_day,dispatch_day_of_week,dispatch_time,client_id,client_type,order_license_status,order_carrier_type,vendor_type,...,rider_license_status,rider_carrier_type,rider_amount,rider_lat,rider_long,pickup_lat,pickup_long,drop_off_lat,drop_off_long,target
0,ID_SCUW21PVAU,4435,27,6,09:02:54,593630,Business,0,2,Bike,...,0,1,1080,-42.698343,-17.228539,-42.692371,-17.248305,-42.687442,-17.424682,1
1,ID_2HA7X30JMN,32711,30,7,13:01:37,837729,Personal,0,1,Bike,...,0,1,730,-42.787317,-17.288252,-42.784046,-17.290121,-42.673267,-17.234595,2
2,ID_IAJWDTBY6M,8712,14,2,10:01:00,695129,Personal,0,2,Bike,...,1,1,490,-42.74918,-17.287848,-42.765204,-17.293784,-42.813953,-17.294805,1
3,ID_LKSVPNYMTR,44869,22,3,14:11:16,1504660,Personal,0,2,Bike,...,1,1,510,-42.836266,-17.31192,-42.831913,-17.315311,-42.812409,-17.265441,2
4,ID_O7N8Y918YH,57590,27,5,16:11:38,36869,Business,0,2,Bike,...,0,0,400,-42.828195,-17.322818,-42.836056,-17.318111,-42.828517,-17.302052,0


In [302]:
# Preview riders
riders.head()

Unnamed: 0,Rider ID,Active Rider Age,Average Partner Rating,Number of Ratings
0,16261,308,21.05,321
1,8832,224,10.0,27
2,53866,238,17.76,25
3,46368,343,24.56,320
4,45609,399,14.97,214


In [303]:
# Merge rider dataset to train and test sets
train = train.merge(riders, how = 'left', left_on='rider_id', right_on='Rider ID')
test = test.merge(riders, how = 'left', left_on='rider_id', right_on='Rider ID')

# Preview merged dataframe
train.head()

Unnamed: 0,ID,order_id,dispatch_day,dispatch_day_of_week,dispatch_time,client_id,client_type,order_license_status,order_carrier_type,vendor_type,...,rider_long,pickup_lat,pickup_long,drop_off_lat,drop_off_long,target,Rider ID,Active Rider Age,Average Partner Rating,Number of Ratings
0,ID_SCUW21PVAU,4435,27,6,09:02:54,593630,Business,0,2,Bike,...,-17.228539,-42.692371,-17.248305,-42.687442,-17.424682,1,30153,11,10.0,1
1,ID_2HA7X30JMN,32711,30,7,13:01:37,837729,Personal,0,1,Bike,...,-17.288252,-42.784046,-17.290121,-42.673267,-17.234595,2,20884,68,24.13,229
2,ID_IAJWDTBY6M,8712,14,2,10:01:00,695129,Personal,0,2,Bike,...,-17.287848,-42.765204,-17.293784,-42.813953,-17.294805,1,33143,273,24.92,123
3,ID_LKSVPNYMTR,44869,22,3,14:11:16,1504660,Personal,0,2,Bike,...,-17.31192,-42.831913,-17.315311,-42.812409,-17.265441,2,96531,168,23.76,175
4,ID_O7N8Y918YH,57590,27,5,16:11:38,36869,Business,0,2,Bike,...,-17.322818,-42.836056,-17.318111,-42.828517,-17.302052,0,103546,95,24.53,42


In [304]:
# One hot encoding
train = pd.get_dummies(train, columns=['client_type', 'vendor_type','order_carrier_type','rider_carrier_type','target'])
test = pd.get_dummies(test, columns=['client_type', 'vendor_type','order_carrier_type','rider_carrier_type'])

train.head()
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 179867 entries, 0 to 179866
Data columns (total 31 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   ID                      179867 non-null  object 
 1   order_id                179867 non-null  int64  
 2   dispatch_day            179867 non-null  int64  
 3   dispatch_day_of_week    179867 non-null  int64  
 4   dispatch_time           179867 non-null  object 
 5   client_id               179867 non-null  int64  
 6   order_license_status    179867 non-null  int64  
 7   rider_id                179867 non-null  int64  
 8   rider_license_status    179867 non-null  int64  
 9   rider_amount            179867 non-null  int64  
 10  rider_lat               179867 non-null  float64
 11  rider_long              179867 non-null  float64
 12  pickup_lat              179867 non-null  float64
 13  pickup_long             179867 non-null  float64
 14  drop_off_lat        

In [305]:
train['rider_to_drop_off_dist'] = (train['rider_lat'] - train['drop_off_lat'])**2 + (train['rider_long'] - train['drop_off_long'])**2
train['rider_to_drop_off_dist'] = train['rider_to_drop_off_dist'].apply(np.sqrt)

train['rider_to_pickup_dist'] = (train['rider_lat'] - train['pickup_lat'])**2 + (train['rider_long'] - train['pickup_long'])**2
train['rider_to_pickup_dist'] = train['rider_to_pickup_dist'].apply(np.sqrt)

test['rider_to_drop_off_dist'] = (test['rider_lat'] - test['drop_off_lat'])**2 + (test['rider_long'] - test['drop_off_long'])**2
test['rider_to_drop_off_dist'] = test['rider_to_drop_off_dist'].apply(np.sqrt)

test['rider_to_pickup_dist'] = (test['rider_lat'] - test['pickup_lat'])**2 + (test['rider_long'] - test['pickup_long'])**2
test['rider_to_pickup_dist'] = test['rider_to_pickup_dist'].apply(np.sqrt)

# Split data
main_cols = train.columns.difference(['ID', 'order_id', 'rider_id', 'Rider ID', 'target_0','target_1','target_2', 'dispatch_time','dispatch_day',	'client_id','drop_off_lat','drop_off_long','pickup_lat','pickup_long','rider_lat','rider_long']).tolist()
target_cols = ['target_0','target_1','target_2']
norm_num_cols = ['Active Rider Age', 'Average Partner Rating', 'Number of Ratings','rider_amount','rider_to_drop_off_dist','rider_to_pickup_dist']

#normalize numerical data
train['dispatch_day_of_week'] = train['dispatch_day_of_week']/7
test['dispatch_day_of_week'] = test['dispatch_day_of_week']/7

for col in norm_num_cols:
    #all_data[col] = all_data[col].fillna(all_data[col].median())
    test[col] = (test[col] - train[col].mean())/train[col].std()
    train[col] = (train[col] - train[col].mean())/train[col].std()
    

X = test[main_cols]
y = train[target_cols]

#X.head(1)
#X.info()
#y.info()
y.head(5)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 3031)
#X_test.info()
# Train a model
#model = LGBMClassifier(random_state=3031)
#model.fit(X_train, y_train)

# Make predictions
#y_pred = model.predict(X_test)  ###0.005972^2 + 0.196143^2

# Check score
#accuracy_score(y_test, y_pred)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35974 entries, 89599 to 52315
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Active Rider Age        35974 non-null  int64  
 1   Average Partner Rating  35974 non-null  float64
 2   Number of Ratings       35974 non-null  int64  
 3   client_type_Business    35974 non-null  uint8  
 4   client_type_Personal    35974 non-null  uint8  
 5   dispatch_day            35974 non-null  int64  
 6   dispatch_day_of_week    35974 non-null  int64  
 7   drop_off_lat            35974 non-null  float64
 8   drop_off_long           35974 non-null  float64
 9   order_carrier_type_0    35974 non-null  uint8  
 10  order_carrier_type_1    35974 non-null  uint8  
 11  order_carrier_type_2    35974 non-null  uint8  
 12  order_license_status    35974 non-null  int64  
 13  pickup_lat              35974 non-null  float64
 14  pickup_long             35974 non-

In [306]:
# Make predictions in test set and prepare submission file
#predictions = model.predict(test[main_cols])
#sub_file = ss.copy()
#sub_file.target = predictions
s#ub_file.to_csv('Baseline.csv', index = False)

NameError: name 's' is not defined