In [None]:
import csv
import os
from datetime import datetime

import matplotlib.pyplot as plt
import ml_metrics as metrics
import numpy as np 
import pandas as pd
import pycountry
from geopy.distance import distance
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm

geolocator = Nominatim(user_agent="ask")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=0.01, max_retries = 10)

In [None]:
# Load the six raw tables in one pass; the order of the paths matches the
# order of the names on the left-hand side.
train, test, biker, biker_net, tours, tour_convoy = map(pd.read_csv, [
    "Data/train.csv",
    "Data/test.csv",
    "Data/bikers.csv",
    "Data/bikers_network.csv",
    "Data/tours.csv",
    "Data/tour_convoy.csv",
])

In [None]:
# Unique biker ids per split, then their union.
train_bikers_set, test_bikers_set = (
    np.array(frame["biker_id"].drop_duplicates()) for frame in (train, test)
)
req_bikers_set = np.union1d(train_bikers_set, test_bikers_set)

# Unique tour ids appearing in either split: an outer merge of the two
# de-duplicated id columns yields an (n, 1) table, flattened to shape (n,).
tours_set = np.array(
    pd.merge(
        test["tour_id"].drop_duplicates(),
        train["tour_id"].drop_duplicates(),
        how='outer',
    )
)
tours_set = tours_set.reshape((len(tours_set),))

In [None]:
# Keep only the rows that concern bikers / tours present in train or test.
biker = biker.loc[biker["biker_id"].isin(req_bikers_set)]
tours = tours.loc[tours["tour_id"].isin(tours_set)]
biker_net = biker_net.loc[biker_net["biker_id"].isin(req_bikers_set)]
tour_convoy = tour_convoy.loc[tour_convoy["tour_id"].isin(tours_set)]

In [None]:
# Geocode every biker's location text to a (latitude, longitude) pair.
# Results: lat_long maps biker_id -> (lat, lon); bids / lats / longs hold the
# same data as parallel lists in biker row order. Unresolvable locations
# are recorded as (0, 0).
lat_long = {}
biker = biker.fillna("")  # missing areas become "" so the country fallback triggers
bids, lats, longs = [], [], []
geocode_cache = {}  # query text -> (lat, lon); bikers sharing an area need one lookup only
for index, row in tqdm(biker.iterrows(), total=len(biker)):
    bid = row["biker_id"]
    query = row["area"]
    if pd.isna(query) or query == "":
        # No area text: fall back to the country name for the alpha-2 code.
        # Depending on the pycountry version an unknown code either returns
        # None or raises a LookupError subclass; treat both as "no country".
        try:
            country = pycountry.countries.get(alpha_2=row["location_id"])
        except LookupError:
            country = None
        query = country.name if country is not None else ""
    # Digits are stripped from the query before it is sent to the geocoder.
    query = ''.join(ch for ch in str(query) if not ch.isdigit())
    if query not in geocode_cache:
        # Use the rate-limited wrapper (with retries) instead of calling the
        # geocoder directly; blank queries are not sent at all.
        location = geocode(query, timeout=100) if query.strip() else None
        if location is None:
            geocode_cache[query] = (0, 0)
        else:
            geocode_cache[query] = (location.latitude, location.longitude)
    lat_long[bid] = geocode_cache[query]
    bids.append(bid)
    lats.append(lat_long[bid][0])
    longs.append(lat_long[bid][1])
print(len(lat_long))

In [None]:
# Persist the filtered tables so that later cells can reload them as plain
# {id: [columns...]} dictionaries.
dire = "temp"
os.makedirs(dire, exist_ok=True)  # to_csv does not create a missing directory

# Attach the geocoded coordinates as the last two biker columns.
# NOTE(review): get_distance reads a coordinate pair from positions 7 and 8 of
# the reloaded biker rows; presumably these are the two columns appended here
# -- confirm against the bikers.csv header.
# The length guard skips the assignment when the geocoding cell was not run
# for the current `biker` frame.
if len(lats) == len(biker) and len(longs) == len(biker):
    biker["latitude"] = lats
    biker["longitude"] = longs

biker.to_csv(dire + "/bikers_useful.csv", index = False)
tours.to_csv(dire + "/tours_useful.csv", index = False)
biker_net.to_csv(dire + "/bikers_network_useful.csv", index = False)
tour_convoy.to_csv(dire + "/tour_convoy_useful.csv", index = False)

In [None]:
def _load_keyed_rows(path):
    """Read a CSV file into ``{first_column: [remaining columns...]}``.

    The header row is included (keyed by the name of the first column); the
    caller pops it. When a key occurs more than once the last row wins.
    The file is closed as soon as it has been read.
    """
    # newline='' is what the csv module expects for files it parses itself.
    with open(path, 'r', newline='') as handle:
        return {key: row for key, *row in csv.reader(handle)}


train = pd.read_csv("Data/train.csv")
test = pd.read_csv("Data/test.csv")

# Reload the filtered tables as dictionaries of string lists; each pop()
# removes the header row and prints the remaining column names.
biker = _load_keyed_rows(dire + "/bikers_useful.csv")
print(biker.pop('biker_id'))
biker_net = _load_keyed_rows(dire + "/bikers_network_useful.csv")
print(biker_net.pop('biker_id'))
tours = _load_keyed_rows(dire + "/tours_useful.csv")
print(tours.pop('tour_id'))
tour_convoy = _load_keyed_rows(dire + "/tour_convoy_useful.csv")
print(tour_convoy.pop('tour_id'))

In [None]:
# Unique biker ids per split, plus every biker / tour id kept after filtering.
train_bikers_set = np.array(train["biker_id"].drop_duplicates())
test_bikers_set = np.array(test["biker_id"].drop_duplicates())
req_bikers_set = np.array([*biker])
tours_set = np.array([*tours])

# The unfiltered convoy table, keyed by tour id. The file is closed once read.
with open("Data/tour_convoy.csv", 'r', newline='') as convoy_file:
    tour_convoy_full = { key: row for key, *row in csv.reader(convoy_file)}
# Drop the header row so that it is not counted as a tour downstream.
tour_convoy_full.pop('tour_id', None)

In [None]:
# Per-biker response counts over every tour in the full convoy table.
events_attended = {}
events_notgoing = {}
events_maybe = {}
for tid, row in tour_convoy_full.items():
    # Column layout after the tour id, as also used for the per-tour
    # statistics: 0 = going, 1 = maybe, 2 = invited, 3 = not going.
    people_attended = row[0].split()
    people_maybe = row[1].split()
    people_notgoing = row[3].split()
    for bid in people_attended:
        events_attended[bid] = events_attended.get(bid, 0) + 1
    for bid in people_notgoing:
        events_notgoing[bid] = events_notgoing.get(bid, 0) + 1
    for bid in people_maybe:
        events_maybe[bid] = events_maybe.get(bid, 0) + 1

# Per-biker totals of the same counts summed over the biker's friends.
# Every biker in the network gets an entry, including those whose friend
# list is empty (total 0).
events_attended_friends = {}
events_notgoing_friends = {}
events_maybe_friends = {}
for bid, row in biker_net.items():
    friends = row[0].split()
    events_attended_friends[bid] = sum(events_attended.get(friend, 0) for friend in friends)
    events_notgoing_friends[bid] = sum(events_notgoing.get(friend, 0) for friend in friends)
    events_maybe_friends[bid] = sum(events_maybe.get(friend, 0) for friend in friends)

In [None]:
def get_delta(bid, tid, timestamp):
    """Return the absolute gap between two dates, in units of 1e7 seconds.

    The first date is parsed from the first 10 characters of ``timestamp``
    and the second from column 1 of the tour's row; both use the
    day-month-year format. ``bid`` is accepted but not used.
    """
    date_format = "%d-%m-%Y"
    stamp_day = datetime.strptime(timestamp[:10], date_format)
    tour_day = datetime.strptime(tours[tid][1], date_format)
    gap = abs(stamp_day - tour_day)
    return gap.total_seconds() / 1e7

def get_distance(bid, tid):
    """Return the distance in miles between a biker and a tour.

    The biker's coordinate pair is read from positions 7 and 8 of its row,
    the tour's from positions 6 and 7 of its row.
    """
    biker_row = biker[bid]
    biker_point = (biker_row[7], biker_row[8])
    tour_row = tours[tid]
    tour_point = (tour_row[6], tour_row[7])
    return distance(biker_point, tour_point).miles

def get_top10_wc(tid):
    """Return the integer sum of columns 8 through 17 of the tour's row."""
    fields = tours[tid]
    # Index explicitly (not by slice) so a too-short row still raises IndexError.
    return sum(int(fields[col]) for col in range(8, 18))

# Per-tour statistics taken from the filtered convoy table, plus the (still
# empty) per-biker minimum-delta table that is filled in afterwards.
fraction_in = {}
total_attendees = {}
deltas = {}
total_maybe = {}

for tid, row in tour_convoy.items():
    # Each of the first four columns is a whitespace-separated id list;
    # only the number of ids in each is needed.
    attend, maybe, invited, not_going = (len(row[col].split()) for col in range(4))
    fraction_in[tid] = not_going / (invited + 1)  # +1 keeps the ratio defined when nobody was invited
    total_attendees[tid] = attend
    total_maybe[tid] = maybe

# For every biker, remember the smallest get_delta value seen over all of the
# biker's rows in train and in test.
for frame in (train, test):
    for _, record in frame.iterrows():
        rider = record["biker_id"]
        gap = get_delta(rider, record["tour_id"], record["timestamp"])
        deltas[rider] = min(deltas.get(rider, gap), gap)

In [None]:
d = 0
def make_feature(bid, tid, timestamp, invited):
    """Build the 8-element feature vector for one (biker, tour) pair.

    Elements, in order:
      0. get_delta(bid, tid, timestamp)
      1. fraction_in[tid]
      2. total_attendees[tid]
      3. get_distance(bid, tid)
      4. events_attended_friends for the biker (0 when the biker has no entry)
      5. get_top10_wc(tid)
      6. deltas[bid] divided by element 0 (1 when element 0 is zero)
      7. the ``invited`` value passed in

    Returns a NumPy array built from those eight values.
    """
    f = [0.0]*8
    f[0] = get_delta(bid, tid, timestamp)
    f[1] = fraction_in[tid]
    f[2] = total_attendees[tid]
    f[3] = get_distance(bid, tid)
    # Bikers that never appear in the friend totals contribute 0 instead of
    # raising KeyError.
    f[4] = events_attended_friends.get(bid, 0)
    f[5] = get_top10_wc(tid)
    f[6] = 1 if f[0] == 0 else deltas[bid]/f[0]
    f[7] = invited
    # Currently disabled features:
#     f[8] = total_maybe[tid]
#     f[9] = events_notgoing_friends[bid]
#     f[10] = events_maybe_friends[bid]
    return np.array(f)

# NOTE: a second, pasted copy of the entire notebook used to sit here. It
# repeated every other cell verbatim (re-running geocoding and model fitting)
# and contained zero-width-space lines that are not valid Python.

# Training design matrix: one make_feature vector per row of train.
X = np.array([
    make_feature(rec["biker_id"], rec["tour_id"], rec["timestamp"], rec["invited"])
    for _, rec in train.iterrows()
])
# Binary target: 1 where the "like" column equals 1, otherwise 0.
Y = np.array([1 if rec["like"] == 1 else 0 for _, rec in train.iterrows()])

print(X.shape)

In [None]:
# Feature-count bookkeeping. NOTE: rfmf / gbmf are computed but the
# max_features values below are hard-coded and do not read them.
nf = X.shape[1]
rfmf = nf - 2
gbmf = nf - 3

# Two models whose outputs are later averaged into one ranking score:
# a random-forest classifier and a gradient-boosting regressor fitted on the
# same 0/1 labels.
classifiers = [
    RandomForestClassifier(max_depth=20, n_estimators=361, max_features=6, max_samples = 0.7, random_state = 0),
    GradientBoostingRegressor(max_depth=20, n_estimators=351, max_features=7, random_state = 0)
]
clf0 = classifiers[0]
clf1 = classifiers[1]

# 5-fold cross-validation; each summary line is labelled with the metric
# that was actually requested from cross_val_score.
scores = cross_val_score(clf0, X, Y, cv=5, scoring = 'balanced_accuracy')
print(scores)
print("Balanced accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
scores = cross_val_score(clf1, X, Y, cv=5, scoring = 'neg_mean_squared_error')
print(scores)
print("Negative MSE: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

# Final fit on the full training set; these fitted models score the test rows.
clf0.fit(X,Y)
clf1.fit(X,Y)

In [None]:
# Display names for the feature columns, in make_feature order. The list has
# 11 entries while make_feature currently returns 8 values, so only the
# first 8 names can be selected by the index arrays below.
feature_names = ['delta', 'not_going_ratio', 'attended', 'distance', 'eaf', 'word_count', 
                 'min_delta_ratio', 'invited', 'maybe', 'enf', 'emf']

# Hold out 20% of the training rows for the importance diagnostics.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=13)

params = { 'max_depth':20, 'n_estimators':151, 'max_features':7, 'random_state':1 }
#params = { 'max_depth':20, 'n_estimators':161, 'max_features':9, 'random_state':1 , 'max_samples':0.7}

# A separate diagnostic model, fitted on the 80% split only (clf0 / clf1 are
# not touched here).
#reg = RandomForestClassifier(**params)
reg = GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)

# The triple-quoted string below is disabled code (held-out MSE and a
# per-stage deviance plot); as a bare string literal it is never executed.
'''
mse = mean_squared_error(y_test, reg.predict(X_test))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

test_score = np.zeros((params['n_estimators'],), dtype=np.float64)
for i, y_pred in enumerate(reg.staged_predict(X_test)):
    test_score[i] = reg.loss_(y_test, y_pred)

fig = plt.figure(figsize=(6, 6))
plt.subplot(1, 1, 1)
plt.title('Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, reg.train_score_, 'b-',
         label='Training Set Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',
         label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')
fig.tight_layout()
plt.show() '''

# Left panel: the model's built-in feature importances, sorted ascending so
# the largest bar is drawn at the top.
feature_importance = reg.feature_importances_
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
fig = plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, np.array(feature_names)[sorted_idx])
plt.title('Feature Importance (MDI)')

# Right panel: permutation importance on the held-out split, 10 shuffles per
# feature; sorted_idx is re-computed from the mean permutation importance.
result = permutation_importance(reg, X_test, y_test, n_repeats=10,
                                random_state=42, n_jobs=2)
sorted_idx = result.importances_mean.argsort()
plt.subplot(1, 2, 2)
plt.boxplot(result.importances[sorted_idx].T,
            vert=False, labels=np.array(feature_names)[sorted_idx])
plt.title("Permutation Importance (test set)")
fig.tight_layout()
plt.show()

In [None]:
# Lookup tables keyed by (biker_id, tour_id) for the test rows: ts holds the
# row's timestamp, inv its "invited" value. A repeated pair keeps the last row.
ts = {}
inv = {}
for _, rec in test.iterrows():
    pair = (rec["biker_id"], rec["tour_id"])
    ts[pair] = rec["timestamp"]
    inv[pair] = rec["invited"]

In [None]:
# For every biker in the test set, rank that biker's tours by the average of
# the classifier's class-1 probability and the regressor's prediction.
# bikers_out / tours_out are parallel lists; each tours_out entry is the
# biker's tour ids as one space-delimited string, best first.
bikers_out = []
tours_out = []
for biker1 in tqdm(test_bikers_set):
    # Select by value with a boolean mask; this does not depend on the
    # labels of test's index, unlike feeding np.where positions to .loc.
    tour = list(test.loc[test["biker_id"] == biker1, "tour_id"])
    # One feature row per tour, scored by both models in a single call each.
    fe = np.array([
        make_feature(biker1, tou, ts[(biker1, tou)], inv[(biker1, tou)])
        for tou in tour
    ])
    s1 = clf0.predict_proba(fe)[:, 1]
    s2 = clf1.predict(fe)
    score = dict(zip(tour, (s1 + s2) / 2))
    # list.sort is stable, so tours with equal scores keep their test-file order.
    tour.sort(key = lambda x : score[x], reverse = True)
    tour = " ".join(tour) # list to space delimited string
    bikers_out.append(biker1)
    tours_out.append(tour)

In [None]:
# Two-column submission table: one row per test biker with the ranked,
# space-delimited tour ids.
sample_submission = pd.DataFrame({"biker_id": bikers_out, "tour_id": tours_out})
sample_submission.to_csv(dire + "/submission.csv", index=False)
print(sample_submission.shape)
print(sample_submission.head(10))