In [1]:
import numpy as np
import pandas as pd
from geopy import distance
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans, DBSCAN

In [2]:
df = pd.read_csv("dataset.csv")

# A "restaurant" is one distinct (start_lat, start_lon) pair; the pair is
# turned into a "<lat> <lon>" text key so it can be de-duplicated and looked up.
df["AggrCoordinates"] = pd.Categorical(df["start_lat"].astype(str) + ' ' + df["start_lon"].astype(str))

# Number the distinct keys 0..n-1 in order of first appearance in the file.
mapping = df[["AggrCoordinates"]].drop_duplicates()
mapping = mapping.assign(Restaurant=np.arange(len(mapping)))

# Look up every row's key in that table, then discard the helper column.
restaurant_by_key = mapping.set_index("AggrCoordinates")["Restaurant"]
df["Restaurant"] = restaurant_by_key.reindex(df["AggrCoordinates"].values).values
df = df.drop(columns=["AggrCoordinates"])

# Leave out every restaurant id listed in column "0" of groupe_1.csv.
# NOTE(review): presumably these are the constant-fee restaurants that are
# labelled as group 0 further down — confirm how groupe_1.csv was produced.
excluded_restaurants = pd.read_csv('groupe_1.csv')['0'].unique()
is_kept = ~df["Restaurant"].isin(excluded_restaurants)
df = df.loc[is_kept].reset_index(drop=True)

# Distance in kilometres between the start and end coordinates of each row,
# computed pair by pair with geopy.
start_points = zip(df['start_lat'], df['start_lon'])
end_points = zip(df['end_lat'], df['end_lon'])
df['Distance'] = [distance.distance(start, end).km for start, end in zip(start_points, end_points)]

# Time of day as a single fractional-hour number (e.g. 13:44 -> 13.733...).
df["HourTime"] = df["hour"] + df["minute"] / 60

# Encode the weekday name as an integer, Monday = 0 ... Sunday = 6.
days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
day_number = {day_name: position for position, day_name in enumerate(days)}
df['Day'] = df["day_of_week"].map(day_number)

In [3]:
# Peek at the first rows to check the derived columns (Restaurant, Distance, HourTime, Day).
df.head()

Unnamed: 0,observation_uuid,start_lat,start_lon,start_code_postal,end_lat,end_lon,end_code_postal,rain,heat,day_of_week,hour,minute,fee,Restaurant,Distance,HourTime,Day
0,0d742256-0b36-f3cd-0acf-eba4441030ae,48.8406,2.322,75014,48.8262,2.3192,75014,2,5,Saturday,13,44,1.69,0,1.614514,13.733333,5
1,743c7e9d-2fde-b035-2452-bc39dbf2eed1,48.8472,2.3527,75005,48.8337,2.388,75012,9,3,Tuesday,14,14,1.59,1,2.994697,14.233333,1
2,43893bc5-449c-56ef-3857-ebe679ffa869,48.9015,2.3704,75019,48.8814,2.3484,75009,1,9,Sunday,14,39,2.19,2,2.756647,14.65,6
3,284884a2-55e5-c1d9-b036-4ad9ee5be9b9,48.8504,2.2902,75015,48.8716,2.2811,75016,6,2,Thursday,12,46,1.79,3,2.45032,12.766667,3
4,a0ce5b5f-ca56-3921-f942-2f1efed97f9e,48.8583,2.3894,75011,48.8623,2.3426,75001,2,6,Friday,13,38,2.09,4,3.462698,13.633333,4


In [4]:
# Regression table: one row per observation (indexed by its uuid) holding the
# restaurant id, the four explanatory variables and the target `fee`.
regression_columns = ["Restaurant", "rain", "heat", "Distance", "HourTime", "fee"]
for_regression = df.loc[:, ["observation_uuid", *regression_columns]].set_index("observation_uuid")

In [5]:
# Min-max rescale the features and the target: (value - min) / (max - min),
# computed per column over the whole table, so each column ends up in [0, 1].
scaled_columns = ["rain", "heat", "Distance", "HourTime", "fee"]
column_min = for_regression[scaled_columns].min()
column_span = for_regression[scaled_columns].max() - column_min
for_regression[scaled_columns] = (for_regression[scaled_columns] - column_min) / column_span

In [6]:
# Order the rows by restaurant id so each restaurant's observations are contiguous.
for_regression = for_regression.sort_values(by="Restaurant")

In [7]:
def model(df):
    """Fit a linear regression of ``fee`` on the four explanatory columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to fit on; must contain the columns ``rain``, ``heat``,
        ``Distance``, ``HourTime`` and ``fee``.

    Returns
    -------
    pandas.DataFrame
        A single row with integer column labels 0-4: the four slopes in the
        feature order listed above, followed by the intercept.
    """
    feature_names = ["rain", "heat", "Distance", "HourTime"]
    fitted = LinearRegression().fit(df[feature_names], df["fee"])
    # Put the intercept after the slopes so the whole model is one flat row.
    parameters = np.append(fitted.coef_, fitted.intercept_)
    return pd.DataFrame([parameters])
    
# Fit one linear model per restaurant. Each row of `coeffs` holds that
# restaurant's four slopes (rain, heat, Distance, HourTime) followed by the
# intercept, under the integer column labels 0-4 produced by `model`.
coeffs = for_regression.groupby("Restaurant").apply(model).droplevel(-1)

# Cluster the restaurants by their fitted coefficients.
# random_state is pinned because k-means starts from random centroids: without
# it the cluster numbering (and possibly the membership) changes on every run,
# so the labels exported later could not be reproduced with Restart & Run All.
kmeans = KMeans(n_clusters=7, n_init=30, random_state=0).fit(coeffs)

# Shift the labels from 0..6 to 1..7 (label 0 is assigned separately further
# down the notebook), then expose the restaurant id as a regular column.
coeffs["group"] = kmeans.labels_ + 1
coeffs = coeffs.reset_index()
coeffs

Unnamed: 0,Restaurant,0,1,2,3,4,group
0,0,0.008069,0.007139,0.449673,0.002783,0.119323,5
1,1,-0.003205,0.001222,0.306768,-0.008871,0.068270,1
2,2,-0.026269,-0.003479,0.585512,0.174063,-0.081569,7
3,3,0.118613,-0.052294,0.368461,-0.112237,0.100361,5
4,4,-0.005647,0.002436,0.472513,0.128162,-0.028063,3
...,...,...,...,...,...,...,...
565,595,-0.039516,0.051516,0.459283,0.193872,-0.000482,4
566,596,-0.016873,-0.006603,0.469918,0.086757,-0.019385,3
567,597,-0.037713,0.058108,0.513840,0.177423,-0.050402,3
568,598,0.209655,-0.112404,0.553846,0.192996,-0.152499,2


In [8]:
# Give every observation the cluster label of its restaurant (inner join on the
# restaurant id), then keep only the observation id and the label.
result = (
    df[["observation_uuid", "Restaurant"]]
    .merge(coeffs[["group", "Restaurant"]], on="Restaurant")
    .drop(columns=["Restaurant"])
)

In [9]:
# Inspect the per-observation cluster assignment (observation_uuid -> group).
result

Unnamed: 0,observation_uuid,group
0,0d742256-0b36-f3cd-0acf-eba4441030ae,5
1,38dfe76b-ae35-8290-6d80-ab08c963d148,5
2,104dff66-23f1-b67e-01d3-4690a795ac54,5
3,8ec9ea98-6581-f934-9bcf-b1c4f87e3560,5
4,4a30f7cd-00fd-ec23-598c-a3b429b10823,5
...,...,...
4555,9f3176c0-468b-db70-8b6e-6609f5b9cc00,3
4556,c175de4b-7901-0563-f200-e63ad5700b5a,3
4557,126bbdd7-f8d1-cd15-e017-68c509c0fbd4,3
4558,37448afb-3023-73be-3a6d-7a7e16706465,3


In [10]:
# List the distinct cluster labels that actually occur in `result`.
result["group"].unique()

array([5, 1, 7, 3, 4, 6, 2], dtype=int32)

In [11]:
# Reload the complete dataset: `df` was filtered earlier in the notebook, and
# this step has to look at every restaurant again.
df = pd.read_csv("dataset.csv")

# Rebuild the restaurant ids exactly as in the first loading cell: one id per
# distinct "<lat> <lon>" key, numbered in order of first appearance.
df["AggrCoordinates"] = pd.Categorical(df["start_lat"].astype(str) + ' ' + df["start_lon"].astype(str))
mapping = df[["AggrCoordinates"]].drop_duplicates()
mapping = mapping.assign(Restaurant=np.arange(len(mapping)))
df["Restaurant"] = (
    mapping.set_index("AggrCoordinates")["Restaurant"]
    .reindex(df["AggrCoordinates"].values)
    .values
)
df = df.drop(columns=["AggrCoordinates"])

# Restaurants whose fee has a standard deviation of exactly 0 get label 0.
# A restaurant with a single observation has an undefined (NaN) standard
# deviation and is therefore not selected here.
group_0 = (
    df.groupby("Restaurant")
    .agg({"fee": ["mean", "std"]})["fee"]
    .query("std == 0")
    .assign(group=0)
)

# Attach label 0 to the observations of those restaurants; keep id + label only.
result2 = (
    df[["observation_uuid", "Restaurant"]]
    .merge(group_0.reset_index()[["group", "Restaurant"]], on="Restaurant")
    .drop(columns=["Restaurant"])
)

In [12]:
# Stack the k-means labelled observations on top of the group-0 ones, rename
# the label column to `algorithm`, and write the table to result.csv.
result3 = pd.concat([result, result2], ignore_index=True).rename(columns={"group": "algorithm"})
result3.to_csv("result.csv", index=False)

In [13]:
# Final table as written to result.csv: observation_uuid with its `algorithm` label.
result3

Unnamed: 0,observation_uuid,algorithm
0,0d742256-0b36-f3cd-0acf-eba4441030ae,5
1,38dfe76b-ae35-8290-6d80-ab08c963d148,5
2,104dff66-23f1-b67e-01d3-4690a795ac54,5
3,8ec9ea98-6581-f934-9bcf-b1c4f87e3560,5
4,4a30f7cd-00fd-ec23-598c-a3b429b10823,5
...,...,...
4795,ff3e0ba1-0ac7-28b4-a418-65bf350d278d,0
4796,41a8a6e1-65e0-4993-7f41-1fed1e70e799,0
4797,869bdbd2-e72b-b5b7-0712-0911b3b68b57,0
4798,33a1d1c2-ad4a-b155-c09f-cd8f739cd488,0
