In [1]:
import modin.pandas as pd
import numpy as np
# import ray
# ray.init(runtime_env={'env_vars': {'__MODIN_AUTOIMPORT_PANDAS__': '1'}})

In [3]:
%%time
dtype = {'orderid'            : 'int64',
         'pick'               : 'int64',
         '1st_deliver_attempt': 'int64',
         '2nd_deliver_attempt': 'float64',
         'buyeraddress'       : 'object',
         'selleraddress'      : 'object'}

df = pd.read_csv("/Users/sonle/Documents/Data/delivery_orders_march.csv", dtype=dtype)

CPU times: user 709 ms, sys: 427 ms, total: 1.14 s
Wall time: 12.5 s


In [4]:
WORKDAYS = '1111110'
HOLIDAYS = ['2020-03-08', '2020-03-25', '2020-03-30', '2020-03-31']

GMT8_OFFSET = 3600 * 8
DURATION_1DAY = 3600 * 24

sla_matrix_1st_attempt = [
	[3, 5, 7, 7],
	[5, 5, 7, 7],
	[7, 7, 7, 7],
	[7, 7, 7, 7]
]

sla_matrix_2nd_attempt = [
	[3, 3, 3, 3],
	[3, 3, 3, 3],
	[3, 3, 3, 3],
	[3, 3, 3, 3]
]
locations = ["Metro Manila", "Luzon", "Visayas", "Mindanao"]
locations = [loc.lower() for loc in locations]
location_to_index = {loc: i for i, loc in enumerate(locations)}
print(location_to_index)

min_length = min(map(len, locations))
trunc_location_to_index = {loc [-min_length:]: i for i, loc in enumerate(locations)}
print(trunc_location_to_index)

{'metro manila': 0, 'luzon': 1, 'visayas': 2, 'mindanao': 3}
{'anila': 0, 'luzon': 1, 'sayas': 2, 'danao': 3}


In [5]:
map_to_dict = dict(enumerate(np.array(sla_matrix_1st_attempt).flatten()))

In [6]:
def tweak_result(df: pd.DataFrame) -> pd.DataFrame:
	def convert_address(df):
		"""
        Function to convert address to given index
        :param df: original df
        :return: Updated buyeraddress and selleraddress with exact index of cites
        """
		# df_['buyeraddress'] = df_['buyeraddress'].apply(lambda x: x[-min_length:].lower()).map(trunc_location_to_index)
		# df_['selleraddress'] = df_['selleraddress'].apply(lambda x: x[-min_length:].lower()).map(trunc_location_to_index)
		return (df.assign(
				buyeraddress=df.buyeraddress.apply(lambda x: x [-min_length:].lower()).map(trunc_location_to_index),
				selleraddress=df.selleraddress.apply(lambda x: x [-min_length:].lower()).map(trunc_location_to_index)))

	def convert_time_todate(df):
		"""
        Function to fill NaN and convert time to second
        :param df: original df
        :return: df with ['pick', '1st_deliver_attempt', '2nd_deliver_attempt'] in second
        """
		df ['2nd_deliver_attempt'] = df ['2nd_deliver_attempt'].fillna(0).astype(np.int64)
		cols = ['pick', '1st_deliver_attempt', '2nd_deliver_attempt']
		df [cols] = df [cols].applymap(lambda x: (x + GMT8_OFFSET) // DURATION_1DAY)
		return df

	def convert_working_days(df):
		"""
        Function to calculate the transit time
        :param df: original df
        :return: df with two new columns NUM_DAYS1 and NUM_DAYS2 as transit time
        """
		t1 = df ['pick'].values.astype('datetime64[D]')
		t2 = df ['1st_deliver_attempt'].values.astype('datetime64[D]')
		t3 = df ['2nd_deliver_attempt'].values.astype('datetime64[D]')
		return (df
		        .assign(NUM_DAYS1=lambda x: np.busday_count(t1, t2, weekmask=WORKDAYS, holidays=HOLIDAYS),
		                NUM_DAYS2=lambda x: np.busday_count(t2, t3, weekmask=WORKDAYS, holidays=HOLIDAYS))
		        )

	def computing_sla(df):
		"""
        Function to calculate the sla time
        :param df: original df
        :return: df with new sla column
        """
		return (
			df.assign(temp_col=lambda x: 4 * x.buyeraddress + x.selleraddress,
			          sla1=lambda x: x.temp_col.map(map_to_dict))
			.drop(columns=['temp_col']))

	return (df.pipe(convert_address)
	        .pipe(convert_time_todate)
	        .pipe(convert_working_days)
	        .pipe(computing_sla)
	        )


In [7]:
%%time
result = tweak_result(df)

CPU times: user 27.1 s, sys: 1.8 s, total: 28.9 s
Wall time: 52.8 s


In [9]:
%%time
display(
		result
		.assign(is_late=(result.NUM_DAYS1 > result.sla1) | (result.NUM_DAYS2 > 3))
		.astype(int)
		.loc [:, ['is_late']].value_counts(normalize=True)
		# .to_csv('submission.csv', index=False) this line is to export the submission file if needed
)

[2m[1m[36m(scheduler +5m12s)[0m Tip: use `ray status` to view detailed cluster status. To disable these messages, set RAY_SCHEDULER_EVENTS=0.


is_late
0    0.759966
1    0.240034
dtype: float64

CPU times: user 661 ms, sys: 404 ms, total: 1.07 s
Wall time: 5.15 s
