In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the training set from the working directory and preview the first rows.
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,ID,date,warehouse_ID,Latitude,Longitude,Product_Type,year,month,is_weekend,is_warehouse_closed,daily_dispatch_count,weekly_dispatch_count
0,0x2710,2017-01-01,WH_0x3e9,41.681471,-72.794746,Type_A,2017,January,Yes,No,5.5,5.5
1,0x33e6,2017-01-01,WH_0x3ea,38.749077,-105.18306,Type_A,2017,January,Yes,No,6.1,6.1
2,0x2711,2017-01-02,WH_0x3e9,41.681471,-72.794746,Type_A,2017,January,No,No,4.8,
3,0x33e7,2017-01-02,WH_0x3ea,38.749077,-105.18306,Type_A,2017,January,No,No,6.7,
4,0x2712,2017-01-03,WH_0x3e9,41.681471,-72.794746,Type_A,2017,January,No,No,5.8,


In [3]:
# (rows, columns) of the training frame.
train.shape

(16644, 12)

In [4]:
# Number of rows that are exact duplicates of an earlier row.
train.duplicated().sum()

0

In [5]:
# Per-column count of missing values in the training data.
train.isna().sum()

ID                           0
date                         0
warehouse_ID                 0
Latitude                   332
Longitude                  416
Product_Type                 0
year                         0
month                        0
is_weekend                 499
is_warehouse_closed        166
daily_dispatch_count         0
weekly_dispatch_count    14272
dtype: int64

In [6]:
# Column dtypes and non-null counts; note `date` is still a plain object column here.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16644 entries, 0 to 16643
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID                     16644 non-null  object 
 1   date                   16644 non-null  object 
 2   warehouse_ID           16644 non-null  object 
 3   Latitude               16312 non-null  float64
 4   Longitude              16228 non-null  float64
 5   Product_Type           16644 non-null  object 
 6   year                   16644 non-null  int64  
 7   month                  16644 non-null  object 
 8   is_weekend             16145 non-null  object 
 9   is_warehouse_closed    16478 non-null  object 
 10  daily_dispatch_count   16644 non-null  float64
 11  weekly_dispatch_count  2372 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 1.5+ MB


In [7]:
# Load the test set (same columns as train minus the two dispatch-count columns)
# and preview the first rows.
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,ID,date,warehouse_ID,Latitude,Longitude,Product_Type,year,month,is_weekend,is_warehouse_closed
0,0x31fc,2021-05-01,WH_0x3e9,41.681471,,Type_A,2021,May,,No
1,0x31fd,2021-05-01,WH_0x3e9,41.681471,,Type_B,2021,May,Yes,No
2,0x3ed2,2021-05-01,WH_0x3ea,38.749077,-105.18306,Type_A,2021,May,Yes,No
3,0x3ed3,2021-05-01,WH_0x3ea,38.749077,-105.18306,Type_B,2021,May,Yes,No
4,0x4a3a,2021-05-01,WH_0x3eb,35.67897,-109.067413,Type_A,2021,May,Yes,Yes


In [8]:
# (rows, columns) of the test frame.
test.shape

(4900, 10)

In [9]:
# Per-column count of missing values in the test data.
test.isna().sum()

ID                       0
date                     0
warehouse_ID             0
Latitude                98
Longitude              122
Product_Type             0
year                     0
month                    0
is_weekend             147
is_warehouse_closed     49
dtype: int64

In [12]:
# Stack the train and test rows into a single frame. The test set has no
# dispatch-count columns, so those are NaN for the test rows.
# ignore_index=True assigns a fresh 0..n-1 index; without it the test rows
# reuse labels 0..4899 from train, leaving duplicate index labels that make
# label-based selection and index alignment ambiguous or error-prone.
combined = pd.concat([train, test], axis = 0, ignore_index = True)
combined.head()

Unnamed: 0,ID,date,warehouse_ID,Latitude,Longitude,Product_Type,year,month,is_weekend,is_warehouse_closed,daily_dispatch_count,weekly_dispatch_count
0,0x2710,2017-01-01,WH_0x3e9,41.681471,-72.794746,Type_A,2017,January,Yes,No,5.5,5.5
1,0x33e6,2017-01-01,WH_0x3ea,38.749077,-105.18306,Type_A,2017,January,Yes,No,6.1,6.1
2,0x2711,2017-01-02,WH_0x3e9,41.681471,-72.794746,Type_A,2017,January,No,No,4.8,
3,0x33e7,2017-01-02,WH_0x3ea,38.749077,-105.18306,Type_A,2017,January,No,No,6.7,
4,0x2712,2017-01-03,WH_0x3e9,41.681471,-72.794746,Type_A,2017,January,No,No,5.8,


In [13]:
# Sanity check: row count should equal train rows + test rows (16644 + 4900).
combined.shape

(21544, 12)

In [14]:
# Parse the date strings into datetimes so the `.dt` accessor can be used below.
combined['date'] = pd.to_datetime(combined['date'])

In [15]:
# Temporary helper column holding the weekday name (e.g. 'Sunday') of each date.
combined['day_of_week'] = combined['date'].dt.day_name()

In [16]:
# Preview the frame to confirm the new `day_of_week` column was added.
combined.head()

Unnamed: 0,ID,date,warehouse_ID,Latitude,Longitude,Product_Type,year,month,is_weekend,is_warehouse_closed,daily_dispatch_count,weekly_dispatch_count,day_of_week
0,0x2710,2017-01-01,WH_0x3e9,41.681471,-72.794746,Type_A,2017,January,Yes,No,5.5,5.5,Sunday
1,0x33e6,2017-01-01,WH_0x3ea,38.749077,-105.18306,Type_A,2017,January,Yes,No,6.1,6.1,Sunday
2,0x2711,2017-01-02,WH_0x3e9,41.681471,-72.794746,Type_A,2017,January,No,No,4.8,,Monday
3,0x33e7,2017-01-02,WH_0x3ea,38.749077,-105.18306,Type_A,2017,January,No,No,6.7,,Monday
4,0x2712,2017-01-03,WH_0x3e9,41.681471,-72.794746,Type_A,2017,January,No,No,5.8,,Tuesday


In [21]:
def weekend_impute(x):
    """Map a weekday name to a weekend flag.

    Parameters
    ----------
    x : object
        Expected to be a day name such as 'Monday' or 'Sunday'.

    Returns
    -------
    str
        'Yes' when ``x`` is 'Sunday' or 'Saturday', otherwise 'No'
        (including for any value that is not a day name).
    """
    return 'Yes' if x in ('Sunday', 'Saturday') else 'No'

In [22]:
# Derive the weekend flag from the calendar day name. `weekend_impute` compares
# its argument against 'Saturday'/'Sunday', so it must be fed `day_of_week`.
# Applying it to `is_weekend` itself (values 'Yes'/'No'/NaN) can never match
# and would overwrite every row with 'No'. `date` has no missing values, so
# this also fills the rows where `is_weekend` was originally null.
combined['is_weekend'] = combined['day_of_week'].apply(weekend_impute)

In [24]:
# The weekday-name helper column is no longer needed once is_weekend is set.
combined = combined.drop(columns = ['day_of_week'])

In [25]:
# Re-check missing values after imputation. The 4900 nulls in
# daily_dispatch_count are the test rows, which have no dispatch-count columns;
# weekly_dispatch_count includes those plus the nulls already present in train.
combined.isnull().sum()

ID                           0
date                         0
warehouse_ID                 0
Latitude                   430
Longitude                  538
Product_Type                 0
year                         0
month                        0
is_weekend                   0
is_warehouse_closed        215
daily_dispatch_count      4900
weekly_dispatch_count    19172
dtype: int64