In [53]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

# Load data

In [54]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

In [55]:
train.head()

Unnamed: 0,row_id,time,x,y,direction,congestion
0,0,1991-04-01 00:00:00,0,0,EB,70
1,1,1991-04-01 00:00:00,0,0,NB,49
2,2,1991-04-01 00:00:00,0,0,SB,24
3,3,1991-04-01 00:00:00,0,1,EB,18
4,4,1991-04-01 00:00:00,0,1,NB,60


In [56]:
"""
row_id - a unique identifier for this instance
time - the 20-minute period in which each measurement was taken
x - the east-west midpoint coordinate of the roadway
y - the north-south midpoint coordinate of the roadway

direction - the direction of travel of the roadway.
EB indicates "eastbound" travel, for example,
while SW indicates a "southwest" direction of travel.

congestion - congestion levels for the roadway during each hour;
the target. The congestion measurements have been normalized
to the range 0 to 100.
"""
print()




# Exploratory Data Analysis

In [57]:
train.shape

(848835, 6)

In [58]:
train.describe()

Unnamed: 0,row_id,x,y,congestion
count,848835.0,848835.0,848835.0,848835.0
mean,424417.0,1.138462,1.630769,47.815305
std,245037.70221,0.801478,1.089379,16.799392
min,0.0,0.0,0.0,0.0
25%,212208.5,0.0,1.0,35.0
50%,424417.0,1.0,2.0,47.0
75%,636625.5,2.0,3.0,60.0
max,848834.0,2.0,3.0,100.0


In [59]:
train.isnull().sum()

row_id        0
time          0
x             0
y             0
direction     0
congestion    0
dtype: int64

In [60]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 848835 entries, 0 to 848834
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   row_id      848835 non-null  int64 
 1   time        848835 non-null  object
 2   x           848835 non-null  int64 
 3   y           848835 non-null  int64 
 4   direction   848835 non-null  object
 5   congestion  848835 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 38.9+ MB


In [61]:
train["direction"].unique()

array(['EB', 'NB', 'SB', 'WB', 'NE', 'SW', 'NW', 'SE'], dtype=object)

In [62]:
train.time = pd.to_datetime(train.time)

In [63]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 848835 entries, 0 to 848834
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   row_id      848835 non-null  int64         
 1   time        848835 non-null  datetime64[ns]
 2   x           848835 non-null  int64         
 3   y           848835 non-null  int64         
 4   direction   848835 non-null  object        
 5   congestion  848835 non-null  int64         
dtypes: datetime64[ns](1), int64(4), object(1)
memory usage: 38.9+ MB


In [64]:
train.time = pd.to_datetime(train.time)
train['year'] = train.time.dt.year
train['month'] = train.time.dt.month
train['week'] = train.time.dt.isocalendar().week
train['hour'] = train.time.dt.hour
train['minute'] = train.time.dt.minute
train['day_of_week'] = train.time.dt.day_name()
train['day_of_year'] = train.time.dt.dayofyear
train['is_weekend'] = (train.time.dt.dayofweek >= 5).astype("int")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 848835 entries, 0 to 848834
Data columns (total 14 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   row_id       848835 non-null  int64         
 1   time         848835 non-null  datetime64[ns]
 2   x            848835 non-null  int64         
 3   y            848835 non-null  int64         
 4   direction    848835 non-null  object        
 5   congestion   848835 non-null  int64         
 6   year         848835 non-null  int64         
 7   month        848835 non-null  int64         
 8   week         848835 non-null  UInt32        
 9   hour         848835 non-null  int64         
 10  minute       848835 non-null  int64         
 11  day_of_week  848835 non-null  object        
 12  day_of_year  848835 non-null  int64         
 13  is_weekend   848835 non-null  int32         
dtypes: UInt32(1), datetime64[ns](1), int32(1), int64(9), object(2)
memory usage: 85.0+ M

Unnamed: 0,row_id,time,x,y,direction,congestion,year,month,week,hour,minute,day_of_week,day_of_year,is_weekend
0,0,1991-04-01,0,0,EB,70,1991,4,14,0,0,Monday,91,0
1,1,1991-04-01,0,0,NB,49,1991,4,14,0,0,Monday,91,0
2,2,1991-04-01,0,0,SB,24,1991,4,14,0,0,Monday,91,0
3,3,1991-04-01,0,1,EB,18,1991,4,14,0,0,Monday,91,0
4,4,1991-04-01,0,1,NB,60,1991,4,14,0,0,Monday,91,0


0         Monday
1         Monday
2         Monday
3         Monday
4         Monday
           ...  
848830    Monday
848831    Monday
848832    Monday
848833    Monday
848834    Monday
Name: day_of_week, Length: 848835, dtype: string

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 848835 entries, 0 to 848834
Data columns (total 14 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   row_id       848835 non-null  int64         
 1   time         848835 non-null  datetime64[ns]
 2   x            848835 non-null  int64         
 3   y            848835 non-null  int64         
 4   direction    848835 non-null  object        
 5   congestion   848835 non-null  int64         
 6   year         848835 non-null  int64         
 7   month        848835 non-null  int64         
 8   week         848835 non-null  UInt32        
 9   hour         848835 non-null  int64         
 10  minute       848835 non-null  int64         
 11  day_of_week  848835 non-null  object        
 12  day_of_year  848835 non-null  int64         
 13  is_weekend   848835 non-null  int32         
dtypes: UInt32(1), datetime64[ns](1), int32(1), int64(9), object(2)
memory usage: 85.0+ M