In [1]:
import os
import pandas as pd
import datetime as dt
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Notebook-wide display defaults: large figures, readable font, wide frames.
plt.rcParams.update({
    'figure.figsize': [16, 10],
    'font.size': 14,
})
pd.options.display.max_columns = 99

# Timestamp taken when the configuration cell runs.
start = dt.datetime.now()

In [3]:
# Per-city latitude thresholds. Rows with Latitude <= l1 and rows with
# Latitude > l2 are later assigned to separate validation groups.
validation_splits = pd.DataFrame({
    'City': ['Atlanta', 'Boston', 'Chicago', 'Philadelphia'],
    'l1': [33.791, 42.361, 41.921, 39.999],
    'l2': [33.835, 42.383, 41.974, 40.046],
})

# Compass heading -> numeric code, increasing clockwise from north in
# steps of 1/4 (so S == 1 and a full turn would be 2).
direction_encoding = dict(zip(
    ('N', 'NE', 'E', 'SE', 'S', 'SW', 'W', 'NW'),
    (0, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75),
))

# Street-name keyword -> ordinal road-type code. Insertion order matters:
# road_encode() returns the code of the first keyword found in a name.
# NOTE(review): 'Broad' presumably targets names such as "Broadway" - confirm.
road_encoding = dict(
    Road=1,
    Street=2,
    Avenue=2,
    Drive=3,
    Broad=3,
    Boulevard=4,
)
# Rainfall lookup keyed by "<City><Month>" (e.g. 'Atlanta1'); only months
# 1 and 5-12 are listed. Units are not stated in this notebook.
monthly_rainfall = {
    f'{city}{month}': amount
    for city, by_month in {
        'Atlanta': {1: 5.02, 5: 3.95, 6: 3.63, 7: 5.12, 8: 3.67,
                    9: 4.09, 10: 3.11, 11: 4.10, 12: 3.82},
        'Boston': {1: 3.92, 5: 3.24, 6: 3.22, 7: 3.06, 8: 3.37,
                   9: 3.47, 10: 3.79, 11: 3.98, 12: 3.73},
        'Chicago': {1: 1.75, 5: 3.38, 6: 3.63, 7: 3.51, 8: 4.62,
                    9: 3.27, 10: 2.71, 11: 3.01, 12: 2.43},
        'Philadelphia': {1: 3.52, 5: 3.88, 6: 3.29, 7: 4.39, 8: 3.82,
                         9: 3.88, 10: 2.75, 11: 3.16, 12: 3.31},
    }.items()
    for month, amount in by_month.items()
}
# Temperature lookup keyed by "<City><Month>" (e.g. 'Boston12'); only months
# 1 and 5-12 are listed. NOTE(review): values look like degrees Fahrenheit -
# confirm against the data source.
monthly_temperature = {
    f'{city}{month}': degrees
    for city, by_month in {
        'Atlanta': {1: 43, 5: 69, 6: 76, 7: 79, 8: 78,
                    9: 73, 10: 62, 11: 53, 12: 45},
        'Boston': {1: 30, 5: 59, 6: 68, 7: 74, 8: 73,
                   9: 66, 10: 55, 11: 45, 12: 35},
        'Chicago': {1: 27, 5: 60, 6: 70, 7: 76, 8: 76,
                    9: 68, 10: 56, 11: 45, 12: 32},
        'Philadelphia': {1: 35, 5: 66, 6: 76, 7: 81, 8: 79,
                         9: 72, 10: 60, 11: 49, 12: 40},
    }.items()
    for month, degrees in by_month.items()
}

In [5]:
# Read the raw train and test CSVs and show their shapes.
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
train.shape, test.shape

# Flag each row's origin, then stack both frames so that every feature is
# engineered on train and test together. sort=True orders the combined
# columns alphabetically; train-only columns are NaN on test rows.
for frame, is_train in ((train, 1), (test, 0)):
    frame['IsTrain'] = is_train
full = pd.concat((train, test), sort=True)

((856387, 28), (1921357, 13))

In [6]:
# Validation Groups
# Attach each city's latitude thresholds (inner join on City), then bucket
# rows: 0 when Latitude <= l1, 2 when Latitude > l2, otherwise 1.
full = full.merge(validation_splits, on='City')
at_or_below_l1 = full['Latitude'] <= full['l1']
above_l2 = full['Latitude'] > full['l2']
full['ValidationGroup'] = (
    pd.Series(1, index=full.index)
    .mask(at_or_below_l1, 0)
    .mask(above_l2, 2)  # applied last, so it wins if both conditions hold
)
# The thresholds were only needed to derive the group.
full = full.drop(columns=['l1', 'l2'])

In [7]:
# Columns shared with the test set, minus the long 'Path' string.
cols = [name for name in test.columns if name != 'Path']

# Preview training rows that have a non-zero p80 distance to first stop,
# followed by the first test rows for comparison.
has_first_stop_distance = train['DistanceToFirstStop_p80'] > 0
train.loc[has_first_stop_distance, cols + ['DistanceToFirstStop_p80']].head()
test[cols].head()

Unnamed: 0,RowId,IntersectionId,Latitude,Longitude,EntryStreetName,ExitStreetName,EntryHeading,ExitHeading,Hour,Weekend,Month,City,IsTrain,DistanceToFirstStop_p80
109,1921466,0,33.791659,-84.430032,Marietta Boulevard Northwest,Marietta Boulevard Northwest,SE,SE,9,0,7,Atlanta,1,57.4
125,1921482,0,33.791659,-84.430032,Marietta Boulevard Northwest,Marietta Boulevard Northwest,SE,SE,17,0,7,Atlanta,1,55.5
127,1921484,0,33.791659,-84.430032,Marietta Boulevard Northwest,Marietta Boulevard Northwest,SE,SE,18,0,7,Atlanta,1,53.3
131,1921488,0,33.791659,-84.430032,Marietta Boulevard Northwest,Marietta Boulevard Northwest,SE,SE,20,0,7,Atlanta,1,59.9
201,1921558,0,33.791659,-84.430032,Marietta Boulevard Northwest,Marietta Boulevard Northwest,SE,SE,9,0,8,Atlanta,1,57.2


Unnamed: 0,RowId,IntersectionId,Latitude,Longitude,EntryStreetName,ExitStreetName,EntryHeading,ExitHeading,Hour,Weekend,Month,City,IsTrain
0,0,1,33.75094,-84.393032,Peachtree Street Southwest,Peachtree Street Southwest,NE,NE,0,0,6,Atlanta,0
1,1,1,33.75094,-84.393032,Peachtree Street Southwest,Mitchell Street Southwest,SW,SE,0,0,6,Atlanta,0
2,2,1,33.75094,-84.393032,Peachtree Street Southwest,Peachtree Street Southwest,SW,SW,0,0,6,Atlanta,0
3,3,1,33.75094,-84.393032,Peachtree Street Southwest,Peachtree Street Southwest,NE,NE,1,0,6,Atlanta,0
4,4,1,33.75094,-84.393032,Peachtree Street Southwest,Peachtree Street Southwest,SW,SW,1,0,6,Atlanta,0


In [8]:
# Coordinates rounded to 3 decimals, used later as coarse location keys.
for rounded, source in (('Latitude3', 'Latitude'), ('Longitude3', 'Longitude')):
    full[rounded] = full[source].round(3)

# 0/1 flags marking rows whose street name is missing.
for flag, street in (('EntryStreetMissing', 'EntryStreetName'),
                     ('ExitStreetMissing', 'ExitStreetName')):
    full[flag] = full[street].isna().astype(int)

# City_Month_Weekend_Hour key, e.g. "Atlanta_6_0_17".
cmwh = full['City']
for part in ('Month', 'Weekend', 'Hour'):
    cmwh = cmwh + '_' + full[part].astype(str)
full['CMWH'] = cmwh

# Compass strings -> numeric headings, then the raw entry-minus-exit
# difference (not wrapped around the compass).
for heading in ('EntryHeading', 'ExitHeading'):
    full[heading] = full[heading].replace(direction_encoding)
full['DiffHeading'] = full['EntryHeading'].sub(full['ExitHeading'])

In [9]:
# Weather features looked up by "<City><Month>" key (e.g. 'Chicago7').
#
# Series.map is used instead of Series.replace: with replace, a key missing
# from the lookup dict is silently left as its raw string (turning the
# feature into an object column), and the numeric result depends on
# implicit object downcasting. map yields NaN for unknown keys, which makes
# any gap in the lookup tables visible downstream.
city_month = full['City'] + full['Month'].astype(str)
full['Rainfall'] = city_month.map(monthly_rainfall)
full['Temperature'] = city_month.map(monthly_temperature)
# The key is only needed for the lookups; do not keep it in memory.
del city_month

In [10]:
def road_encode(x, encoding=None):
    """Return the road-type code of the first keyword found in a street name.

    Parameters
    ----------
    x : str
        Street name to classify. Non-string values (e.g. NaN or None for a
        missing name) are treated as "no match" instead of raising.
    encoding : dict, optional
        Mapping of keyword -> code, checked in insertion order using a
        case-sensitive substring test. Defaults to the module-level
        ``road_encoding``.

    Returns
    -------
    int
        Code of the first matching keyword, or 0 when nothing matches.
    """
    if encoding is None:
        encoding = road_encoding
    # A missing street name arrives as float NaN / None; `in` would raise.
    if not isinstance(x, str):
        return 0
    for keyword, code in encoding.items():
        if keyword in x:
            return code
    return 0

# Give missing street names a placeholder; it contains no road keyword, so
# road_encode maps it to 0.
full = full.fillna({
    'EntryStreetName': 'Unknown Something',
    'ExitStreetName': 'Unknown Something',
})

# Ordinal road type of the entry and exit streets.
for type_col, street_col in (('EntryType', 'EntryStreetName'),
                             ('ExitType', 'ExitStreetName')):
    full[type_col] = full[street_col].map(road_encode)

In [11]:
# Prefix street names with the city so that identical names in different
# cities stay distinct.
for street_col in ('EntryStreetName', 'ExitStreetName'):
    full[street_col] = full['City'] + ' ' + full[street_col]

# IntersectionId is likewise qualified with the city name.
full['Intersection'] = full['City'] + ' ' + full['IntersectionId'].astype(str)

# 1 when the vehicle leaves on the street it entered on, else 0.
same_street = full['EntryStreetName'] == full['ExitStreetName']
full['SameStreet'] = same_street.astype(int)

In [12]:
# Geolocation
# Standardise each coordinate with a scaler fitted on all rows (every city,
# train and test together).
for coord in ('Latitude', 'Longitude'):
    column_2d = full[coord].to_numpy().reshape(-1, 1)
    full[coord] = StandardScaler().fit_transform(column_2d)

# Distance from CityCenter
# The centre of a city is the mean of its standardised coordinates; the
# merge adds it as LatitudeDist / LongitudeDist.
city_centers = full.groupby('City')[['Latitude', 'Longitude']].mean()
full = full.merge(
    city_centers, left_on='City', right_index=True, suffixes=['', 'Dist']
)

# Overwrite the centre columns with scaled absolute per-axis offsets.
lat_offset = (5 * (full['Latitude'] - full['LatitudeDist']).abs()).round(3)
lon_offset = (5 * (full['Longitude'] - full['LongitudeDist']).abs()).round(3)
full['LatitudeDist'] = lat_offset
full['LongitudeDist'] = lon_offset

# Manhattan-style and Euclidean-style distances built from those offsets
# (each with its own extra scale factor, 5 and 3 respectively).
full['CenterDistL1'] = (5 * (lat_offset + lon_offset)).round(3)
full['CenterDistL2'] = (3 * np.sqrt(lat_offset ** 2 + lon_offset ** 2)).round(3)

In [13]:
def add_frequency(df, column):
    """Append a bucketed row-count feature for the values of ``column``.

    For every distinct value of ``column`` the number of non-null ``RowId``
    entries is counted. Counts above 10 are rounded down to a multiple of
    10; counts of 10 or less are kept exact. The result is merged back onto
    ``df`` (inner join) as a new column named ``f'{column}Count'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``column`` and ``RowId``.
    column : str
        Name of the column whose value frequency is encoded.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the additional ``f'{column}Count'`` column.
    """
    feature_name = f'{column}Count'
    counts = df.groupby(column)['RowId'].count().rename(feature_name).to_frame()
    above_ten = counts[feature_name] > 10
    counts.loc[above_ten, feature_name] = 10 * (
        counts.loc[above_ten, feature_name] // 10)
    return df.merge(counts, left_on=column, right_index=True)

# Bucketed row-frequency feature for each of these keys, in this order.
for frequency_key in ['Longitude3', 'Latitude3', 'ExitStreetName',
                      'EntryStreetName', 'Intersection', 'Path']:
    full = add_frequency(full, frequency_key)

# Frequency Encoding with unique intersections
def add_unique_intersections(df, column):
    """Append a bucketed count of distinct intersections per ``column`` value.

    For every distinct value of ``column`` the number of unique
    ``Intersection`` values is computed. Numbers above 10 are rounded down
    to a multiple of 5; numbers of 10 or less are kept exact. The result is
    merged back onto ``df`` (inner join) as
    ``f'{column}UniqueIntersections'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``column`` and ``Intersection``.
    column : str
        Name of the grouping column.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the additional ``f'{column}UniqueIntersections'`` column.
    """
    feature_name = f'{column}UniqueIntersections'
    uniques = (
        df.groupby(column)['Intersection']
        .nunique()
        .rename(feature_name)
        .to_frame()
    )
    above_ten = uniques[feature_name] > 10
    uniques.loc[above_ten, feature_name] = 5 * (
        uniques.loc[above_ten, feature_name] // 5)
    return df.merge(uniques, left_on=column, right_index=True)

# Distinct-intersection counts for the location and street keys, in order.
for grouping_key in ['Longitude3', 'Latitude3',
                     'ExitStreetName', 'EntryStreetName']:
    full = add_unique_intersections(full, grouping_key)


In [14]:
# String-valued columns that are replaced by integer labels.
columns_to_encode = ['City', 'EntryStreetName', 'ExitStreetName',
                     'Intersection', 'CMWH']

# Each column gets its own encoder, fitted on train and test rows together,
# so a given label means the same value in both subsets.
for column in columns_to_encode:
    full[column] = LabelEncoder().fit_transform(full[column])

In [15]:
# Persist the engineered table (train and test rows together) as a
# gzip-compressed CSV without the index column.
full.to_csv(
    './data/features_v3.csv.gz',
    index=False,
    compression='gzip',
)

In [16]:
# Split the engineered table back into its train and test parts.
train = full.loc[full['IsTrain'] == 1].copy()
test = full.loc[full['IsTrain'] == 0].copy()

# Per-column non-null counts and distinct-value counts for the full table
# and for each subset, laid out side by side.
subsets = (('', full), ('train_', train), ('test_', test))
column_stats = pd.concat(
    [frame.count().to_frame(f'{prefix}cnt') for prefix, frame in subsets]
    + [frame.nunique().to_frame(f'{prefix}unique') for prefix, frame in subsets],
    sort=True, axis=1,
)

# Ratio of distinct values in train to distinct values overall, in percent.
column_stats['seen_in_train%'] = (
    100 * column_stats['train_unique'] / column_stats['unique']
).round(1)

column_stats = column_stats.sort_values(by='unique')
column_stats.to_csv('col_stats.csv')
column_stats

Unnamed: 0,cnt,train_cnt,test_cnt,unique,train_unique,test_unique,seen_in_train%
Weekend,2777744,856387,1921357,2,2,2,100.0
IsTrain,2777744,856387,1921357,2,1,1,50.0
ExitStreetMissing,2777744,856387,1921357,2,2,2,100.0
EntryStreetMissing,2777744,856387,1921357,2,2,2,100.0
SameStreet,2777744,856387,1921357,2,2,2,100.0
ValidationGroup,2777744,856387,1921357,3,2,3,66.7
City,2777744,856387,1921357,4,4,4,100.0
ExitType,2777744,856387,1921357,5,5,5,100.0
EntryType,2777744,856387,1921357,5,5,5,100.0
ExitHeading,2777744,856387,1921357,8,8,8,100.0
