# Feature engineering for HubAndSpoke and Routed (Door to Door) vehicle classifier

In this notebook we show how to perform feature engineering on raw trip data and extract high-level constructs from it.

In this notebook we will cover four main feature engineering techniques which are:
* Bucketing
* Hashing
* Crossing
* Embedding

<img src="https://user-images.githubusercontent.com/31048109/59074132-89d39300-88a0-11e9-8eea-b0105be62bc7.png">

### Step 1: Loading the data 

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import pandas as pd

In [None]:
# Authenticate the Colab user, then build a PyDrive client that reuses the
# application-default Google credentials obtained by that login.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
# Reference the Drive file by its id and save its content locally as Annon.csv.
downloaded = drive.CreateFile({'id':"1oc1JytEzmGX6abYlik8ehloH1fAORblU"})
downloaded.GetContentFile('Annon.csv') 

In [None]:
# Load the downloaded trip data; later cells read the HardwareId, StartGeohash,
# StopGeohash and GeoLabel columns from this frame.
df = pd.read_csv("Annon.csv")

In [None]:
df.head()

In [None]:
df.GeoLabel.value_counts()

### Step 2: Bucketing into level 6 geohash

In [None]:
# NOTE(review): bucketing is currently disabled - both lines below are commented
# out, so the geohashes are used exactly as they appear in the CSV. Presumably
# they are already level-6 geohashes; confirm before relying on this step.
#df['StartGeohash'] = df.StartGeohash.apply(lambda x: x[0:6])
#df['StopGeohash'] = df.StopGeohash.apply(lambda x: x[0:6])

### Step 3: Hashing
* Create a hash function that finds the top 50 most frequently visited geohash locations for each vehicle and ranks them.
* Apply the hash function to all geohashes. 

In [None]:
# Build a "<hardwareId> <startGeohash>" key per row, then count how many rows
# share each key.
start_key = df['HardwareId'].astype(str) + ' ' + df['StartGeohash']
df['HStartGeohash'] = start_key
d_start = df['HStartGeohash'].value_counts()

In [None]:
# Build a "<hardwareId> <stopGeohash>" key per row, then count how many rows
# share each key.
stop_key = df['HardwareId'].astype(str) + ' ' + df['StopGeohash']
df['HStopGeohash'] = stop_key
d_stop = df['HStopGeohash'].value_counts()

In [None]:
# Total visits per "<hardwareId> <geohash>" key. A plain `+` aligns the two
# Series on their index and yields NaN for any key that appears in only one of
# them; fill_value=0 treats a missing side as zero visits instead.
d_total = d_start.add(d_stop, fill_value=0)

In [None]:
def get_dict(hardwareid, start_counts=None, stop_counts=None):
    """Return the total visit count per geohash for one vehicle.

    Keys of the count mappings have the form "<hardwareId> <geohash>". Only
    keys whose hardware id equals ``hardwareid`` exactly are used, and a
    geohash that appears both as a start and as a stop gets the sum of both
    counts.

    Args:
        hardwareid: Hardware id as a string.
        start_counts: Mapping (dict or pandas Series) from key to start count.
            Defaults to the notebook-level ``d_start``.
        stop_counts: Mapping from key to stop count. Defaults to the
            notebook-level ``d_stop``.

    Returns:
        dict mapping geohash to combined start + stop count.
    """
    if start_counts is None:
        start_counts = d_start
    if stop_counts is None:
        stop_counts = d_stop

    totals = {}
    for counts in (start_counts, stop_counts):
        for key, count in counts.items():
            parts = key.split()
            # Compare the id token exactly: a substring test would let id
            # "12" also match keys belonging to "123".
            if parts[0] != hardwareid:
                continue
            geohash = parts[1]
            # Accumulate on the geohash (the dict key), so stop counts are
            # added to start counts rather than replacing them.
            totals[geohash] = totals.get(geohash, 0) + count
    return totals
                
    

In [None]:
import operator
# For every vehicle, map each of its (at most) 50 most visited geohashes to a
# rank, where 0 is the most visited one.
rank_main = {}
for hardware_id in df.HardwareId.value_counts().index:
    hid = str(hardware_id)
    visit_counts = get_dict(hid)
    # sorted() is stable, so geohashes with equal counts keep dict order.
    by_popularity = sorted(
        visit_counts.items(), key=lambda pair: pair[1], reverse=True
    )
    rank_main[hid] = {
        geohash: position
        for position, (geohash, _count) in enumerate(by_popularity[:50])
    }

In [None]:
def get_index(x):
    """Return the rank of a "<hardwareId> <geohash>" key, or -1 if unranked.

    The rank is looked up in ``rank_main`` under the hardware id; -1 is
    returned when the geohash has no rank for that vehicle.
    """
    tokens = x.split()
    return rank_main[tokens[0]].get(tokens[1], -1)

In [None]:
# Translate every start/stop key into its per-vehicle rank via get_index.
df['StartIndex'] = df['HStartGeohash'].apply(get_index)
df['StopIndex'] = df['HStopGeohash'].apply(get_index)

In [None]:
df.head()

In [None]:
# Keep only the trips whose start and stop both received a rank (-1 = unranked).
has_start_rank = df['StartIndex'] != -1
has_stop_rank = df['StopIndex'] != -1
df = df[has_start_rank & has_stop_rank]

In [None]:
len(df)

### Step 5: Visualize the results and find patterns from the features

In [None]:
# Pick a handful of vehicles per class for the visualisation sample.
# value_counts() orders vehicles by number of rows, most frequent first, so
# index[100:105] selects the five vehicles at positions 100-104 of that order.
# NOTE(review): the names say "top_10" but the slice takes five mid-ranked
# vehicles - confirm which is intended.
top_10_hub = df[df.GeoLabel.apply(lambda x: 'Hub' in x)].HardwareId.value_counts().index[100:105]
top_10_routed = df[df.GeoLabel.apply(lambda x: 'Rou' in x)].HardwareId.value_counts().index[100:105]

In [None]:
# Restrict the data to trips made by the selected hub and routed vehicles.
is_hub_vehicle = df['HardwareId'].isin(top_10_hub)
is_routed_vehicle = df['HardwareId'].isin(top_10_routed)
df_sample = df[is_hub_vehicle | is_routed_vehicle]

In [None]:
len(df_sample)

In [None]:
!git clone https://github.com/jsiddique/facets_labeling.git

In [None]:
# Make the cloned facets_labeling checkout importable.
# Import sys directly: `from os import sys` only works because the os module
# happens to import sys internally.
import sys
sys.path.append('./facets_labeling/')


In [None]:
from facets_labeling import colab_dive

In [None]:
# Build the Facets Dive page for the sampled trips and write it to feature.html.
# NOTE(review): base64 and `results` are not used anywhere in this cell, and
# colab_dive is imported a second time here - presumably leftovers; confirm.
import base64
labels = ['Routed', 'HubSpoke']
from facets_labeling import colab_dive 
fc = colab_dive.Facets()
results = fc.create_classes(labels=labels)
# Only the columns needed for the visualisation are passed on.
fc.define_atlas(df_sample[['HardwareId', 'StartIndex', 'StopIndex', 'GeoLabel']], atlas_url="Atlas.jpg")
fc.render_html('feature.html')

In [None]:
# Read the rendered page and show it inline in the notebook.
# `display` is imported explicitly so the cell does not depend on the notebook
# injecting it as a builtin.
from IPython.display import HTML, display

# The context manager closes the file even if reading raises.
with open('feature.html', 'r') as f:
    t = f.read()
display(HTML(t))

### Find patterns in the Facets visualization
* Select color by GeoLabel
* Select StartIndex as X_axis
* Select StopIndex as Y_axis
* Adjust the bins

### Step 6: Cross Start and Stop Index

In [None]:
# Cross the two rank features into a single "<startRank> <stopRank>" string.
start_rank = df['StartIndex'].astype(str)
stop_rank = df['StopIndex'].astype(str)
df['Crossed'] = start_rank + ' ' + stop_rank

In [None]:
# Group the trips by vehicle; the per-vehicle aggregates below reuse this grouping.
g = df.groupby(['HardwareId'])

In [None]:
# Reduce each vehicle's labels to one value with the built-in min().
# Presumably every vehicle carries a single label, so min() just picks it -
# verify; with mixed string labels it would pick the lexicographically smallest.
geolabel = g.GeoLabel.apply(min)

In [None]:
# Collect, per vehicle, the list of all its crossed "<startRank> <stopRank>" strings.
df_index = g.Crossed.apply(list)

In [None]:
df_index.head()

### Visualize Some Vehicles

In [None]:
def array2D(arr1D, size=25):
    """Turn "<x> <y>" index pairs into a normalised ``size`` x ``size`` grid.

    Each pair whose two indices both lie in ``[0, size)`` adds one to cell
    ``[x][y]``; the grid is then divided by the number of counted pairs so
    that its cells sum to 1.

    Args:
        arr1D: Iterable of strings of the form "<x> <y>" with integer tokens.
        size: Side length of the square grid (default 25).

    Returns:
        A list of ``size`` lists of ``size`` floats. If no pair falls inside
        the grid (including empty input) an all-zero grid is returned instead
        of dividing by zero.
    """
    grid = [[0.0] * size for _ in range(size)]
    counted = 0
    for pair in arr1D:
        tokens = pair.split()
        x, y = int(tokens[0]), int(tokens[1])
        # The lower bound matters: a negative index would otherwise wrap
        # around and be counted in a cell at the far end of the grid.
        if 0 <= x < size and 0 <= y < size:
            grid[x][y] += 1.0
            counted += 1
    if counted == 0:
        return grid
    return [[cell / counted for cell in row] for row in grid]

In [None]:
# Convert every vehicle's list of crossed pairs into its 2-D grid.
df_2d = df_index.apply(array2D)

In [None]:
# One row per vehicle: its id, its label and its 2-D feature grid.
# geolabel and df_2d come from the same grouping, so they are combined
# positionally through .values.
df_sample = pd.DataFrame({
    'HardwareId': df_2d.index,
    'GeoLabel': geolabel.values,
    'Features': df_2d.values,
})

In [None]:
import matplotlib.pyplot as plt
img_dim_inches = 2
img_dpi = 100
def matplot_image(data, i, j):
    """Draw ``data`` as a small blue heat map and show it.

    The figure is a square of ``img_dim_inches`` inches at ``img_dpi`` dpi and
    is closed again after being shown. ``i`` and ``j`` are accepted but not
    used.
    """
    side = img_dim_inches
    fig = plt.figure(figsize=[side, side], dpi=img_dpi)
    ax = fig.add_axes([0.2, 0.2, 0.8, 0.8])

    tick_positions = (0, 10, 20)
    ax.set_xticks(list(tick_positions))
    ax.set_yticks(list(tick_positions))

    # Each axis gets its own tick colour.
    for axis_name, colour in (('x', '#24477b'), ('y', '#00aeef')):
        ax.tick_params(axis=axis_name, colors=colour)

    # White spines make the frame invisible against the white background.
    for edge in ('bottom', 'top', 'left', 'right'):
        ax.spines[edge].set_color('white')

    ax.pcolor(data, cmap=plt.cm.Blues)
    ax.set_facecolor('white')
    plt.show()
    plt.close(fig)
    
    

In [None]:
df_sample.head()

In [None]:
%matplotlib inline

In [None]:
# Vehicles labelled as hub-and-spoke.
is_hub = df_sample['GeoLabel'] == 'HubSpoke'
hub = df_sample.loc[is_hub]

In [None]:
hub.head()

In [None]:
# Plot some hub and spoke vehicles (positions 5-14 of `hub`).
# Iterating over a slice rather than indexing values[i] keeps the cell from
# raising IndexError when fewer than 15 hub-and-spoke vehicles are available.
for features in hub['Features'].values[5:15]:
    matplot_image(features, 0, 0)
    print ('Hub and Spoke')

In [None]:
# Vehicles labelled as routed.
is_routed = df_sample['GeoLabel'] == 'Routed'
routed = df_sample.loc[is_routed]

In [None]:
routed.head()

In [None]:
# Plot some routed vehicles (positions 5-14 of `routed`).
# Iterating over a slice rather than indexing values[i] keeps the cell from
# raising IndexError when fewer than 15 routed vehicles are available.
for features in routed['Features'].values[5:15]:
    matplot_image(features, 0, 0)
    print ('Routed')

In [None]:
df_sample