# Feature Engineering


Author: Jasmine Qin  
Date: 2020-06-12

In [1]:
# Basics
import pandas as pd
import geopandas as gpd
import numpy as np
import seaborn as sns
import time
import re
import json
from collections import defaultdict, Counter

from shapely.ops import nearest_points
from shapely.geometry import Point
import shapely.speedups

Using TensorFlow backend.


In [2]:
# Notebook-wide display settings: show up to 100 columns and 100 rows
# when a DataFrame is rendered.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 100)

# Turn on shapely's speedups for the geometry work below.
shapely.speedups.enable()

In [3]:
# Load the combined train / validation tables.
# low_memory=False makes pandas read each file in one pass instead of
# in chunks.
combined_train = pd.read_csv(
    '../../data/processed/04_combined_train.csv',
    low_memory=False,
)
combined_validation = pd.read_csv(
    '../../data/processed/04_combined_validate.csv',
    low_memory=False,
)

In [4]:
def fill_geom(df):
    """Back-fill missing ``Geom`` values from other rows of the same business.

    For every ``business_id`` that has at least one row with a missing
    ``Geom`` and at least one row with a known ``Geom``, *all* rows of
    that business are set to the first known ``Geom`` (in row order).
    Businesses without any known ``Geom``, and businesses without any
    missing ``Geom``, are left untouched.

    (Original author's note: this recovers around 1000 geoms in the
    train set.)

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``business_id`` and ``Geom``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; its ``Geom`` column is
        updated in place.

    Notes
    -----
    Rows whose ``business_id`` is missing are neither used as donors nor
    filled: a missing id cannot be matched to any other row.
    """
    geom_missing = df['Geom'].isnull()

    # Business ids that have at least one row without a Geom.
    ids_needing_fill = df.loc[geom_missing, 'business_id'].dropna().unique()

    # Rows of those businesses that do carry a Geom; the first one per
    # business is the value every row of that business receives.
    donors = df.loc[
        ~geom_missing & df['business_id'].isin(ids_needing_fill),
        ['business_id', 'Geom'],
    ]
    first_geom = (donors
                  .drop_duplicates('business_id')
                  .set_index('business_id')['Geom'])

    if first_geom.empty:
        return df

    # One lookup for the whole column instead of one full-frame scan per
    # business id.
    replacement = df['business_id'].map(first_geom)
    fillable = replacement.notnull()

    # .values sidesteps index alignment, so a non-unique index is fine.
    df.loc[fillable, 'Geom'] = replacement[fillable].values

    return df


In [5]:
def chain(df):
    """Add a ``chain`` column: at how many locations a business operates.

    A business is identified by the pair (``BusinessName``,
    ``BusinessIndustry``).  Both are used because some business names
    are owner's names, so the same name can belong to completely
    different businesses.

    Each ``business_id`` counts once towards its pair, using the name
    and industry of the first named row of that id.  Ids are aggregated
    by location, so e.g. Starbucks at different locations shares a name
    but has different ids.

    The count is taken over the entire dataframe rather than per year,
    in order to capture a business that was gone for a couple of years
    and came back at a different location later on.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``business_id``, ``BusinessName`` and
        ``BusinessIndustry``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the extra column ``chain``.  Rows without
        a ``BusinessName`` get NaN.  A named row whose pair was never
        counted gets 0.

    Notes
    -----
    Rows with a missing ``business_id`` do not contribute to the counts
    (they cannot be attributed to a location) but still receive a
    ``chain`` value looked up from their name and industry.
    """

    def _name_industry_keys(frame):
        # NaN != NaN, so a missing industry is normalised to None to
        # make equal (name, industry) pairs compare and hash equally.
        industries = [None if pd.isnull(industry) else industry
                      for industry in frame['BusinessIndustry']]
        return list(zip(frame['BusinessName'], industries))

    # Count businesses by name, so rows without a name are left out.
    named = df[df.BusinessName.notnull()]

    # One representative row (the first) per business_id / location.
    one_row_per_location = (named
                            .dropna(subset=['business_id'])
                            .drop_duplicates('business_id'))
    location_counts = Counter(_name_industry_keys(one_row_per_location))

    result = df.copy()

    # Counter returns 0 for an unseen key, so no try/except is needed.
    result['chain'] = [
        np.nan if pd.isnull(name) else location_counts[(name, industry)]
        for name, industry in _name_industry_keys(result)
    ]

    return result


In [6]:
def history(df, min_years=5):
    """Add a ``history`` column flagging records of established businesses.

    Within each ``business_id`` the records are numbered in row order.
    The first ``min_years`` records of a business get 0.0 and every
    later record gets 1.0.  A business with at most ``min_years``
    records is therefore 0.0 throughout.

    The flag is per record, not per business.  The record count stands
    in for years of operation, which presumably relies on one row per
    business per year, sorted chronologically - verify against the
    caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column ``business_id``.
    min_years : int, default 5
        Number of leading records per business that are flagged 0.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the extra float column ``history``.
        Rows with a missing ``business_id`` always get 0.0.
    """
    result = df.copy()

    # 0-based position of each record within its business.
    records_so_far = result.groupby('business_id').cumcount()

    established = ((records_so_far >= min_years)
                   & result['business_id'].notnull())

    # .values sidesteps index alignment, so a non-unique index is fine.
    result['history'] = established.astype(float).values

    return result

## Nearby business - train set

In [9]:
# Keep only the columns needed for the nearby-business feature, and only
# rows that have a geometry.  .copy() makes this an independent frame so
# the column assignment below does not write into a slice of
# combined_train.
geom_train = combined_train.loc[
    combined_train.Geom.notnull(),
    ['business_id', 'FOLDERYEAR', 'BusinessType', 'BusinessIndustry', 'Geom']
].copy()

# Geom holds a JSON string with a 'coordinates' list; parse each string
# once.  The first two entries are passed to Point as (x, y) in that
# order.
train_coords = geom_train.Geom.apply(lambda p: json.loads(p)['coordinates'])
geom_train['geometry'] = [Point(xy[0], xy[1]) for xy in train_coords]
geom_train = geom_train.drop(columns=['Geom'])

# NOTE(review): the {'init': ...} CRS form is reported as deprecated by
# recent pyproj/geopandas releases ('EPSG:4326' is the newer spelling) -
# confirm against the installed versions before changing it.
geom_train_gpd = gpd.GeoDataFrame(geom_train, crs={'init': 'epsg:4326'}, geometry='geometry')

In [11]:
# Main Function
def get_nearest_values(row, other_gdf):
    """Count the records of ``other_gdf`` located at the point nearest to ``row``.

    The geometries of ``other_gdf`` are merged into a single geometry.
    The point of that geometry closest to ``row["geometry"]`` is looked
    up, and the number of records in ``other_gdf`` whose geometry equals
    that point is returned.

    When ``row`` is itself one of the records in ``other_gdf`` (as in
    the callers in this notebook), the nearest point is the row's own
    location.  The result is then the number of records sharing the
    row's exact coordinates, the row itself included.

    Parameters
    ----------
    row : mapping / pandas.Series
        Must provide a shapely geometry under the key ``"geometry"``.
    other_gdf : geopandas.GeoDataFrame
        Candidate records; must have a ``"geometry"`` column.

    Returns
    -------
    int
        Number of matching records; 0 when ``other_gdf`` has no rows.
    """
    # Nothing to be near to: avoid handing an empty geometry to
    # nearest_points.
    if len(other_gdf) == 0:
        return 0

    # Union of the other GeoDataFrame's geometries.
    candidate_points = other_gdf["geometry"].unary_union

    # nearest_points returns a pair: (point on row's geometry, point on
    # the union).  The second element is the nearest candidate location.
    nearest_pair = nearest_points(row["geometry"], candidate_points)
    nearest_location = nearest_pair[1]

    matches = other_gdf.loc[other_gdf["geometry"] == nearest_location]

    return len(matches)

In [14]:
def count_nearby_for_year(gdf, year):
    """Compute the nearby-business count of every business for one year.

    The rows of ``gdf`` with ``FOLDERYEAR == year`` are split by
    ``BusinessType``.  Within each type, ``get_nearest_values`` is
    called for every row, with the rows of that same year and type as
    candidates.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Must contain ``business_id``, ``FOLDERYEAR``, ``BusinessType``
        and ``geometry``.
    year : int
        The ``FOLDERYEAR`` to process.

    Returns
    -------
    collections.defaultdict
        ``{year: {business_id: count}}``.  It holds a single outer key,
        or is empty when no count could be computed for that year.  If
        a ``business_id`` appears under several business types in the
        year, the value from the type processed last is kept.
    """
    counts = defaultdict(dict)

    year_rows = gdf[gdf.FOLDERYEAR == year]
    if year_rows.empty:
        return counts

    # Use the year value as stored in the data for the key.
    year_key = year_rows.FOLDERYEAR.iloc[0]

    for business_type in year_rows.BusinessType.unique():
        same_type = year_rows[year_rows.BusinessType == business_type]
        for _, row in same_type.iterrows():
            counts[year_key][row['business_id']] = get_nearest_values(
                row, other_gdf=same_type)

    return counts


# Years covered by the train set; one loop replaces one cell per year.
TRAIN_YEARS = range(1997, 2021)

count_dicts_by_year = {
    year: count_nearby_for_year(geom_train_gpd, year)
    for year in TRAIN_YEARS
}

# Backward compatibility: this section used to define one variable per
# year (count_dict_1997 ... count_dict_2020).  They are kept so that any
# cell still referring to them keeps working; prefer
# count_dicts_by_year[year] in new code.
for year, year_counts in count_dicts_by_year.items():
    globals()['count_dict_{}'.format(year)] = year_counts

# Merge the per-year results in chronological order:
# {year: {business_id: count}}
nearby_counts_by_year = {}
for year in TRAIN_YEARS:
    nearby_counts_by_year.update(count_dicts_by_year[year])

# Flatten to {(year, business_id): count}.  The nested dict keeps its own
# name, so re-running this cell is safe.
dict_all_years = {
    (year_key, business_id): count
    for year_key, per_business in nearby_counts_by_year.items()
    for business_id, count in per_business.items()
}

nearby_business_train = pd.DataFrame(
    [(year_key, business_id, count)
     for (year_key, business_id), count in dict_all_years.items()],
    columns=['FOLDERYEAR', 'business_id', 'nearest_business_count'])
nearby_business_train.to_csv("nearby_business_train.csv", index=False)


## Nearby business - validation set

In [101]:
# Keep only the columns needed for the nearby-business feature, and only
# rows that have a geometry.  .copy() makes this an independent frame so
# the column assignment below does not write into a slice of
# combined_validation.
geom_valid = combined_validation.loc[
    combined_validation.Geom.notnull(),
    ['business_id', 'FOLDERYEAR', 'BusinessType', 'BusinessIndustry', 'Geom']
].copy()

# Geom holds a JSON string with a 'coordinates' list; parse each string
# once.  The first two entries are passed to Point as (x, y) in that
# order.
valid_coords = geom_valid.Geom.apply(lambda p: json.loads(p)['coordinates'])
geom_valid['geometry'] = [Point(xy[0], xy[1]) for xy in valid_coords]
geom_valid = geom_valid.drop(columns=['Geom'])

# NOTE(review): the {'init': ...} CRS form is reported as deprecated by
# recent pyproj/geopandas releases ('EPSG:4326' is the newer spelling) -
# confirm against the installed versions before changing it.
geom_valid_gpd = gpd.GeoDataFrame(geom_valid, crs={'init': 'epsg:4326'}, geometry='geometry')

In [102]:
# Nearby-business count for the validation set:
# {year: {business_id: count}}.  Within each year the rows are split by
# BusinessType and every row is compared against the rows of the same
# year and type.
valid_counts_by_year = defaultdict(dict)
valid_business_types = geom_valid_gpd.BusinessType.unique()

for year in geom_valid_gpd.FOLDERYEAR.unique():
    # Filter the year once instead of once per business type.
    year_rows = geom_valid_gpd[geom_valid_gpd.FOLDERYEAR == year]
    for business_type in valid_business_types:
        same_type = year_rows[year_rows.BusinessType == business_type]
        for _, row in same_type.iterrows():
            valid_counts_by_year[year][row['business_id']] = get_nearest_values(
                row, other_gdf=same_type)

# Flatten to {(year, business_id): count}.  The nested dict keeps its own
# name, so re-running this cell is safe.
count_dict_valid = {
    (year, business_id): count
    for year, per_business in valid_counts_by_year.items()
    for business_id, count in per_business.items()
}

nearby_business_valid = pd.DataFrame(
    [(year, business_id, count)
     for (year, business_id), count in count_dict_valid.items()],
    columns=['FOLDERYEAR', 'business_id', 'nearest_business_count'])
nearby_business_valid.to_csv("nearby_business_valid.csv", index=False)