# In [None]:
# This script takes previously generated datasets (output_1) and expands them
# by additional IVs (business_proximity). For the calculation of
# business_proximity, the category 'restaurants' is used intentionally, as
# every restaurant poses a competitor/threat in itself to other restaurants.

import pandas as pd
import numpy as np
from math import radians, cos, sin, asin, sqrt
from sklearn.metrics.pairwise import haversine_distances
from itertools import combinations
from multiprocessing import Pool
import multiprocessing as mp
import os
import time
# Wall-clock start, used for the runtime report printed at the end of the script.
start_time = time.time()
# Set the working directory to where this Python file is located so that the
# relative input/output paths used by the script resolve next to it.
try:
    wd = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # __file__ is not defined when the code runs in an interactive session or
    # a notebook cell; keep the current working directory in that case.
    wd = os.getcwd()
os.chdir(wd)
def haversine_vectorized(coords):
    """Return the pairwise great-circle distance matrix of ``coords`` in km.

    ``coords`` is handed unchanged to scikit-learn's ``haversine_distances``,
    which expects one ``[latitude, longitude]`` row per point, in radians, and
    returns distances on the unit sphere. The result is scaled by the Earth
    radius (6371 km) to express it in kilometres.
    """
    earth_radius_km = 6371
    unit_sphere_distances = haversine_distances(coords)
    return unit_sphere_distances * earth_radius_km
def compute_category_intersections(group):
    """Count the categories shared by every pair of businesses in ``group``.

    Args:
        group: DataFrame with a ``categories`` column holding category names
            separated by ``', '`` (e.g. ``"Pizza, Italian"``). The index
            labels identify the businesses and are assumed to be unique.
            Missing category values (``None``/NaN) count as "no categories".

    Returns:
        dict mapping both ``(i, j)`` and ``(j, i)`` index-label pairs to the
        number of categories the two businesses have in common. A business is
        never paired with itself.
    """
    # Parse every category string once up front instead of once per pair.
    category_sets = {}
    for label, raw_categories in group['categories'].items():
        if isinstance(raw_categories, str):
            category_sets[label] = set(raw_categories.split(', '))
        else:
            # Non-string values (NaN/None) cannot be split; treat as empty.
            category_sets[label] = set()

    category_intersections = {}
    for i, j in combinations(group.index, 2):
        intersection = len(category_sets[i] & category_sets[j])
        # Store both orientations so lookups do not depend on pair order.
        category_intersections[(i, j)] = intersection
        category_intersections[(j, i)] = intersection
        print(f"Intersection between {i} and {j}: {intersection} categories")
    return category_intersections
def compute_proximity_within_group(group, distances, category_intersections):
    """Compute the business-proximity score of every business in ``group``.

    For business ``i`` the score is the sum, over every other business ``j``
    in the group with a non-zero distance to ``i``, of
    ``(BCs / BC) / distance`` where ``BCs`` is the number of categories shared
    by ``i`` and ``j``, ``BC`` is the number of categories of ``i`` and
    ``distance`` is ``distances[pos_i, pos_j]``.

    Args:
        group: DataFrame with a ``categories`` column (names separated by
            ``', '``); index labels are assumed to be unique.
        distances: 2-D array of pairwise distances whose rows/columns follow
            the row order of ``group``.
        category_intersections: dict mapping ``(i, j)`` label pairs to their
            shared-category count; missing pairs count as 0.

    Returns:
        dict mapping each index label to its summed proximity. A business
        without any (parsable) categories gets ``0.0``.
    """
    # Positions in `distances` follow the row order of `group`, so enumerate
    # the labels once instead of calling `get_loc` for every pair.
    labels = list(group.index)
    proximities = {}
    for row, i in enumerate(labels):
        proximity_sum = 0.0
        raw_categories = group.at[i, 'categories']
        # Missing values (NaN/None) cannot be split; treat as no categories.
        if isinstance(raw_categories, str):
            categories_x = set(raw_categories.split(', '))
        else:
            categories_x = set()
        BC = len(categories_x)
        print(f"\nCalculating proximity for business {i}, Total categories (BC): {BC}")
        # With BC == 0 the share BCs / BC is undefined; the score stays 0.0.
        if BC > 0:
            for col, j in enumerate(labels):
                if i == j:
                    continue
                distance = distances[row, col]
                # Businesses at the exact same location are skipped to avoid
                # dividing by zero.
                if distance == 0:
                    continue
                BCs = category_intersections.get((i, j), 0)
                proximity_value = (BCs / BC) / distance
                proximity_sum += proximity_value
                print(f" Compared with business {j}: Shared categories (BCs): {BCs}, Distance: {distance:.2f} km, Proximity: {proximity_value:.5f}")
        proximities[i] = proximity_sum
        print(f"Total proximity for business {i}: {proximity_sum:.5f}")
    return proximities
def optimized_proximity_computation(df, grouping_column):
    """Add a ``business_proximity`` column computed within each group of ``df``.

    The frame is grouped by ``grouping_column`` (e.g. city or postal code).
    Within every group the pairwise haversine distances (km) and the pairwise
    shared-category counts are computed, and from them the proximity score of
    each business relative to the other businesses of the same group.

    Args:
        df: DataFrame with ``latitude`` and ``longitude`` columns (degrees),
            a ``categories`` column and the grouping column. The index is
            assumed to be unique.
        grouping_column: name of the column to group by.

    Returns:
        The same ``df`` object, modified in place by adding the
        ``business_proximity`` column. Rows that end up in no group keep NaN
        (NOTE(review): pandas ``groupby`` drops rows with a missing key by
        default -- confirm this is intended).
    """
    # Group dataframe by city or postal code.
    grouped = df.groupby(grouping_column)
    # Pre-allocated with NaN and aligned to df's index; filled group by group.
    proximity_series = pd.Series(index=df.index, dtype=float)
    for name, group in grouped:
        print(f"\nComputing proximites for group: {name}")
        # Convert latitude and longitude to radians for the group.
        coords = group[['latitude', 'longitude']].apply(np.radians).to_numpy()
        distances = haversine_vectorized(coords)
        category_intersections = compute_category_intersections(group)
        # Compute proximity within the group.
        group_proximities = compute_proximity_within_group(group, distances, category_intersections)
        for index, proximity in group_proximities.items():
            proximity_series.at[index] = proximity
            print(f"Assigned proximity {proximity:.5f} to business {index}")
    # Assign computed proximities back to df.
    df['business_proximity'] = proximity_series
    return df
def process_state_subset(args):
    """Compute proximities for the rows of a single state.

    ``args`` is a ``(state, subset, grouping_column)`` tuple, packed into one
    argument so the function can be submitted to a worker pool as-is. Returns
    whatever ``optimized_proximity_computation`` returns for the subset.
    """
    state, state_rows, group_by = args
    print(f"Processing state: {state}")
    proximity_frame = optimized_proximity_computation(state_rows, group_by)
    return proximity_frame
def run_proximity_computation_with_mp(df, grouping_column, timeout=300):
    """Compute business proximities state by state in a process pool.

    ``df`` is split by its ``state`` column; every subset is copied and sent
    to a worker running ``process_state_subset``. The per-state results are
    concatenated in the order the states were submitted.

    Args:
        df: DataFrame with a ``state`` column plus the columns required by
            ``optimized_proximity_computation``.
        grouping_column: column used for grouping inside each state.
        timeout: seconds to wait for each state's result once it is being
            collected (default 300, the previously hard-coded value).

    Returns:
        Concatenation of all per-state result frames, or an empty DataFrame
        if no state produced a result. States whose result did not arrive
        within ``timeout`` are reported on stdout and left out.
    """
    # One (state, subset copy, grouping_column) argument tuple per state;
    # every subset is copied, so memory use grows with the size of df.
    pool_args = [
        (state, subset.copy(), grouping_column)
        for state, subset in df.groupby('state')
    ]
    results = []
    # Process every subset in parallel; leaving the `with` block terminates
    # the pool, including workers that are still busy after a timeout.
    with Pool(mp.cpu_count()) as pool:
        pending = [
            (arg[0], pool.apply_async(process_state_subset, args=(arg,)))
            for arg in pool_args
        ]
        for state, async_result in pending:
            try:
                results.append(async_result.get(timeout=timeout))
            except mp.TimeoutError:
                # Name the state so the missing rows can be traced.
                print(f"Processing of state {state} timed out after {timeout} seconds; its rows are missing from the result")
    # Combine results from each subset.
    combined_results = pd.concat(results) if results else pd.DataFrame()
    return combined_results
# --- Load the input data and compute the raw proximity per postal code ---
input_path_main = 'output_1.csv'
df = new_df = optimized_proximity_computation(pd.read_csv(input_path_main), 'postal_code')

# --- Cap outliers at the 99th percentile ---
business_proximity_99_percentile = df['business_proximity'].quantile(0.99)
print(business_proximity_99_percentile)
df['capped_business_proximity'] = df['business_proximity'].clip(upper=business_proximity_99_percentile)

# --- Scale the capped values with scikit-learn's RobustScaler ---
from sklearn.preprocessing import RobustScaler
import matplotlib.pyplot as plt
import seaborn as sns

scaler = RobustScaler()
# The scaler requires a 2D array, hence the single-column reshape.
capped_proximity_array = df['capped_business_proximity'].to_numpy().reshape(-1, 1)
scaled_capped_proximity = scaler.fit_transform(capped_proximity_array)
df['scaled_capped_business_proximity'] = scaled_capped_proximity.ravel()

# --- Histograms: one file per variant, then all three side by side ---
figsize = (16, 9)
# (column, plot title, bar colour or None for the default, output file)
histogram_specs = [
    ('business_proximity', 'Original Business Proximity', None,
     'original_business_proximity.png'),
    ('capped_business_proximity', 'Capped Business Proximity', 'orange',
     'capped_business_proximity.png'),
    ('scaled_capped_business_proximity', 'Scaled Capped Business Proximity', 'green',
     'scaled_capped_business_proximity.png'),
]

for column, title, color, filename in histogram_specs:
    plt.figure(figsize=figsize)
    sns.histplot(df[column], kde=True, color=color)
    plt.title(title)
    plt.savefig(filename, dpi=300)

# Compiled figure with the three histograms next to each other for comparison.
plt.figure(figsize=(24, 8))
for position, (column, title, color, _) in enumerate(histogram_specs, start=1):
    plt.subplot(1, 3, position)
    sns.histplot(df[column], kde=True, color=color)
    plt.title(title)
plt.tight_layout()
plt.savefig('business_proximity_analysis.png', dpi=300)

# Show all plots.
plt.show()

# --- Export the extended dataset as Excel and CSV ---
with pd.ExcelWriter('output_2.xlsx') as writer:
    df.to_excel(writer, sheet_name='sheet_1', index=False)
df.to_csv('output_2.csv', index=False)

print('--- Runtime: %s seconds ---' % (time.time() - start_time))