In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import skimpy
import string
import re
from skimpy import clean_columns

In [None]:
# Earth radius in kilometres; multiplies great-circle angles (radians) into km.
_EARTH_RADIUS_KM = 6371

# Translation table that deletes every ASCII punctuation character. Built once
# here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _normalize_category(value):
    """Normalise one category name; anything that is not a ``str`` is returned as-is.

    Steps, in order: lowercase, delete punctuation, collapse each run of
    whitespace into a single underscore, strip trailing underscores.
    """
    if not isinstance(value, str):
        return value
    cleaned = value.lower().translate(_PUNCTUATION_TABLE)
    cleaned = re.sub(r'\s+', '_', cleaned)
    return cleaned.rstrip('_')


def _nearest_distance_km(points_deg, targets_deg, chunk_size=512):
    """Return, for each point, the haversine distance in km to its nearest target.

    Parameters
    ----------
    points_deg, targets_deg : array-like of shape (n, 2) and (m, 2)
        ``[latitude, longitude]`` pairs in degrees.
    chunk_size : int
        Number of points processed per step, so the full n x m distance
        matrix is never held in memory at once.

    Returns
    -------
    numpy.ndarray of shape (n,)
        Minimum distance per point. Targets with a missing coordinate are
        ignored; the result is NaN for points with a missing coordinate and
        for every point when no valid target exists.
    """
    points = np.radians(np.asarray(points_deg, dtype=float).reshape(-1, 2))
    targets = np.radians(np.asarray(targets_deg, dtype=float).reshape(-1, 2))
    # Drop unusable targets up front so a plain min() can be used below.
    targets = targets[~np.isnan(targets).any(axis=1)]

    nearest = np.full(len(points), np.nan)
    if len(points) == 0 or len(targets) == 0:
        return nearest

    target_lat = targets[:, 0][np.newaxis, :]
    target_lon = targets[:, 1][np.newaxis, :]
    cos_target_lat = np.cos(target_lat)

    for start in range(0, len(points), chunk_size):
        block = points[start:start + chunk_size]
        lat = block[:, 0][:, np.newaxis]
        lon = block[:, 1][:, np.newaxis]
        # Haversine formula; broadcasting yields a (chunk, m) matrix.
        hav = (
            np.sin((target_lat - lat) / 2) ** 2
            + np.cos(lat) * cos_target_lat * np.sin((target_lon - lon) / 2) ** 2
        )
        # Clip guards against rounding pushing the value just outside [0, 1].
        angles = 2 * np.arcsin(np.sqrt(np.clip(hav, 0, 1)))
        nearest[start:start + chunk_size] = angles.min(axis=1)

    return nearest * _EARTH_RADIUS_KM


def clean_data(data, bus_data):
    """Clean the program table and add the distance to the nearest bus stop.

    Parameters
    ----------
    data : pandas.DataFrame
        Program records. After ``clean_columns`` has normalised the column
        names, the frame must provide ``min_age``, ``max_age``,
        ``category_name``, ``state``, ``city``, ``latitude`` and ``longitude``.
    bus_data : pandas.DataFrame
        Bus stop records with ``latitude`` and ``longitude`` columns (degrees).

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) restricted to rows with
        ``min_age < 25`` and ``city == "Chicago"``, with ``category_name``
        normalised, ``"Illinois"`` replaced by ``"IL"`` in ``state``, and two
        added columns: ``age_range`` (``max_age - min_age``) and
        ``km_to_bus_stop`` (haversine distance to the closest bus stop).
    """
    # Clean column names
    data = clean_columns(data)

    # Keep only Chicago programs whose minimum age is below 25.
    # NOTE(review): the earlier comment said "remove programs where min age > 25",
    # but the code has always used a strict `< 25` (so exactly 25 is dropped too).
    # The code's behaviour is kept here -- confirm which one is intended.
    keep = (data["min_age"] < 25) & (data["city"] == "Chicago")
    # Explicit copy: the assignments below must not write into a slice of the
    # caller's frame (avoids SettingWithCopyWarning / chained assignment).
    data = data.loc[keep].copy()

    # Clean category names (program types)
    data["category_name"] = data["category_name"].map(_normalize_category)

    # Clean state
    data["state"] = data["state"].replace("Illinois", "IL")

    # Add age range col
    data["age_range"] = data["max_age"] - data["min_age"]

    # Add distance to nearest bus stop col
    data["km_to_bus_stop"] = _nearest_distance_km(
        data[["latitude", "longitude"]],
        bus_data[["latitude", "longitude"]],
    )

    return data