# Data Cleaning

In [1]:
## Import necessary packages

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.linear_model import Lasso, LassoCV, Ridge, RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load version 4 of the project dataset from disk and preview it.

data = pd.read_csv(filepath_or_buffer="Data/project_data_v4.csv")
display(data)

Unnamed: 0,id,program_name,category_name,meeting_type,program_pays_participants,program_has_scholarships,program_provides_transportation,program_provides_free_food,program_price,capacity,...,median_household_income,median_household_income_bracket,asian_prop,black_prop,hispanic_prop,white_prop,other_prop,cluster_location,simpson_diversity_index,gini_diversity_index
0,78752,BBBS Community Based Mentoring (Year Round),Academic Support,face_to_face,Not Paid,YES,YES,False,Free,2000.0,...,91907,"Greater than 90,000",0.185,0.242,0.100,0.436,0.037,West Side,0.705746,0.457548
1,78752,BBBS Community Based Mentoring (Year Round),Work + Career,face_to_face,Not Paid,YES,YES,False,Free,2000.0,...,91907,"Greater than 90,000",0.185,0.242,0.100,0.436,0.037,West Side,0.705746,0.457548
2,78749,Youth Mentoring and Academic Supports,Academic Support,face_to_face,Not Paid,NO,NO,True,Free,100.0,...,30848,"30,000 to 49,999",0.002,0.827,0.112,0.048,0.011,West Side,0.301098,0.163996
3,78749,Youth Mentoring and Academic Supports,Music & Art,face_to_face,Not Paid,NO,NO,True,Free,100.0,...,30848,"30,000 to 49,999",0.002,0.827,0.112,0.048,0.011,West Side,0.301098,0.163996
4,78745,"Strong, Smart, Bold Summer Camp - Girls Inc.",Music & Art,face_to_face,Not Paid,NO,NO,False,Free,100.0,...,35719,"30,000 to 49,999",0.146,0.651,0.054,0.106,0.044,South Side,0.538795,0.320879
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72007,148925,Medicina Scholars,Healthcare,face_to_face,Unknown_Code,NO,,,Free,,...,91907,"Greater than 90,000",0.185,0.242,0.100,0.436,0.037,West Side,0.705746,0.457548
72008,146348,Park Cleanup at Kelvyn Park,Helping Your Community,face_to_face,Not Paid,NO,NO,False,Free,,...,54355,"50,000 to 69,999",0.023,0.045,0.825,0.099,0.008,Northwest Side,0.306956,0.167507
72009,146348,Park Cleanup at Kelvyn Park,Nature,face_to_face,Not Paid,NO,NO,False,Free,,...,54355,"50,000 to 69,999",0.023,0.045,0.825,0.099,0.008,Northwest Side,0.306956,0.167507
72010,146350,Park Cleanup at Kelvyn Park,Helping Your Community,face_to_face,Not Paid,NO,NO,False,Free,,...,54355,"50,000 to 69,999",0.023,0.045,0.825,0.099,0.008,Northwest Side,0.306956,0.167507


In [3]:
# Collapse rows that differ only in category_name into a single row.

# Every column except category_name takes part in the duplicate check.
columns = [*data.columns]
columns.remove('category_name')

# Rows that match on all of those columns are duplicates: keep the first row
# of each group and renumber the index from 0.
freq_data = data.drop_duplicates(subset=columns, keep='first', ignore_index=True)

display(freq_data)

Unnamed: 0,id,program_name,category_name,meeting_type,program_pays_participants,program_has_scholarships,program_provides_transportation,program_provides_free_food,program_price,capacity,...,median_household_income,median_household_income_bracket,asian_prop,black_prop,hispanic_prop,white_prop,other_prop,cluster_location,simpson_diversity_index,gini_diversity_index
0,78752,BBBS Community Based Mentoring (Year Round),Academic Support,face_to_face,Not Paid,YES,YES,False,Free,2000.0,...,91907,"Greater than 90,000",0.185,0.242,0.100,0.436,0.037,West Side,0.705746,0.457548
1,78749,Youth Mentoring and Academic Supports,Academic Support,face_to_face,Not Paid,NO,NO,True,Free,100.0,...,30848,"30,000 to 49,999",0.002,0.827,0.112,0.048,0.011,West Side,0.301098,0.163996
2,78745,"Strong, Smart, Bold Summer Camp - Girls Inc.",Music & Art,face_to_face,Not Paid,NO,NO,False,Free,100.0,...,35719,"30,000 to 49,999",0.146,0.651,0.054,0.106,0.044,South Side,0.538795,0.320879
3,78745,"Strong, Smart, Bold Summer Camp - Girls Inc.",Music & Art,face_to_face,Not Paid,NO,NO,False,Free,100.0,...,39030,"30,000 to 49,999",0.007,0.896,0.033,0.041,0.023,South Side,0.193836,0.102134
4,78745,"Strong, Smart, Bold Summer Camp - Girls Inc.",Music & Art,face_to_face,Not Paid,NO,NO,False,Free,100.0,...,52082,"50,000 to 69,999",0.060,0.660,0.022,0.209,0.048,South Side,0.514331,0.303100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67494,146356,IT Adult Apprenticeship | REACH for IT,Computers,face_to_face,"Paid, Type Unknown",NO,,,Free,,...,91907,"Greater than 90,000",0.185,0.242,0.100,0.436,0.037,West Side,0.705746,0.457548
67495,146345,Park Cleanup at Kelvyn Park,Helping Your Community,face_to_face,Not Paid,NO,NO,False,Free,,...,54355,"50,000 to 69,999",0.023,0.045,0.825,0.099,0.008,Northwest Side,0.306956,0.167507
67496,148925,Medicina Scholars,Healthcare,face_to_face,Unknown_Code,NO,,,Free,,...,91907,"Greater than 90,000",0.185,0.242,0.100,0.436,0.037,West Side,0.705746,0.457548
67497,146348,Park Cleanup at Kelvyn Park,Helping Your Community,face_to_face,Not Paid,NO,NO,False,Free,,...,54355,"50,000 to 69,999",0.023,0.045,0.825,0.099,0.008,Northwest Side,0.306956,0.167507


In [4]:
# Show the rows whose minimum age exceeds 25, then keep only the rows whose
# minimum age is 25 or below.
# Rows with a missing min_age fail both comparisons, so they are neither
# shown nor kept.
# NOTE(review): both filters read from `data`, not the de-duplicated
# `freq_data` -- confirm that is intended.

display(data.loc[data['min_age'].gt(25)])

under_25_data = data.loc[data['min_age'].le(25)]

Unnamed: 0,id,program_name,category_name,meeting_type,program_pays_participants,program_has_scholarships,program_provides_transportation,program_provides_free_food,program_price,capacity,...,median_household_income,median_household_income_bracket,asian_prop,black_prop,hispanic_prop,white_prop,other_prop,cluster_location,simpson_diversity_index,gini_diversity_index
68,77506,Tennis - Senior Round Robin at McFetridge,Sports + Wellness,face_to_face,Not Paid,NO,NO,False,$50 or Less,,...,71513,"70,000 to 89,999",0.086,0.027,0.412,0.440,0.035,Northwest Side,0.627306,0.389513
1973,77507,Tennis - Senior Round Robin at McFetridge,Sports + Wellness,face_to_face,Not Paid,NO,NO,False,$50 or Less,,...,71513,"70,000 to 89,999",0.086,0.027,0.412,0.440,0.035,Northwest Side,0.627306,0.389513
2118,77508,Tennis - Senior Round Robin at McFetridge,Sports + Wellness,face_to_face,Not Paid,NO,NO,False,$50 or Less,,...,71513,"70,000 to 89,999",0.086,0.027,0.412,0.440,0.035,Northwest Side,0.627306,0.389513
2122,77511,Tennis - Senior Round Robin at McFetridge,Sports + Wellness,face_to_face,Not Paid,NO,NO,False,$50 or Less,,...,71513,"70,000 to 89,999",0.086,0.027,0.412,0.440,0.035,Northwest Side,0.627306,0.389513
2124,77509,Tennis - Senior Round Robin at McFetridge,Sports + Wellness,face_to_face,Not Paid,NO,NO,False,$50 or Less,,...,71513,"70,000 to 89,999",0.086,0.027,0.412,0.440,0.035,Northwest Side,0.627306,0.389513
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25059,105802,Medicare 101,Managing Money,face_to_face,Not Paid,NO,NO,False,Free,99.0,...,88578,"70,000 to 89,999",0.044,0.007,0.155,0.769,0.026,Far North Side,0.381953,0.213840
38618,121945,"Power of Attorney Workshop: For Property, Heal...",Managing Money,face_to_face,Not Paid,NO,NO,False,Free,,...,60301,"50,000 to 69,999",0.263,0.022,0.187,0.487,0.040,Far North Side,0.656609,0.414004
45630,134633,Retired and Senior Volunteer Program (RSVP),Helping Your Community,face_to_face,Unknown_Code,NO,,,Free,,...,57095,"50,000 to 69,999",0.001,0.934,0.044,0.009,0.012,Far Southeast Side,0.125482,0.064843
45631,134634,Retired and Senior Volunteer Program (RSVP),Helping Your Community,face_to_face,Unknown_Code,NO,,,Free,,...,57095,"50,000 to 69,999",0.001,0.934,0.044,0.009,0.012,Far Southeast Side,0.125482,0.064843


In [5]:
# Choose the feature matrix X and the target y, split them into train/test
# sets, and standardize the features.

# TODO: `...` is an unfilled placeholder. Indexing the DataFrame with Ellipsis
# raises `KeyError: Ellipsis`; replace each `...` with the real column names
# (a list of feature columns for X, the target column for y).
X = data[...]
y = data[...]

sc = StandardScaler()

# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

# Fit the scaler on the training rows only, then apply that same fitted
# scaler to both splits.
sc.fit(X_train)

X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)

KeyError: Ellipsis

In [None]:
# Fit Lasso and Ridge regressions to see which features carry weight when
# predicting y.
#
# Both models penalize coefficient size, so they are fit on the standardized
# features (X_train_scaled) rather than the raw X_train; on unscaled data the
# penalty would depend on each column's units.
# (LassoCV / RidgeCV come from sklearn.linear_model.)

# Lasso regression: choose alpha by 5-fold cross-validation
lasso = LassoCV(cv=5)
lasso.fit(X_train_scaled, y_train)
alpha = lasso.alpha_

print(f"Optimal alpha: {alpha:.4f}")

# Refit a plain Lasso at the selected alpha
lasso_model = Lasso(alpha=alpha)
lasso_model.fit(X_train_scaled, y_train)

# Column positions in X whose coefficient was not shrunk to exactly zero
selected_features = np.where(lasso_model.coef_ != 0)[0]
print(f"Selected features: {selected_features}")


# Ridge regression: choose alpha by 5-fold cross-validation over RidgeCV's
# default candidate alphas
ridge = RidgeCV(cv=5)
ridge.fit(X_train_scaled, y_train)
alpha = ridge.alpha_

print(f"Optimal alpha: {alpha:.4f}")

# Refit a plain Ridge at the selected alpha
ridge_model = Ridge(alpha=alpha)
ridge_model.fit(X_train_scaled, y_train)

# NOTE: ridge shrinks coefficients but does not generally set them to exactly
# zero, so this usually lists every feature; compare coefficient magnitudes
# if a ranking is needed.
selected_features = np.where(ridge_model.coef_ != 0)[0]
print(f"Selected features: {selected_features}")