In [102]:
import pandas as pd

In [103]:
contributions = pd.read_csv('individual_contributions.csv', sep='|', header=None,
                            usecols=[0, 3, 10, 13, 14, 20],
                            names=['CMTE_ID',
                                  'TRANSACTION_PGI',
                                  'ZIP_CODE',
                                  'TRANSACTION_DT',
                                  'TRANSACTION_AMT',
                                  'SUB_ID'],
                            dtype={"CMTE_ID": str, 
                                   "TRANSACTION_PGI": str,
                                   "ZIP_CODE": str,
                                  'TRANSACTION_DT': str,
                                   "TRANSACTION_AMT": float, 
                                   "SUB_ID": str
                                  }
                           )

In [104]:
contributions['zipcode_5'] = contributions['ZIP_CODE'].str[0:5]
contributions['ZIP_CODE'] = pd.to_numeric(contributions['ZIP_CODE'], errors='coerce')

In [105]:
contributions.dropna(subset=['CMTE_ID', 'ZIP_CODE','TRANSACTION_AMT','SUB_ID','zipcode_5'],inplace=True)
contributions = contributions.loc[contributions['TRANSACTION_AMT'] > 0]

In [107]:
# Illinois Exclusive

contributions = contributions.loc[(contributions['ZIP_CODE'] > 600000000) & (contributions['ZIP_CODE'] < 630000000)]

In [108]:
contributions.head()

Unnamed: 0,CMTE_ID,TRANSACTION_PGI,ZIP_CODE,TRANSACTION_DT,TRANSACTION_AMT,SUB_ID,zipcode_5
649,C00235739,P,603021617.0,2052015,208.0,4032020151240886655,60302
650,C00235739,P,603021617.0,2202015,208.0,4032020151240886656,60302
673,C00235739,P,601262235.0,2202015,270.0,4032020151240886522,60126
758,C00235739,,604481479.0,2052015,139.0,4032020151240886209,60448
759,C00235739,,604481479.0,2202015,145.0,4032020151240886210,60448


In [109]:
contributions.to_csv('contributions.csv', index=False)

In [110]:
grouped = contributions.groupby(['zipcode_5'])

In [111]:
zip_summary_df = pd.DataFrame()
zip_summary_df['donations_sum'] = grouped['TRANSACTION_AMT'].sum()
zip_summary_df['donations_median'] = grouped['TRANSACTION_AMT'].median()
zip_summary_df['donations_count'] = grouped['SUB_ID'].count()
zip_summary_df.reset_index(inplace=True)

In [112]:
zip_summary_df.head()

Unnamed: 0,zipcode_5,donations_sum,donations_median,donations_count
0,60002,49741.0,30.0,777
1,60004,321414.0,50.0,2693
2,60005,305335.0,25.0,1736
3,60006,1001.0,251.0,3
4,60007,165900.0,35.0,1222


In [113]:
zip_summary_df.dtypes

zipcode_5            object
donations_sum       float64
donations_median    float64
donations_count       int64
dtype: object

In [114]:
zip_summary_df.to_csv('zipcode_donations.csv', index=False)

In [116]:
census_df = pd.read_csv('census_data_2016.csv',
                       usecols=['GEO.id', 'HD01_VD01', 'HD01_VD02',
                                'HC01_VC06', 'HC01_VC07','HC01_VC85', 'HC01_VC131',
                                'HC01_EST_VC08', 'HC01_EST_VC11', 'HC01_EST_VC13',
                                'HC01_EST_VC14', 'HC01_EST_VC15'])
header=['zipcode', 'pop_total', 'pop_white', 
        'pop_employed','pop_unemployed', 'median_household_income', 'pop_with_healthcare',
        'edu_25+_total', '25+_HS', 'edu_25+_assoc', 
        'edu_25+_bachelor', 'edu_25+_grad']
census_df = census_df[1:]
census_df.columns = header
census_df.head()
#https://www.census.gov/glossary/#term_Employed

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,zipcode,pop_total,pop_white,pop_employed,pop_unemployed,median_household_income,pop_with_healthcare,edu_25+_total,25+_HS,edu_25+_assoc,edu_25+_bachelor,edu_25+_grad
1,8600000US00601,17800,14436,3904,2152,11507,16503,11887,2985,604,1845,376
2,8600000US00602,39716,22941,11560,3116,15511,37497,27546,6076,3689,3840,1736
3,8600000US00603,51565,35176,12722,3768,16681,47081,35589,9590,2890,5809,2239
4,8600000US00606,6320,3739,1467,205,11648,6167,4381,1553,206,349,77
5,8600000US00610,27976,16259,8327,1587,17751,27106,19237,6329,1618,2368,626


In [117]:
for column in header[1:]:
    census_df[column] = pd.to_numeric(census_df[column], errors='coerce')
census_df.dtypes

zipcode                     object
pop_total                    int64
pop_white                    int64
pop_employed                 int64
pop_unemployed               int64
median_household_income    float64
pop_with_healthcare          int64
edu_25+_total                int64
25+_HS                       int64
edu_25+_assoc                int64
edu_25+_bachelor             int64
edu_25+_grad                 int64
dtype: object

In [118]:
census_summary_df = pd.DataFrame()
census_summary_df['zipcode_5'] = census_df['zipcode'].str[9:14]
census_summary_df['pop_total'] = census_df['pop_total']
census_summary_df['unemployment_rate'] = census_df['pop_unemployed'] / (census_df['pop_unemployed'] + census_df['pop_employed'])
census_summary_df['median_household_income'] = census_df['median_household_income']
census_summary_df['healthcare_rate'] = census_df['pop_with_healthcare'] / census_df['pop_total']
census_summary_df['hs_graduation_rate'] = census_df['25+_HS'] / census_df['edu_25+_total']
census_summary_df['assoc_degree_rate'] = census_df['edu_25+_assoc'] / census_df['edu_25+_total']
census_summary_df['bachelor_degree_rate'] = census_df['edu_25+_bachelor'] / census_df['edu_25+_total']
census_summary_df['grad_degree_rate'] = census_df['edu_25+_grad'] / census_df['edu_25+_total']
census_summary_df.head()

Unnamed: 0,zipcode_5,pop_total,unemployment_rate,median_household_income,healthcare_rate,hs_graduation_rate,assoc_degree_rate,bachelor_degree_rate,grad_degree_rate
1,601,17800,0.35535,11507.0,0.927135,0.251115,0.050812,0.155212,0.031631
2,602,39716,0.212319,15511.0,0.944128,0.220576,0.133921,0.139403,0.063022
3,603,51565,0.228502,16681.0,0.913042,0.269465,0.081205,0.163225,0.062913
4,606,6320,0.122608,11648.0,0.975791,0.354485,0.047021,0.079662,0.017576
5,610,27976,0.160077,17751.0,0.968902,0.329001,0.084109,0.123096,0.032541


In [119]:
census_summary_df.dtypes

zipcode_5                   object
pop_total                    int64
unemployment_rate          float64
median_household_income    float64
healthcare_rate            float64
hs_graduation_rate         float64
assoc_degree_rate          float64
bachelor_degree_rate       float64
grad_degree_rate           float64
dtype: object

In [120]:
combined_summary_df = census_summary_df.merge(zip_summary_df, how='inner')

In [121]:
combined_summary_df['donations_sum_quartile'] = pd.qcut(combined_summary_df['donations_sum'], q=[0, .25, .5, .75, 1.], labels=[4, 3, 2, 1])
combined_summary_df['donations_median_quartile'] = pd.qcut(combined_summary_df['donations_median'], q=[0, .25, .5, .75, 1.], labels=[4, 3, 2, 1])

In [122]:
combined_summary_df.sort_values(by=['donations_sum'])

Unnamed: 0,zipcode_5,pop_total,unemployment_rate,median_household_income,healthcare_rate,hs_graduation_rate,assoc_degree_rate,bachelor_degree_rate,grad_degree_rate,donations_sum,donations_median,donations_count,donations_sum_quartile,donations_median_quartile
941,62466,6346,0.048335,45956.0,0.276552,0.304444,0.079192,0.038990,0.010707,15.0,15.0,1,4,4
1059,62814,2036,0.120000,54145.0,0.892927,0.360951,0.156340,0.077810,0.020173,15.0,5.0,3,4,4
568,61468,56,0.000000,,1.000000,0.805556,0.000000,0.000000,0.111111,18.0,9.0,2,4,4
1089,62860,616,0.051020,56750.0,0.881494,0.533333,0.098925,0.075269,0.019355,25.0,25.0,1,4,4
1030,62672,246,0.116667,44107.0,0.849593,0.482234,0.081218,0.111675,0.025381,32.0,16.0,2,4,4
1002,62617,708,0.121711,50039.0,0.916667,0.523715,0.075099,0.100791,0.019763,34.0,5.0,6,4,4
574,61474,209,0.113821,45833.0,0.933014,0.503268,0.065359,0.071895,0.019608,35.0,35.0,1,4,3
406,61043,100,0.060000,44821.0,0.910000,0.378378,0.081081,0.162162,0.040541,36.0,18.0,2,4,4
1077,62838,1809,0.114641,39426.0,0.863460,0.462199,0.074742,0.030928,0.018900,40.0,20.0,2,4,4
204,60474,903,0.103139,63077.0,0.936877,0.440129,0.067961,0.087379,0.032362,40.0,40.0,1,4,3


In [123]:
combined_summary_df.to_csv('cleaned_combined_zipcodes.csv', index=False)

In [124]:
combined_summary_df = pd.read_csv('cleaned_combined_zipcodes.csv')

In [None]:
from numpy.random import seed
seed(1)
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split

In [86]:
X = combined_summary_df[['pop_total', 'unemployment_rate', 'median_household_income',
                         'healthcare_rate', 'hs_graduation_rate', 'assoc_degree_rate', 
                         'bachelor_degree_rate', 'grad_degree_rate']]
y = combined_summary_df['donation_sum_quartile']
print(X.shape, y.shape)

(29721, 8) (29721,)


In [87]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, stratify=y)

In [88]:
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [89]:
# Create an empty sequential model
model = Sequential()

In [90]:
# Add the first layer where the input dimensions are the 561 columns of the training data
model.add(Dense(10, activation='relu', input_dim=X_train.shape[1]))

In [91]:
# Add a second hidden layer
model.add(Dense(10, activation='relu'))

In [92]:
# The output layer has 5 columns that are one-hot encoded
y_train.shape

(22290, 5)

In [93]:
# Add output layer
model.add(Dense(y_train.shape[1], activation="softmax"))

In [94]:
# Compile the model using categorical_crossentropy for the loss function, the adam optimizer,
# and add accuracy to the training metrics
model.compile(loss="categorical_crossentropy",
              optimizer="adam", metrics=['accuracy'])

In [95]:
from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [97]:
# Use the training data to fit (train) the model
model.fit(
    X_train_scaled,
    y_train,
    epochs=10,
    shuffle=True,
    verbose=2
)

Epoch 1/10
 - 1s - loss: nan - acc: 0.0000e+00
Epoch 2/10
 - 1s - loss: nan - acc: 0.0000e+00
Epoch 3/10
 - 1s - loss: nan - acc: 0.0000e+00
Epoch 4/10
 - 1s - loss: nan - acc: 0.0000e+00
Epoch 5/10
 - 1s - loss: nan - acc: 0.0000e+00
Epoch 6/10
 - 1s - loss: nan - acc: 0.0000e+00
Epoch 7/10
 - 1s - loss: nan - acc: 0.0000e+00
Epoch 8/10
 - 1s - loss: nan - acc: 0.0000e+00
Epoch 9/10
 - 1s - loss: nan - acc: 0.0000e+00
Epoch 10/10
 - 1s - loss: nan - acc: 0.0000e+00


<tensorflow.python.keras.callbacks.History at 0x1d78a1eba90>

In [98]:
model_loss, model_accuracy = model.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

 - 1s - loss: nan - acc: 0.0000e+00
Loss: nan, Accuracy: 0.0


In [99]:
encoded_predictions = model.predict_classes(X_test_scaled[:5])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

In [100]:
print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {list(y_test[:5])}")

Predicted classes: [1 1 1 1 1]
Actual Labels: [array([0., 0., 0., 1., 0.], dtype=float32), array([0., 0., 0., 0., 1.], dtype=float32), array([0., 0., 1., 0., 0.], dtype=float32), array([0., 0., 0., 0., 1.], dtype=float32), array([0., 0., 0., 1., 0.], dtype=float32)]
