In [3]:
import pandas as pd

In [None]:
contributions = pd.read_csv('individual_contributions.csv', sep='|', header=None,
                            usecols=[0, 3, 10, 13, 14, 20],
                            names=['CMTE_ID',
                                  'TRANSACTION_PGI',
                                  'ZIP_CODE',
                                  'TRANSACTION_DT',
                                  'TRANSACTION_AMT',
                                  'SUB_ID'],
                            dtype={"CMTE_ID": str, 
                                   "TRANSACTION_PGI": str,
                                   "ZIP_CODE": str,
                                  'TRANSACTION_DT': str,
                                   "TRANSACTION_AMT": float, 
                                   "SUB_ID": str
                                  }
                           )

In [None]:
contributions['zipcode_5'] = contributions['ZIP_CODE'].str[0:5]
contributions['ZIP_CODE'] = pd.to_numeric(contributions['ZIP_CODE'], errors='coerce')

In [None]:
contributions.dropna(subset=['CMTE_ID', 'ZIP_CODE','TRANSACTION_AMT','SUB_ID','zipcode_5'],inplace=True)
contributions = contributions.loc[contributions['TRANSACTION_AMT'] > 0]

In [None]:
# Illinois Exclusive

contributions = contributions.loc[(contributions['ZIP_CODE'] > 600000000) & (contributions['ZIP_CODE'] < 630000000)]

In [None]:
contributions.head()

In [None]:
contributions.to_csv('contributions.csv', index=False)

In [None]:
grouped = contributions.groupby(['zipcode_5'])

In [None]:
zip_summary_df = pd.DataFrame()
zip_summary_df['donations_sum'] = grouped['TRANSACTION_AMT'].sum()
zip_summary_df['donations_median'] = grouped['TRANSACTION_AMT'].median()
zip_summary_df['donations_count'] = grouped['SUB_ID'].count()
zip_summary_df.reset_index(inplace=True)

In [None]:
zip_summary_df.head()

In [None]:
zip_summary_df.dtypes

In [None]:
zip_summary_df.to_csv('zipcode_donations.csv', index=False)

In [None]:
census_df = pd.read_csv('census_data_2016.csv',
                       usecols=['GEO.id', 'HD01_VD01', 'HD01_VD02',
                                'HC01_VC06', 'HC01_VC07','HC01_VC85', 'HC01_VC131',
                                'HC01_EST_VC08', 'HC01_EST_VC11', 'HC01_EST_VC13',
                                'HC01_EST_VC14', 'HC01_EST_VC15'])
header=['zipcode', 'pop_total', 'pop_white', 
        'pop_employed','pop_unemployed', 'median_household_income', 'pop_with_healthcare',
        'edu_25+_total', '25+_HS', 'edu_25+_assoc', 
        'edu_25+_bachelor', 'edu_25+_grad']
census_df = census_df[1:]
census_df.columns = header
census_df.head()
#https://www.census.gov/glossary/#term_Employed

In [None]:
for column in header[1:]:
    census_df[column] = pd.to_numeric(census_df[column], errors='coerce')
census_df.dtypes

In [None]:
census_summary_df = pd.DataFrame()
census_summary_df['zipcode_5'] = census_df['zipcode'].str[9:14]
census_summary_df['pop_total'] = census_df['pop_total']
census_summary_df['unemployment_rate'] = census_df['pop_unemployed'] / (census_df['pop_unemployed'] + census_df['pop_employed'])
census_summary_df['median_household_income'] = census_df['median_household_income']
census_summary_df['healthcare_rate'] = census_df['pop_with_healthcare'] / census_df['pop_total']
census_summary_df['hs_graduation_rate'] = census_df['25+_HS'] / census_df['edu_25+_total']
census_summary_df['assoc_degree_rate'] = census_df['edu_25+_assoc'] / census_df['edu_25+_total']
census_summary_df['bachelor_degree_rate'] = census_df['edu_25+_bachelor'] / census_df['edu_25+_total']
census_summary_df['grad_degree_rate'] = census_df['edu_25+_grad'] / census_df['edu_25+_total']
census_summary_df.head()

In [None]:
census_summary_df.dtypes

In [None]:
combined_summary_df = census_summary_df.merge(zip_summary_df, how='inner')

In [None]:
combined_summary_df['donations_sum_quartile'] = pd.qcut(combined_summary_df['donations_sum'], q=[0, .25, .5, .75, 1.], labels=[4, 3, 2, 1])
combined_summary_df['donations_median_quartile'] = pd.qcut(combined_summary_df['donations_median'], q=[0, .25, .5, .75, 1.], labels=[4, 3, 2, 1])

In [None]:
combined_summary_df.sort_values(by=['donations_sum'])

In [None]:
combined_summary_df.to_csv('cleaned_combined_zipcodes.csv', index=False)

In [59]:
combined_summary_df = pd.read_csv('cleaned_combined_zipcodes.csv')

In [60]:
from numpy.random import seed
seed(1)
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split

In [75]:
combined_summary_df.dropna(inplace=True)
X = combined_summary_df[['pop_total', 'unemployment_rate', 'median_household_income',
                         'healthcare_rate', 'hs_graduation_rate', 'assoc_degree_rate', 
                         'bachelor_degree_rate', 'grad_degree_rate']]
y = combined_summary_df['donations_median_quartile']
print(X.shape, y.shape)

(1160, 8) (1160,)


In [76]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, stratify=y)

In [77]:
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [78]:
y_test

array([[0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       ...,
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.]], dtype=float32)

In [79]:
# Create an empty sequential model
model = Sequential()

In [80]:
# Add the first layer where the input dimensions are the 561 columns of the training data
model.add(Dense(50, activation='relu', input_dim=X_train.shape[1]))

In [81]:
# Add a second hidden layer
model.add(Dense(50, activation='relu'))

In [82]:
# The output layer has 5 columns that are one-hot encoded
y_train.shape

(870, 5)

In [83]:
# Add output layer
model.add(Dense(y_train.shape[1], activation="softmax"))

In [84]:
# Compile the model using categorical_crossentropy for the loss function, the adam optimizer,
# and add accuracy to the training metrics
model.compile(loss="categorical_crossentropy",
              optimizer="adam", metrics=['accuracy'])

In [85]:
from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [86]:
y_train

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]], dtype=float32)

In [87]:
# Use the training data to fit (train) the model
model.fit(
    X_train_scaled,
    y_train,
    epochs=100,
    shuffle=True,
    verbose=2
)

Epoch 1/100
 - 1s - loss: 1.5474 - acc: 0.2770
Epoch 2/100
 - 0s - loss: 1.4068 - acc: 0.3494
Epoch 3/100
 - 0s - loss: 1.3508 - acc: 0.3494
Epoch 4/100
 - 0s - loss: 1.3156 - acc: 0.3736
Epoch 5/100
 - 0s - loss: 1.2951 - acc: 0.3908
Epoch 6/100
 - 0s - loss: 1.2804 - acc: 0.3920
Epoch 7/100
 - 0s - loss: 1.2669 - acc: 0.4069
Epoch 8/100
 - 0s - loss: 1.2570 - acc: 0.3966
Epoch 9/100
 - 0s - loss: 1.2473 - acc: 0.4195
Epoch 10/100
 - 0s - loss: 1.2440 - acc: 0.4092
Epoch 11/100
 - 0s - loss: 1.2403 - acc: 0.3989
Epoch 12/100
 - 0s - loss: 1.2291 - acc: 0.4264
Epoch 13/100
 - 0s - loss: 1.2240 - acc: 0.4368
Epoch 14/100
 - 0s - loss: 1.2215 - acc: 0.4276
Epoch 15/100
 - 0s - loss: 1.2199 - acc: 0.4253
Epoch 16/100
 - 0s - loss: 1.2092 - acc: 0.4425
Epoch 17/100
 - 0s - loss: 1.2051 - acc: 0.4437
Epoch 18/100
 - 0s - loss: 1.2016 - acc: 0.4356
Epoch 19/100
 - 0s - loss: 1.1966 - acc: 0.4310
Epoch 20/100
 - 0s - loss: 1.1980 - acc: 0.4414
Epoch 21/100
 - 0s - loss: 1.1908 - acc: 0.4517
E

<tensorflow.python.keras.callbacks.History at 0x227d876d668>

In [91]:
# Evaluate the model using the training data
model_loss, model_accuracy = model.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

 - 0s - loss: 1.5272 - acc: 0.3034
Loss: 1.527183276209338, Accuracy: 0.30344828963279724


In [89]:
test = np.expand_dims(X_test_scaled[0], axis=0)
test.shape

(1, 8)

In [58]:
print(f"Predicted class: {model.predict_classes(test)}")

Predicted class: [1]
