In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [19]:
def show_corr(df):
    """Plot an annotated correlation heatmap for the numeric columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose pairwise column correlations are drawn. Non-numeric
        columns are excluded from the correlation matrix.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Large square canvas so the two-decimal annotations fit inside each cell.
    plt.figure(figsize=(16, 16))
    # numeric_only=True: since pandas 2.0, corr() raises on string columns
    # instead of silently skipping them.
    corr_matrix = df.corr(numeric_only=True)
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
    plt.show()

### Load and clean data

In [20]:
# Read the raw CSV into a DataFrame
datapath = 'Data/national_csv/national_tes.csv'
df = pd.read_csv(datapath)

# Columns removed before modelling, grouped by the reason they are dropped
drop_columns = [
    # most values are -1
    'tc_gap', 'treecanopy',
    # NULL values
    'holc_grade', 'tesctyscor',
    # unnecessary
    'GEOID', 'place', 'state', 'state_abbr', 'county', 'ua_name',
    'congressio', 'cnpysource',
    # high corr
    'cbg_pop', 'priority_i', 'pctpoc', 'pctpov', 'unemplrate',
    'dep_ratio', 'dep_perc', 'linguistic', 'temp_diff',
]
df = df.drop(columns=drop_columns)

### Split data

In [21]:
# 'unemplnorm' is the regression target; every other column is a feature.
y = df['unemplnorm']
X = df.drop(columns='unemplnorm')

# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
X_dev, X_test, y_dev, y_test = train_test_split(
    X, y, test_size=.2, random_state=42
)

### Target Encoding for Categorical Features

In [22]:
from category_encoders import TargetEncoder

# Target-encode the categorical 'biome' feature.
encoder = TargetEncoder()

# Cast to str once so fit and transform see identical category labels.
biome_dev = X_dev['biome'].astype(str)
biome_test = X_test['biome'].astype(str)

# Fit on the development split only (feature + target); the test targets are
# never shown to the encoder.
encoder.fit(biome_dev, y_dev)

# Apply the fitted encoding to both splits.
transformed_biome = encoder.transform(biome_dev)
transformed_biome_test = encoder.transform(biome_test)

# Build new frames with assign()/drop() rather than writing a column into the
# frames returned by train_test_split: in-place column assignment on those can
# trigger pandas' SettingWithCopyWarning.
X_dev = X_dev.assign(biome_target=transformed_biome).drop(columns='biome')
X_test = X_test.assign(biome_target=transformed_biome_test).drop(columns='biome')

### Reset Index

In [23]:
# Give every split a fresh 0..n-1 index. reset_index keeps row order, so each
# X/y pair stays row-aligned; resetting the targets as well keeps their index
# labels consistent with the features (previously only X was reset).
# Reassigning instead of inplace=True leaves the earlier objects untouched.
X_dev = X_dev.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_dev = y_dev.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

### Save the dataframes to local disk

In [24]:
import os

# Create the output folder if it is missing. exist_ok=True makes this a single
# call with no separate existence check, so it cannot fail when the folder
# already exists.
os.makedirs('finalized_data', exist_ok=True)

# Write each split to its own CSV; index=False keeps the row index out of the
# files.
X_dev.to_csv('finalized_data/X_dev.csv', index=False)
X_test.to_csv('finalized_data/X_test.csv', index=False)
y_dev.to_csv('finalized_data/y_dev.csv', index=False)
y_test.to_csv('finalized_data/y_test.csv', index=False)


### Helper function for reading data

In [25]:
def read_data(data_dir='finalized_data'):
    """Load the four saved split CSVs from ``data_dir``.

    Parameters
    ----------
    data_dir : str or path-like, default 'finalized_data'
        Folder containing ``X_dev.csv``, ``X_test.csv``, ``y_dev.csv`` and
        ``y_test.csv``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_dev, X_test, y_dev, y_test)``, each exactly as returned by
        ``pd.read_csv`` (the targets therefore come back as DataFrames).
    """
    split_names = ('X_dev', 'X_test', 'y_dev', 'y_test')
    return tuple(pd.read_csv(f'{data_dir}/{name}.csv') for name in split_names)

### Model

In [26]:
# Reload the saved splits from disk. read_data() returns
# (X_dev, X_test, y_dev, y_test), so X_val / y_val hold the held-out test
# split, which is used as validation data during training.
X_dev, X_val, y_dev, y_val = read_data()

In [28]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, BatchNormalization, Dense, Dropout, Input

# Seed Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling repeat from run to run.
tf.keras.utils.set_random_seed(42)

# Number of input features (columns of the development feature matrix).
input_shape = X_dev.shape[1]

# Fully connected regression network. An explicit Input layer declares the
# feature width instead of the deprecated input_shape= argument on Dense.
model = Sequential([
    Input(shape=(input_shape,)),

    # First hidden layer
    Dense(128),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.3),

    # Second hidden layer
    Dense(128),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.3),

    # Third hidden layer
    Dense(64),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.2),

    # Fourth hidden layer (no batch norm / dropout)
    Dense(32),
    Activation('relu'),

    # Output layer: a single linear unit for the regression target
    Dense(1),
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model; keep the History object so the per-epoch losses can be
# inspected or plotted afterwards.
history = model.fit(
    X_dev, y_dev,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f77e8f83f70>