In [20]:
import pandas as pd

import tensorflow as tf
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

from config.config import db_password
from sqlalchemy import create_engine, inspect
from sqlalchemy.orm import Session
import psycopg2


In [12]:
# Build the PostgreSQL connection URL (psycopg2 driver). The password is taken from
# the local config module; the host, port and database name are fixed here.
db_url = (
    f"postgresql+psycopg2://postgres:{db_password}"
    "@database.cfqtepfdzy8v.us-east-2.rds.amazonaws.com:5432/postgres"
)
# Engine object that the later cells use for inspection and for reading the table.
pool = create_engine(db_url)

In [14]:
# Create an inspector bound to the engine and list the table names it can see.
# The list is the cell's displayed output; one of these tables is loaded next.
inspector = inspect(pool)
inspector.get_table_names()

['pseof', 'pseoe', 'pseo_all', 't_e']

In [15]:
# Load the `pseo_all` table into a DataFrame through the engine created above.
pseo_df = pd.read_sql(sql="pseo_all", con=pool)

In [17]:
# Show the dtype of every column to spot the non-numeric (object) ones.
pseo_df.dtypes

agg_level_pseo                   int64
inst_level                      object
institution                      int64
degree_level                     int64
cip_level                       object
cipcode                          int64
grad_cohort                      int64
grad_cohort_years                int64
geo_level                       object
geography                        int64
ind_level                       object
industry                         int64
y1_p50_earnings                  int64
y5_p50_earnings                  int64
y1_ipeds_count                   int64
y5_ipeds_count                   int64
status_y1_earnings               int64
status_y5_earnings               int64
y1_grads_emp                   float64
y1_grads_emp_instate           float64
y5_grads_emp                   float64
y5_grads_emp_instate           float64
status_y1_grads_emp            float64
status_y1_grads_emp_instate    float64
status_y5_grads_emp            float64
status_y5_grads_emp_insta

In [25]:
# Remove the four object-typed *_level columns before modelling.
# Per the original note, inst_level, geo_level and ind_level each hold a single
# repeated value; cip_level is dropped as well, presumably for the same reason
# (TODO confirm).
level_columns = ['inst_level', 'geo_level', 'ind_level', 'cip_level']
dropped_df = pseo_df.drop(columns=level_columns)
dropped_df.head()

Unnamed: 0,agg_level_pseo,institution,degree_level,cipcode,grad_cohort,grad_cohort_years,geography,industry,y1_p50_earnings,y5_p50_earnings,...,status_y1_earnings,status_y5_earnings,y1_grads_emp,y1_grads_emp_instate,y5_grads_emp,y5_grads_emp_instate,status_y1_grads_emp,status_y1_grads_emp_instate,status_y5_grads_emp,status_y5_grads_emp_instate
0,38,105100,5,0,0,3,0,0,37267,51849,...,1,1,49897.0,27741.0,32189.0,18740.0,1.0,1.0,1.0,1.0
1,38,105100,7,0,0,5,0,0,50236,60695,...,1,1,14182.0,8311.0,8975.0,5023.0,1.0,1.0,1.0,1.0
2,38,105100,17,0,0,5,0,0,68751,82978,...,1,1,2238.0,992.0,1401.0,547.0,1.0,1.0,1.0,1.0
3,38,105100,18,0,0,5,0,0,71317,94382,...,1,1,1987.0,1208.0,1208.0,795.0,1.0,1.0,1.0,1.0
4,38,105500,5,0,0,3,0,0,43688,58537,...,1,1,8795.0,7012.0,4493.0,3171.0,1.0,1.0,1.0,1.0


In [27]:
# One-hot encode the cipcode column so that each distinct code becomes its own
# 0/1 indicator column.

# Create the OneHotEncoder instance.
# The `sparse=False` keyword was deprecated in scikit-learn 1.2 and removed in 1.4
# (renamed `sparse_output`). Keeping the default sparse result and densifying it
# with .toarray() behaves the same on old and new releases.
enc = OneHotEncoder()

# Fit the encoder on cipcode as a single-column 2-D array and densify the result.
encoded_cipcode = enc.fit_transform(dropped_df[["cipcode"]].to_numpy()).toarray()

# Build the encoded DataFrame with columns named cipcode_<value>. Reusing
# dropped_df's index keeps each encoded row aligned with its source row for the
# index-based merge that follows.
encode_df = pd.DataFrame(
    encoded_cipcode,
    index=dropped_df.index,
    columns=enc.get_feature_names_out(["cipcode"]),
)
encode_df.head()

Unnamed: 0,cipcode_0,cipcode_1,cipcode_3,cipcode_4,cipcode_5,cipcode_9,cipcode_10,cipcode_11,cipcode_12,cipcode_13,...,cipcode_44,cipcode_45,cipcode_46,cipcode_47,cipcode_48,cipcode_49,cipcode_50,cipcode_51,cipcode_52,cipcode_54
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# Attach the one-hot columns to the main frame by matching row index (inner join),
# then drop the original cipcode column, which the encoded columns replace.
with_encoded_df = dropped_df.merge(encode_df, left_index=True, right_index=True)
merged_df = with_encoded_df.drop(columns=["cipcode"])
merged_df

Unnamed: 0,agg_level_pseo,institution,degree_level,grad_cohort,grad_cohort_years,geography,industry,y1_p50_earnings,y5_p50_earnings,y1_ipeds_count,...,cipcode_44,cipcode_45,cipcode_46,cipcode_47,cipcode_48,cipcode_49,cipcode_50,cipcode_51,cipcode_52,cipcode_54
0,38,105100,5,0,3,0,0,37267,51849,81456,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,38,105100,7,0,5,0,0,50236,60695,19824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,38,105100,17,0,5,0,0,68751,82978,2786,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,38,105100,18,0,5,0,0,71317,94382,2956,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,38,105500,5,0,3,0,0,43688,58537,15223,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53315,48,3730300,3,2001,5,0,0,23828,39852,363,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53316,48,3730300,3,2001,5,0,0,31624,41119,63,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
53317,48,3730300,3,2001,5,0,0,26840,35645,79,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
53318,48,3789400,3,2001,5,0,0,26167,39475,95,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
# Display the merged column labels to verify that the cipcode_* indicator columns
# were appended and the original cipcode column is gone.
merged_df.columns

Index(['agg_level_pseo', 'institution', 'degree_level', 'grad_cohort',
       'grad_cohort_years', 'geography', 'industry', 'y1_p50_earnings',
       'y5_p50_earnings', 'y1_ipeds_count', 'y5_ipeds_count',
       'status_y1_earnings', 'status_y5_earnings', 'y1_grads_emp',
       'y1_grads_emp_instate', 'y5_grads_emp', 'y5_grads_emp_instate',
       'status_y1_grads_emp', 'status_y1_grads_emp_instate',
       'status_y5_grads_emp', 'status_y5_grads_emp_instate', 'cipcode_0',
       'cipcode_1', 'cipcode_3', 'cipcode_4', 'cipcode_5', 'cipcode_9',
       'cipcode_10', 'cipcode_11', 'cipcode_12', 'cipcode_13', 'cipcode_14',
       'cipcode_15', 'cipcode_16', 'cipcode_19', 'cipcode_22', 'cipcode_23',
       'cipcode_24', 'cipcode_25', 'cipcode_26', 'cipcode_27', 'cipcode_30',
       'cipcode_31', 'cipcode_38', 'cipcode_39', 'cipcode_40', 'cipcode_41',
       'cipcode_42', 'cipcode_43', 'cipcode_44', 'cipcode_45', 'cipcode_46',
       'cipcode_47', 'cipcode_48', 'cipcode_49', 'cipcode_50', 'c

In [45]:
# Make every column label a plain str; per the original note this avoids
# FutureWarnings in later steps.
merged_df.columns = [str(label) for label in merged_df.columns]
merged_df.columns

Index(['agg_level_pseo', 'institution', 'degree_level', 'grad_cohort',
       'grad_cohort_years', 'geography', 'industry', 'y1_p50_earnings',
       'y5_p50_earnings', 'y1_ipeds_count', 'y5_ipeds_count',
       'status_y1_earnings', 'status_y5_earnings', 'y1_grads_emp',
       'y1_grads_emp_instate', 'y5_grads_emp', 'y5_grads_emp_instate',
       'status_y1_grads_emp', 'status_y1_grads_emp_instate',
       'status_y5_grads_emp', 'status_y5_grads_emp_instate', 'cipcode_0',
       'cipcode_1', 'cipcode_3', 'cipcode_4', 'cipcode_5', 'cipcode_9',
       'cipcode_10', 'cipcode_11', 'cipcode_12', 'cipcode_13', 'cipcode_14',
       'cipcode_15', 'cipcode_16', 'cipcode_19', 'cipcode_22', 'cipcode_23',
       'cipcode_24', 'cipcode_25', 'cipcode_26', 'cipcode_27', 'cipcode_30',
       'cipcode_31', 'cipcode_38', 'cipcode_39', 'cipcode_40', 'cipcode_41',
       'cipcode_42', 'cipcode_43', 'cipcode_44', 'cipcode_45', 'cipcode_46',
       'cipcode_47', 'cipcode_48', 'cipcode_49', 'cipcode_50', 'c

In [46]:
# Define features dataset: keep everything except the earnings, graduate-count and
# employment columns together with their status flags.
non_feature_columns = [
    'y1_p50_earnings',
    'y5_p50_earnings',
    'y1_ipeds_count',
    'status_y1_earnings',
    'status_y5_earnings',
    'y1_grads_emp',
    'y1_grads_emp_instate',
    'y5_grads_emp',
    'y5_grads_emp_instate',
    'status_y1_grads_emp',
    'status_y1_grads_emp_instate',
    'status_y5_grads_emp',
    'y5_ipeds_count',
    'status_y5_grads_emp_instate',
]
X = merged_df.drop(columns=non_feature_columns)
X

Unnamed: 0,agg_level_pseo,institution,degree_level,grad_cohort,grad_cohort_years,geography,industry,cipcode_0,cipcode_1,cipcode_3,...,cipcode_44,cipcode_45,cipcode_46,cipcode_47,cipcode_48,cipcode_49,cipcode_50,cipcode_51,cipcode_52,cipcode_54
0,38,105100,5,0,3,0,0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,38,105100,7,0,5,0,0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,38,105100,17,0,5,0,0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,38,105100,18,0,5,0,0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,38,105500,5,0,3,0,0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53315,48,3730300,3,2001,5,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53316,48,3730300,3,2001,5,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
53317,48,3730300,3,2001,5,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
53318,48,3789400,3,2001,5,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
# Define target dataset: the y1_p50_earnings column as a 1-D NumPy array.
# Series.ravel() is deprecated since pandas 2.2 (a Series is already 1-D);
# to_numpy() is the supported conversion and returns the same values.
# The target is read from merged_df, the frame X is derived from, so the rows of
# X and y are aligned by construction.
y = merged_df["y1_p50_earnings"].to_numpy()
y[:5]

array([37267, 50236, 68751, 71317, 43688], dtype=int64)

In [48]:
# Split into train and test sets for the regression model; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Create the scaler and fit it on the training split only, so the scaling
# statistics are not influenced by the test rows.
X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit_transform(X_train)

# Apply the training-set scaling to the test split.
X_test_scaled = X_scaler.transform(X_test)

# Report the shapes of the arrays that are actually fed to the model.
print(f'Training set size: {X_train_scaled.shape} ---- Testing set size: {X_test_scaled.shape}')

Training set size: (39990, 45) ---- Testing set size: (13330, 45)


In [50]:
# Create the Keras Sequential model
nn_model = tf.keras.models.Sequential()

# Declare the input explicitly: one value per scaled feature column.
# An Input object is used instead of the legacy `input_dim=` argument on the
# first Dense layer, which newer Keras releases discourage.
n_features = X_train_scaled.shape[1]
nn_model.add(tf.keras.Input(shape=(n_features,)))

# First hidden layer
nn_model.add(tf.keras.layers.Dense(units=80, activation="relu"))

# Second hidden layer
nn_model.add(tf.keras.layers.Dense(units=40, activation="relu"))

# Output layer: a single unit with linear activation, i.e. an unbounded
# real-valued prediction (regression), not a probability.
nn_model.add(tf.keras.layers.Dense(units=1, activation="linear"))

# Check the structure of the Sequential model
nn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 80)                3680      
                                                                 
 dense_1 (Dense)             (None, 40)                3240      
                                                                 
 dense_2 (Dense)             (None, 1)                 41        
                                                                 
Total params: 6,961
Trainable params: 6,961
Non-trainable params: 0
_________________________________________________________________


In [51]:
# Compile the model with the Adam optimizer. Mean absolute error is both the
# training loss and the reported metric.
nn_model.compile(
    optimizer="adam",
    loss="mean_absolute_error",
    metrics=["mean_absolute_error"],
)

In [52]:
# Train on the scaled training split for 15 epochs and keep the value returned by
# fit() for later inspection.
fit_model = nn_model.fit(x=X_train_scaled, y=y_train, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
