In [2]:
# Import Dependencies
import numpy as np
import pandas as pd
import pyspark as spark
import sklearn as skl
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Create pandas dataframe from the csv file
df = pd.read_csv('Pseo_All.csv')

In [5]:
df.head()

Unnamed: 0,agg_level_pseo,inst_level,institution,degree_level,cip_level,cipcode,grad_cohort,grad_cohort_years,geo_level,geography,...,status_y1_earnings,status_y5_earnings,y1_grads_emp,y1_grads_emp_instate,y5_grads_emp,y5_grads_emp_instate,status_y1_grads_emp,status_y1_grads_emp_instate,status_y5_grads_emp,status_y5_grads_emp_instate
0,38,I,105100,5,A,0,0,3,N,0,...,1,1,49897.0,27741.0,32189.0,18740.0,1.0,1.0,1.0,1.0
1,38,I,105100,5,A,0,0,3,N,0,...,1,1,49897.0,27741.0,32189.0,18740.0,1.0,1.0,1.0,1.0
2,38,I,105100,5,A,0,0,3,N,0,...,1,1,49897.0,27741.0,32189.0,18740.0,1.0,1.0,1.0,1.0
3,38,I,105100,5,A,0,0,3,N,0,...,1,1,49897.0,27741.0,32189.0,18740.0,1.0,1.0,1.0,1.0
4,38,I,105100,5,A,0,0,3,N,0,...,1,1,49897.0,27741.0,32189.0,18740.0,1.0,1.0,1.0,1.0


In [6]:
df.dtypes

agg_level_pseo                   int64
inst_level                      object
institution                      int64
degree_level                     int64
cip_level                       object
cipcode                          int64
grad_cohort                      int64
grad_cohort_years                int64
geo_level                       object
geography                        int64
ind_level                       object
industry                         int64
y1_p50_earnings                  int64
y5_p50_earnings                  int64
y1_ipeds_count                   int64
y5_ipeds_count                   int64
status_y1_earnings               int64
status_y5_earnings               int64
y1_grads_emp                   float64
y1_grads_emp_instate           float64
y5_grads_emp                   float64
y5_grads_emp_instate           float64
status_y1_grads_emp            float64
status_y1_grads_emp_instate    float64
status_y5_grads_emp            float64
status_y5_grads_emp_insta

In [7]:
df.groupby('inst_level').inst_level.count()

inst_level
I    223160
Name: inst_level, dtype: int64

In [8]:
df.groupby('geo_level').geo_level.count()

geo_level
N    223160
Name: geo_level, dtype: int64

In [9]:
df.groupby('cip_level').cip_level.count()

cip_level
2    97984
4    26872
2    58122
4    17994
A    22188
Name: cip_level, dtype: int64

In [10]:
df.groupby('ind_level').ind_level.count()

ind_level
A    223160
Name: ind_level, dtype: int64

In [11]:
# Since inst_level, geo_level, and ind level are all the same value they are dropped from the dataframe
dropped_df= df.drop(['inst_level', 'geo_level', 'ind_level'], axis = 1)
dropped_df.head()

Unnamed: 0,agg_level_pseo,institution,degree_level,cip_level,cipcode,grad_cohort,grad_cohort_years,geography,industry,y1_p50_earnings,...,status_y1_earnings,status_y5_earnings,y1_grads_emp,y1_grads_emp_instate,y5_grads_emp,y5_grads_emp_instate,status_y1_grads_emp,status_y1_grads_emp_instate,status_y5_grads_emp,status_y5_grads_emp_instate
0,38,105100,5,A,0,0,3,0,0,37267,...,1,1,49897.0,27741.0,32189.0,18740.0,1.0,1.0,1.0,1.0
1,38,105100,5,A,0,0,3,0,0,37267,...,1,1,49897.0,27741.0,32189.0,18740.0,1.0,1.0,1.0,1.0
2,38,105100,5,A,0,0,3,0,0,37267,...,1,1,49897.0,27741.0,32189.0,18740.0,1.0,1.0,1.0,1.0
3,38,105100,5,A,0,0,3,0,0,37267,...,1,1,49897.0,27741.0,32189.0,18740.0,1.0,1.0,1.0,1.0
4,38,105100,5,A,0,0,3,0,0,37267,...,1,1,49897.0,27741.0,32189.0,18740.0,1.0,1.0,1.0,1.0


In [24]:
#Replacing all "A" values with the number 1 so that they are all the same data type
dropped_df.loc[dropped_df["cip_level"] == "A", "cip_level"] = 1

# Changing data type
dropped_df['cip_level']= dropped_df['cip_level'].astype(int)
dropped_df.dtypes

agg_level_pseo                   int64
institution                      int64
degree_level                     int64
cip_level                        int64
cipcode                          int64
grad_cohort                      int64
grad_cohort_years                int64
geography                        int64
industry                         int64
y1_p50_earnings                  int64
y5_p50_earnings                  int64
y1_ipeds_count                   int64
y5_ipeds_count                   int64
status_y1_earnings               int64
status_y5_earnings               int64
y1_grads_emp                   float64
y1_grads_emp_instate           float64
y5_grads_emp                   float64
y5_grads_emp_instate           float64
status_y1_grads_emp            float64
status_y1_grads_emp_instate    float64
status_y5_grads_emp            float64
status_y5_grads_emp_instate    float64
dtype: object

In [25]:
# One hot encoding is used on cip_level column

# Create the OneHotEncoder instance
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(sparse=False)

# Fit the encoder and produce encoded DataFrame
encode_df = pd.DataFrame(enc.fit_transform(dropped_df.cip_level.values.reshape(-1,1)))

# Rename encoded columns
encode_df.columns = enc.get_feature_names(['cip_level'])
encode_df.head()

Unnamed: 0,cip_level_1,cip_level_2,cip_level_4
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0


In [26]:
# Merge the two DataFrames together and drop the cip_level column
dropped_df.merge(encode_df,left_index=True,right_index=True).drop("cip_level",1)

Unnamed: 0,agg_level_pseo,institution,degree_level,cipcode,grad_cohort,grad_cohort_years,geography,industry,y1_p50_earnings,y5_p50_earnings,...,y1_grads_emp_instate,y5_grads_emp,y5_grads_emp_instate,status_y1_grads_emp,status_y1_grads_emp_instate,status_y5_grads_emp,status_y5_grads_emp_instate,cip_level_1,cip_level_2,cip_level_4
0,38,105100,5,0,0,3,0,0,37267,51849,...,27741.0,32189.0,18740.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
1,38,105100,5,0,0,3,0,0,37267,51849,...,27741.0,32189.0,18740.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
2,38,105100,5,0,0,3,0,0,37267,51849,...,27741.0,32189.0,18740.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
3,38,105100,5,0,0,3,0,0,37267,51849,...,27741.0,32189.0,18740.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
4,38,105100,5,0,0,3,0,0,37267,51849,...,27741.0,32189.0,18740.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223155,48,3730300,3,52,2001,5,0,0,26840,35645,...,,,,,,,,0.0,0.0,1.0
223156,48,3789400,3,24,2001,5,0,0,26167,39475,...,,,,,,,,0.0,0.0,1.0
223157,48,3789400,3,24,2001,5,0,0,26167,39475,...,,,,,,,,0.0,0.0,1.0
223158,48,3956300,3,13,2001,5,0,0,29553,31238,...,,,,,,,,0.0,0.0,1.0


In [61]:
dropped_df.columns

Index(['agg_level_pseo', 'institution', 'degree_level', 'cip_level', 'cipcode',
       'grad_cohort', 'grad_cohort_years', 'geography', 'industry',
       'y1_p50_earnings', 'y5_p50_earnings', 'y1_ipeds_count',
       'y5_ipeds_count', 'status_y1_earnings', 'status_y5_earnings',
       'y1_grads_emp', 'y1_grads_emp_instate', 'y5_grads_emp',
       'y5_grads_emp_instate', 'status_y1_grads_emp',
       'status_y1_grads_emp_instate', 'status_y5_grads_emp',
       'status_y5_grads_emp_instate'],
      dtype='object')

In [110]:
# Applying the condition
dropped_df["y1_p50_earnings"] = np.where(df["y1_p50_earnings"] < 35000, 0, 1)
dropped_df.groupby('y1_p50_earnings').y1_p50_earnings.count()

y1_p50_earnings
0    114374
1    108786
Name: y1_p50_earnings, dtype: int64

In [111]:
# Define features dataset
X= dropped_df.drop(['y1_p50_earnings','y5_p50_earnings', 'y1_ipeds_count', 'status_y1_earnings','status_y5_earnings','y1_grads_emp','y1_grads_emp_instate', 'y5_grads_emp',
       'y5_grads_emp_instate', 'status_y1_grads_emp', 'status_y1_grads_emp_instate', 'status_y5_grads_emp','y5_ipeds_count','status_y5_grads_emp_instate'],axis= 1)
X.head()


Unnamed: 0,agg_level_pseo,institution,degree_level,cip_level,cipcode,grad_cohort,grad_cohort_years,geography,industry
0,38,105100,5,1,0,0,3,0,0
1,38,105100,5,1,0,0,3,0,0
2,38,105100,5,1,0,0,3,0,0
3,38,105100,5,1,0,0,3,0,0
4,38,105100,5,1,0,0,3,0,0


In [112]:
# Define target dataset
y = dropped_df["y1_p50_earnings"].ravel()
y[:5]

array([1, 1, 1, 1, 1])

In [113]:
# Splitting into Train and Test sets.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [114]:
# Create scaler instance
X_scaler = skl.preprocessing.StandardScaler()

# Fit the scaler
X_scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [116]:
# Define the model - deep neural net
import tensorflow as tf

# Create the Keras Sequential model
nn_model = tf.keras.models.Sequential()

# Add our first Dense layer, including the input layer
nn_model.add(tf.keras.layers.Dense(units=1, activation="relu", input_dim=9))

# Add the output layer that uses a probability activation function
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the Sequential model
nn_model.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_24 (Dense)            (None, 1)                 10        
                                                                 
 dense_25 (Dense)            (None, 1)                 2         
                                                                 
Total params: 12
Trainable params: 12
Non-trainable params: 0
_________________________________________________________________


In [117]:
# Compile the model
nn_model.compile(optimizer="adam", loss="mean_squared_error", metrics=["accuracy"])

In [118]:
# Fit the model to the training data
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=40)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [119]:
# Prepare for test on year 5 earnings
dropped_df["y5_p50_earnings"] = np.where(df["y5_p50_earnings"] < 50000, 0, 1)
dropped_df.groupby('y5_p50_earnings').y1_p50_earnings.count()

y5_p50_earnings
0    116852
1    106308
Name: y1_p50_earnings, dtype: int64

In [120]:
# Define features dataset
X= dropped_df.drop(['y1_p50_earnings','y5_p50_earnings', 'y1_ipeds_count', 'status_y1_earnings','status_y5_earnings','y1_grads_emp','y1_grads_emp_instate', 'y5_grads_emp',
       'y5_grads_emp_instate', 'status_y1_grads_emp', 'status_y1_grads_emp_instate', 'status_y5_grads_emp','y5_ipeds_count','status_y5_grads_emp_instate'],axis= 1)
X.head()


Unnamed: 0,agg_level_pseo,institution,degree_level,cip_level,cipcode,grad_cohort,grad_cohort_years,geography,industry
0,38,105100,5,1,0,0,3,0,0
1,38,105100,5,1,0,0,3,0,0
2,38,105100,5,1,0,0,3,0,0
3,38,105100,5,1,0,0,3,0,0
4,38,105100,5,1,0,0,3,0,0


In [121]:
# Define target dataset
y = dropped_df["y5_p50_earnings"].ravel()
y[:5]

array([1, 1, 1, 1, 1])

In [122]:
# Splitting into Train and Test sets.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [123]:
# Create scaler instance
X_scaler = skl.preprocessing.StandardScaler()

# Fit the scaler
X_scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [125]:
# Define the model - deep neural net
import tensorflow as tf

# Create the Keras Sequential model
nn_model = tf.keras.models.Sequential()

# Add our first Dense layer, including the input layer
nn_model.add(tf.keras.layers.Dense(units=1, activation="relu", input_dim=9))

# Add the output layer that uses a probability activation function
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the Sequential model
nn_model.summary()

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_28 (Dense)            (None, 1)                 10        
                                                                 
 dense_29 (Dense)            (None, 1)                 2         
                                                                 
Total params: 12
Trainable params: 12
Non-trainable params: 0
_________________________________________________________________


In [126]:
# Compile the model
nn_model.compile(optimizer="adam", loss="mean_squared_error", metrics=["accuracy"])

In [127]:
# Fit the model to the training data
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=40)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
