In [4]:
# Import Dependencies
import numpy as np
import pandas as pd
import pyspark as spark
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [5]:
import warnings
warnings.filterwarnings('ignore')

In [8]:
# Connecting to S3 bucket
import s3fs

# Create pandas dataframe from the csv file
df = pd.read_csv('https://jamesliu-databootcamp-bucket.s3.us-east-2.amazonaws.com/pseof_all.csv')

In [9]:
df.head()

Unnamed: 0,agg_level_pseo,inst_level,institution,degree_level,cip_level,cipcode,grad_cohort,grad_cohort_years,geo_level,geography,...,y10_grads_nme,status_y1_grads_emp,status_y1_grads_emp_instate,status_y5_grads_emp,status_y5_grads_emp_instate,status_y10_grads_emp,status_y10_grads_emp_instate,status_y1_grads_nme,status_y5_grads_nme,status_y10_grads_nme
0,38,I,105100,5,A,0,0,3,N,0,...,7281.0,1,1,1,1,1,1,1,1,1
1,38,I,105100,7,A,0,0,5,N,0,...,1149.0,1,1,1,1,1,1,1,1,1
2,38,I,105100,17,A,0,0,5,N,0,...,191.0,1,1,1,1,1,1,1,1,1
3,38,I,105100,18,A,0,0,5,N,0,...,256.0,1,1,1,1,1,1,1,1,1
4,38,I,105200,2,A,0,0,5,N,0,...,,1,1,1,1,-1,-1,1,1,-1


In [10]:
earnings_df = pd.read_csv('https://jamesliu-databootcamp-bucket.s3.us-east-2.amazonaws.com/pseoe_all.csv')
earnings_df.head()

Unnamed: 0,agg_level_pseo,inst_level,institution,degree_level,cip_level,cipcode,grad_cohort,grad_cohort_years,geo_level,geography,...,y10_ipeds_count,status_y1_earnings,status_y1_grads_earn,status_y5_earnings,status_y5_grads_earn,status_y10_earnings,status_y10_grads_earn,status_y1_ipeds_count,status_y5_ipeds_count,status_y10_ipeds_count
0,38,I,105100,5,A,0.0,0,3,N,0,...,28151.0,1,1,1,1,1,1,4,4,4
1,38,I,105100,7,A,0.0,0,5,N,0,...,5641.0,1,1,1,1,1,1,1,1,1
2,38,I,105100,17,A,0.0,0,5,N,0,...,813.0,1,1,1,1,1,1,1,1,1
3,38,I,105100,18,A,0.0,0,5,N,0,...,863.0,1,1,1,1,1,1,1,1,1
4,38,I,105200,2,A,0.0,0,5,N,0,...,,1,1,1,1,-1,-1,4,4,3


In [11]:
df.dtypes

agg_level_pseo                    int64
inst_level                       object
institution                      object
degree_level                      int64
cip_level                        object
cipcode                           int64
grad_cohort                       int64
grad_cohort_years                 int64
geo_level                        object
geography                         int64
ind_level                        object
industry                         object
y1_grads_emp                    float64
y1_grads_emp_instate            float64
y5_grads_emp                    float64
y5_grads_emp_instate            float64
y10_grads_emp                   float64
y10_grads_emp_instate           float64
y1_grads_nme                    float64
y5_grads_nme                    float64
y10_grads_nme                   float64
status_y1_grads_emp               int64
status_y1_grads_emp_instate       int64
status_y5_grads_emp               int64
status_y5_grads_emp_instate       int64


In [12]:
earnings_df.dtypes

agg_level_pseo              int64
inst_level                 object
institution                object
degree_level                int64
cip_level                  object
cipcode                   float64
grad_cohort                 int64
grad_cohort_years           int64
geo_level                  object
geography                   int64
ind_level                  object
industry                    int64
y1_p25_earnings           float64
y1_p50_earnings           float64
y1_p75_earnings           float64
y1_grads_earn             float64
y5_p25_earnings           float64
y5_p50_earnings           float64
y5_p75_earnings           float64
y5_grads_earn             float64
y10_p25_earnings          float64
y10_p50_earnings          float64
y10_p75_earnings          float64
y10_grads_earn            float64
y1_ipeds_count            float64
y5_ipeds_count            float64
y10_ipeds_count           float64
status_y1_earnings          int64
status_y1_grads_earn        int64
status_y5_earn

In [None]:
merged_df= df.merge(earnings_df, how= 'left', on='degree_level')
merged_df.head()

In [8]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['inst_level'] = le.fit_transform(df['inst_level'])
df.dtypes

agg_level_pseo                    int64
inst_level                        int64
institution                      object
degree_level                      int64
cip_level                        object
cipcode                           int64
grad_cohort                       int64
grad_cohort_years                 int64
geo_level                        object
geography                         int64
ind_level                        object
industry                         object
y1_grads_emp                    float64
y1_grads_emp_instate            float64
y5_grads_emp                    float64
y5_grads_emp_instate            float64
y10_grads_emp                   float64
y10_grads_emp_instate           float64
y1_grads_nme                    float64
y5_grads_nme                    float64
y10_grads_nme                   float64
status_y1_grads_emp               int64
status_y1_grads_emp_instate       int64
status_y5_grads_emp               int64
status_y5_grads_emp_instate       int64


In [9]:
# Object columns are dropped for analysis
drop_objects_df = df.drop(columns=['institution', 'cip_level','geo_level', 'industry','ind_level'])
drop_objects_df= drop_objects_df.dropna()
drop_objects_df.head()

Unnamed: 0,agg_level_pseo,inst_level,degree_level,cipcode,grad_cohort,grad_cohort_years,geography,y1_grads_emp,y1_grads_emp_instate,y5_grads_emp,...,y10_grads_nme,status_y1_grads_emp,status_y1_grads_emp_instate,status_y5_grads_emp,status_y5_grads_emp_instate,status_y10_grads_emp,status_y10_grads_emp_instate,status_y1_grads_nme,status_y5_grads_nme,status_y10_grads_nme
0,38,0,5,0,0,3,0,49897.0,27741.0,32189.0,...,7281.0,1,1,1,1,1,1,1,1,1
1,38,0,7,0,0,5,0,14182.0,8311.0,8975.0,...,1149.0,1,1,1,1,1,1,1,1,1
2,38,0,17,0,0,5,0,2238.0,992.0,1401.0,...,191.0,1,1,1,1,1,1,1,1,1
3,38,0,18,0,0,5,0,1987.0,1208.0,1208.0,...,256.0,1,1,1,1,1,1,1,1,1
10,38,0,5,0,0,3,0,8795.0,7012.0,4493.0,...,993.0,1,1,1,1,1,1,1,1,1


In [18]:
# Define features dataset
X= drop_objects_df.drop(columns=['status_y5_grads_emp_instate'])
X.head()

Unnamed: 0,agg_level_pseo,inst_level,degree_level,cipcode,grad_cohort,grad_cohort_years,geography,y1_grads_emp,y1_grads_emp_instate,y5_grads_emp,...,y5_grads_nme,y10_grads_nme,status_y1_grads_emp,status_y1_grads_emp_instate,status_y5_grads_emp,status_y10_grads_emp,status_y10_grads_emp_instate,status_y1_grads_nme,status_y5_grads_nme,status_y10_grads_nme
0,38,0,5,0,0,3,0,49897.0,27741.0,32189.0,...,10235.0,7281.0,1,1,1,1,1,1,1,1
1,38,0,7,0,0,5,0,14182.0,8311.0,8975.0,...,2424.0,1149.0,1,1,1,1,1,1,1,1
2,38,0,17,0,0,5,0,2238.0,992.0,1401.0,...,300.0,191.0,1,1,1,1,1,1,1,1
3,38,0,18,0,0,5,0,1987.0,1208.0,1208.0,...,379.0,256.0,1,1,1,1,1,1,1,1
10,38,0,5,0,0,3,0,8795.0,7012.0,4493.0,...,1892.0,993.0,1,1,1,1,1,1,1,1


In [19]:
# Define target dataset
y = drop_objects_df["status_y5_grads_emp_instate"].ravel()
y[:5]

array([1, 1, 1, 1, 1])

In [20]:
# Splitting into Train and Test sets.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [21]:
# Creating a StandardScaler instance.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [22]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [23]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [24]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [25]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

ValueError: Shape of passed values is (1, 1), indices imply (2, 2)

In [27]:
# Calculating the accuracy score.
acc_score = balanced_accuracy_score(y_test, predictions)

In [29]:
# Displaying results
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, predictions))

Accuracy Score : 1.0
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          1       1.00      1.00      0.00      1.00      0.00      0.00      4711

avg / total       1.00      1.00      0.00      1.00      0.00      0.00      4711

