In [1]:
def run_exp():
    """Open the Snowpark session used by this experiment and return it.

    Connects as user ADITYASINGH to the FIRST_DB.PUBLIC schema using the
    FOSFOR_INSIGHT_WH warehouse. The password is read from the
    ``SF_Password`` environment variable so it never appears in the notebook.

    Returns:
        The created Snowpark ``Session`` with ``sql_simplifier_enabled`` set
        to True. Bind it at the call site (``session = run_exp()``) to keep
        using it after this function returns.

    Raises:
        RuntimeError: if ``SF_Password`` is unset or empty.
    """
    import os

    # Fail fast with a clear message (and before the heavier Snowflake
    # imports) instead of handing password=None to the connector.
    password = os.environ.get('SF_Password')
    if not password:
        raise RuntimeError(
            "Environment variable 'SF_Password' is not set; "
            "cannot open a Snowflake session."
        )

    # NOTE: names imported here are local to this function; code that runs
    # outside run_exp() has to import them itself.
    from snowflake.snowpark import Session, FileOperation
    from snowflake.ml.modeling.pipeline import Pipeline
    from snowflake.ml.modeling.preprocessing import MinMaxScaler, OrdinalEncoder
    from snowflake.ml.modeling.metrics import mean_squared_error, mean_absolute_error, r2_score
    from snowflake.ml.modeling.xgboost import XGBClassifier

    connection_parameters = {
        "account": "ug94937.us-east4.gcp",
        "user": "ADITYASINGH",
        "password": password,
        "role": "ADITYASINGH",  # optional
        "warehouse": "FOSFOR_INSIGHT_WH",  # optional
    #     "authenticator": "externalbrowser", # optional
        "database": "FIRST_DB",  # optional
        "schema": "PUBLIC",  # optional
    }

    session = Session.builder.configs(connection_parameters).create()
    session.sql_simplifier_enabled = True

    # Previously the session was created and then dropped on return.
    return session
    
    

In [13]:
# Read EMPLOYEE without its ROW column, then split it 90% / 10% into
# train and test frames. seed=0 is passed so the split is seeded explicitly.
employee_features = session.table('EMPLOYEE').drop('ROW')
df_train, df_test = employee_features.random_split(weights=[0.9, 0.1], seed=0)

In [15]:
# Preview the training split; show() prints rows as a text table.
df_train.show()

-------------------------------------------------------------------------------------------------------------------------------------------
|"EDUCATION"  |"JOININGYEAR"  |"CITY"     |"PAYMENTTIER"  |"AGE"  |"GENDER"  |"EVERBENCHED"  |"EXPERIENCEINCURRENTDOMAIN"  |"LEAVEORNOT"  |
-------------------------------------------------------------------------------------------------------------------------------------------
|Bachelors    |2017           |Bangalore  |3              |34     |Male      |No             |0                            |0             |
|Bachelors    |2013           |Pune       |1              |28     |Female    |No             |3                            |1             |
|Bachelors    |2014           |New Delhi  |3              |38     |Female    |No             |2                            |0             |
|Masters      |2016           |Bangalore  |3              |27     |Male      |No             |5                            |1             |
|Masters      |2017 

In [25]:
# Feature columns: every column of the training frame except the
# LEAVEORNOT label (raises ValueError if the label column is missing).
cols = df_train.columns
del cols[cols.index('LEAVEORNOT')]
print(cols)

['EDUCATION', 'JOININGYEAR', 'CITY', 'PAYMENTTIER', 'AGE', 'GENDER', 'EVERBENCHED', 'EXPERIENCEINCURRENTDOMAIN']


In [26]:
# Fetch the EMPLOYEE column metadata; the collected rows are later read
# through their 'name' and 'type' fields.
describe_employee = session.sql("DESCRIBE TABLE EMPLOYEE")
data_schema = describe_employee.collect()

In [27]:
# Type keywords that mark a column as categorical. They are matched as
# substrings of each row's 'type' value.
categorical_types = ['VARCHAR','CHAR','STRING','TEXT','BOOL']

# Keep every non-label column whose type contains at least one keyword.
# NOTE(review): data_schema describes the full EMPLOYEE table, so this list
# is not limited to the columns kept in df_train — confirm that is intended.
categorical_columns = [
    column['name']
    for column in data_schema
    if column['name'] != 'LEAVEORNOT'
    and any(keyword in column['type'] for keyword in categorical_types)
]

In [28]:
# Numerical features are the feature columns that were not classified as
# categorical. sorted() pins the order: iterating a bare set of strings can
# yield a different order in every kernel session (hash randomisation),
# which would silently reshuffle the model's input columns between runs.
numerical_columns = sorted(set(cols) - set(categorical_columns))

In [29]:
# Show both feature groups, one list per line.
print(numerical_columns, categorical_columns, sep='\n')

['AGE', 'EXPERIENCEINCURRENTDOMAIN', 'JOININGYEAR', 'PAYMENTTIER']
['EDUCATION', 'CITY', 'GENDER', 'EVERBENCHED']


In [30]:
# Output column names for the ordinal encoder: source name plus an '_OE' suffix.
categorical_columns_oe = [name + '_OE' for name in categorical_columns]

In [31]:
# Show the encoded column names next to their source columns, one list per line.
print(categorical_columns_oe, categorical_columns, sep='\n')

['EDUCATION_OE', 'CITY_OE', 'GENDER_OE', 'EVERBENCHED_OE']
['EDUCATION', 'CITY', 'GENDER', 'EVERBENCHED']


In [33]:
# Build a pipeline that ordinal-encodes the categorical columns, min-max
# scales the numerical columns, and trains an XGBClassifier on LEAVEORNOT.
# The last step is still registered under the name "regressor".
feature_columns = categorical_columns_oe + numerical_columns

encoder_step = (
    "ord",
    OrdinalEncoder(input_cols=categorical_columns, output_cols=categorical_columns_oe),
)
# The scaler's output_cols reuse the input column names.
scaler_step = (
    "scaler",
    MinMaxScaler(input_cols=numerical_columns, output_cols=numerical_columns),
)
model_step = (
    "regressor",
    XGBClassifier(
        input_cols=feature_columns,
        label_cols=["LEAVEORNOT"],
        output_cols=['PREDICTION'],
        n_jobs=-1,
    ),
)
pipe = Pipeline(steps=[encoder_step, scaler_step, model_step])

# Fit the pipeline on the training split
xgb_model = pipe.fit(df_train)

# Predict on the held-out split and score the predictions.
# NOTE(review): MSE / MAE / R2 are regression metrics, but the model is a
# classifier — confirm whether classification metrics were intended.
df_test_pred = xgb_model.predict(df_test)
label_col, prediction_col = "LEAVEORNOT", "PREDICTION"
mse = mean_squared_error(df=df_test_pred, y_true_col_names=label_col, y_pred_col_names=prediction_col)
mae = mean_absolute_error(df=df_test_pred, y_true_col_names=label_col, y_pred_col_names=prediction_col)
r2 = r2_score(df=df_test_pred, y_true_col_name=label_col, y_pred_col_name=prediction_col)
print(f'MSE: {mse}', f'MAE: {mae}', f'R2: {r2}', sep='\n')

MSE: 0.1475054229934924
MAE: 0.147505
R2: 0.3736663336668855
