In [1]:
# Acknowledgement: We consulted GeeksforGeeks for some of the syntax of these machine learning models, and the pandas
# documentation for general syntax questions about functions such as index, drop, dropna, etc.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.index.html 
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop.html 
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html 
# https://www.geeksforgeeks.org/machine-learning/how-to-plot-roc-curve-in-python/ 


# importing all required libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score



In [None]:
# ---------------------------------------------------------------------------
# Configuration: input/output locations and constants shared by every split.
# Paths are kept as plain strings so the resulting file names are unchanged.
# ---------------------------------------------------------------------------
DATA_DIR = "C:/Users/satavans/Downloads/60Sec_overlap_tv_comb/"
OUTPUT_ROOT = 'C:/Users/satavans/Downloads/RF_XGB_Results_Featimps/'
FEATIMP_DIR = OUTPUT_ROOT + 'Featimp_XGB_RF_Comb/feature_importance_comb_60s/'
RESULTS_DIR = OUTPUT_ROOT + 'Results_XGB_RF_Comb/results_comb_60s/'
ROC_DIR = OUTPUT_ROOT + 'ROCs/60s_comb_ROCs/'

N_SPLITS = 10   # files are numbered traincomb1..traincomb10 / validcomb1..validcomb10
SEED = 1234     # random_state used for both models

# columns that must not be used as predictors
NON_PREDICTOR_COLS = ['Status1', 'Status2', 'Status3', 'Status4', 'Status5',
                      'Status6', 'Status7', 'Status8', 'Status9', 'Status10',
                      'n.x', 'ID', 'X.1.x', 'id1', 'X.x', 'X.1.y', 'X.y',
                      'Treat.y', 'n.y']

for i in range(1, N_SPLITS + 1):
    # loading the training and validation data for this split
    train_raw = pd.read_csv(DATA_DIR + "traincomb" + str(i) + ".csv")
    val_raw = pd.read_csv(DATA_DIR + "validcomb" + str(i) + ".csv")

    # Same cleaning for both frames:
    #   1. drop the columns we don't want as predictors,
    #   2. drop the first remaining column
    #      (NOTE(review): assumed to be the unnamed index column - confirm),
    #   3. drop every row that still contains an NA value.
    train_data = (train_raw.drop(columns=NON_PREDICTOR_COLS)
                           .iloc[:, 1:]
                           .dropna(axis=0, how='any'))
    val_data = (val_raw.drop(columns=NON_PREDICTOR_COLS)
                       .iloc[:, 1:]
                       .dropna(axis=0, how='any'))

    # IDs of exactly the validation rows that survived the cleaning above.
    # Selecting by val_data.index keeps the IDs aligned with the predictions;
    # running dropna on the full raw frame instead can remove a different set
    # of rows because it also looks at the non-predictor columns.
    val_ids = val_raw.loc[val_data.index, 'ID'].to_numpy()

    # splitting data into treatment (target) and predictor variables
    train_target = train_data['Treat']
    train_predictors = train_data.drop('Treat', axis=1)
    val_target = val_data['Treat']
    val_predictors = val_data.drop('Treat', axis=1)

    # Class-balance weight for XGBoost: (# negatives) / (# positives).
    # Computed from the TRAINING labels only, so no information from the
    # validation set leaks into model fitting.
    pos_rate = (train_target == 0).sum() / (train_target != 0).sum()

    # Base estimators handed to the grid searches. GridSearchCV clones the
    # estimator it is given, so fitting these beforehand would be wasted work.
    model1 = RandomForestClassifier(n_estimators=400, max_depth=7, max_features=0.1,
                                    class_weight='balanced', random_state=SEED)
    model2 = XGBClassifier(n_estimators=400, max_depth=7, learning_rate=0.05,
                           scale_pos_weight=pos_rate, random_state=SEED,
                           reg_alpha=1, colsample_bylevel=1)

    # ---- hyperparameter tuning: XGBoost --------------------------------
    params_xgb = {
        'learning_rate': [0.05, 0.01, 0.1],
        'n_estimators': [200, 400, 500],
        'max_depth': [3, 5, 7],
        'random_state': [SEED],
        'reg_alpha': [0, 1, 10],
        'colsample_bylevel': [0.5, 1],
        'scale_pos_weight': [pos_rate]
    }
    xgb_grid = GridSearchCV(model2, param_grid=params_xgb, cv=5).fit(train_predictors, train_target)

    pred_y_xgb = xgb_grid.predict(val_predictors)
    prob_xgb = xgb_grid.predict_proba(val_predictors)[:, 1]   # P(Treat = 1)
    accuracy2 = accuracy_score(val_target, pred_y_xgb)

    print('XBG: ' + str(accuracy2))
    print("Best XGB parameters:", xgb_grid.best_params_)
    print(xgb_grid.best_estimator_)

    # ---- hyperparameter tuning: random forest --------------------------
    params_rf = {
        'max_features': [1.0, 0.33, 0.1],
        'n_estimators': [400],
        'max_depth': [3, 5, 7],
        'random_state': [SEED],
        'class_weight': ['balanced']
    }
    rf_grid = GridSearchCV(model1, param_grid=params_rf, cv=5).fit(train_predictors, train_target)

    pred_y_rf = rf_grid.predict(val_predictors)
    prob_rf = rf_grid.predict_proba(val_predictors)[:, 1]     # P(Treat = 1)
    accuracy1 = accuracy_score(val_target, pred_y_rf)

    print('RF: ' + str(accuracy1))
    print("Best RF parameters:", rf_grid.best_params_)
    print(rf_grid.best_estimator_)

    # ---- feature importances of the two tuned models -------------------
    feat_df = pd.DataFrame({
        'Feature': train_predictors.columns,
        'Importance_RF': rf_grid.best_estimator_.feature_importances_,
        'Importance_XGB': xgb_grid.best_estimator_.feature_importances_
    })
    feat_df.to_csv(FEATIMP_DIR + 'featimp' + str(i) + '.csv', index=False)

    # validation accuracy as a percentage (random forest first, then XGBoost)
    print('Accuracy:', accuracy1 * 100, '%')
    print('Accuracy:', accuracy2 * 100, '%')

    # ---- per-sample results, one CSV per model -------------------------
    for file_prefix, probs, preds in (('rf_results', prob_rf, pred_y_rf),
                                      ('xgb_results', prob_xgb, pred_y_xgb)):
        results_df = pd.DataFrame({
            'sample_ID': val_ids,
            'predicted_prob': probs,
            'target': val_target.values,
            'pred': preds
        })
        results_df.to_csv(RESULTS_DIR + file_prefix + str(i) + '.csv', index=False)

    # ---- ROC curves ------------------------------------------------------
    false_pos_rate_rf, true_pos_rate_rf, thresh = roc_curve(val_target, prob_rf)
    auc_rf = roc_auc_score(val_target, prob_rf)

    false_pos_rate_xgb, true_pos_rate_xgb, thresh1 = roc_curve(val_target, prob_xgb)
    auc_xgb = roc_auc_score(val_target, prob_xgb)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(false_pos_rate_rf, true_pos_rate_rf, label=f'RF (AUC = {auc_rf:.2f})')
    ax.plot(false_pos_rate_xgb, true_pos_rate_xgb, label=f'XGB (AUC = {auc_xgb:.2f})')
    # random prediction line
    ax.plot([0, 1], [0, 1], linestyle='--')

    # labels for graph
    ax.set_title('60S Eye + Vehicle ROC Curve')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.legend()
    fig.savefig(ROC_DIR + 'Split' + str(i) + '.png')
    plt.show()
    # release the figure so ten splits do not accumulate open figures
    plt.close(fig)

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/satavans/Downloads/60Sec_overlap_tv_comb/validcomb1.csv'