#### This notebook contains the analysis presented in the ICAIF 2021 paper 'A Machine Learning Approach to Detect Early Signs of Startup Success'.

Author: Abhinav Nadh Thirupathi

Run this notebook from top to bottom to reproduce the results.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the study dataset; low_memory=False makes pandas parse the file in one
# pass instead of in chunks.
data = pd.read_csv(
    filepath_or_buffer="data/study/study_data.csv",
    low_memory=False,
)

## Data Normalization

In [None]:
from scipy import stats

cols = data.columns.values

# Column-name prefixes of the feature families that get standardized
prefixes_to_scale = ('Details.Description', 'Website.', 'Overview', 'Education', 'Major')
# Columns that match a prefix above but are left untouched
skip_scaling = ("Overview.Gender.Agender", "Overview.Gender.Non-Binary")


def _zscore_within_group(values):
    """Sample z-score (ddof=1) of one group's values; NaNs are omitted from the statistics."""
    return stats.zscore(values, ddof=1, nan_policy='omit')


# The last two columns are never scanned. Every other column that matches a prefix
# is overwritten in place with its z-score computed separately inside each
# 'Details.Years Since Founded' group.
for name in cols[:-2]:
    if not name.startswith(prefixes_to_scale) or name in skip_scaling:
        continue
    data[name] = data.groupby('Details.Years Since Founded')[name].transform(_zscore_within_group)

## Leave-One-Out Cross-Validation (LOOCV)

In [None]:
# Separate the frame into Y (a copy of its last two columns) and X (everything
# except the target and the years-since-founded column).
trailing_cols = data.columns[-2:]
Y = data.loc[:, trailing_cols].copy()
X = data.drop(['Target', 'Details.Years Since Founded'], axis=1)

In [None]:
import xgboost as xgb

# Fit an XGBoost classifier (seeded with random_state=1) on the full feature
# matrix; this fitted model feeds the importance analyses below.
xg = xgb.XGBClassifier(random_state=1)
xg.fit(X=X, y=Y['Target'])

### Permutation Importance

In [None]:
from sklearn import inspection

# Permutation importance of the fitted model `xg`, evaluated on X / Y['Target'];
# each feature is shuffled n_repeats times, with a fixed seed and all CPU cores.
r = inspection.permutation_importance(
    estimator=xg,
    X=X,
    y=Y['Target'],
    n_repeats=3160,
    random_state=1,
    n_jobs=-1,
)

In [None]:
# List, from most to least important, the features whose mean importance stays
# above zero after subtracting two standard deviations.
ranked_idx = np.argsort(r.importances_mean)[::-1]
for idx in ranked_idx:
    mean_imp = r.importances_mean[idx]
    std_imp = r.importances_std[idx]
    if mean_imp - 2 * std_imp > 0:
        print(f"{X.columns.values[idx]:<8}: {mean_imp:.3f} +/- {std_imp:.3f}")

### SHAP Feature Importance

In [None]:
import shap

shap_values = shap.TreeExplainer(xg).shap_values(X)

# Mean absolute SHAP value per feature (averaged over axis 0), computed once and
# reused for both the feature names and the importance column.
mean_abs_shap = np.abs(shap_values).mean(0)
ascending_order = np.argsort(mean_abs_shap)

# Build the (feature, importance) table and display it with the largest values first.
pd.DataFrame(
    zip(X.columns[ascending_order], mean_abs_shap[ascending_order]),
    columns=["Feature", "Importance"],
).sort_values(by=['Importance'], ascending=False)

In [None]:
# Bar-style SHAP summary plot for the features in X
shap.summary_plot(shap_values, features=X, plot_type="bar")

### Performance Metrics

In [None]:
import xgboost as xgb
from sklearn import model_selection
from sklearn import metrics

# Out-of-sample class probabilities via leave-one-out cross-validation: each row
# is predicted by a model fitted on all the other rows.
xg1 = xgb.XGBClassifier(random_state=1)
loo_splitter = model_selection.LeaveOneOut()
Y_proba = model_selection.cross_val_predict(
    estimator=xg1,
    X=X,
    y=Y['Target'],
    cv=loo_splitter,
    n_jobs=-1,
    method='predict_proba',
)

In [None]:
# Hard class labels: for each row, the column index that sorts last, i.e. the
# class with the larger probability (assumes Y_proba has exactly two columns — TODO confirm).
Y_hat = np.argsort(Y_proba,axis=1)[:,1]
# Probability assigned to class 1, used by the probability-based metrics
Y_proba1 = Y_proba[:,1]

print("AUC        : ", metrics.roc_auc_score(Y['Target'], Y_proba1))
print("Accuracy   : ", metrics.accuracy_score(Y['Target'], Y_hat))
print("Precision  : ", metrics.precision_score(Y['Target'], Y_hat))
print("Recall     : ", metrics.recall_score(Y['Target'], Y_hat))
print("F-score    : ", metrics.f1_score(Y['Target'], Y_hat))
# The Brier score is the mean squared error of the predicted probabilities, so it
# must be given Y_proba1; with the hard labels Y_hat it collapses to 1 - accuracy.
print("Brier Score: ", metrics.brier_score_loss(Y['Target'], Y_proba1))

### Prediction Thresholds

In [None]:
# ROC curve (false-positive rates, true-positive rates and the matching decision
# thresholds) computed from the class-1 probabilities.
fpr, tpr, thresholds = metrics.roc_curve(y_true=Y['Target'], y_score=Y_proba1)

In [None]:
# One left-aligned, 30-character column per value. The header uses the same
# three-slot format as the rows; with only two slots the 'Threshold' label was
# silently dropped.
row_format = '{:<30}{:<30}{:<30}'
print(row_format.format('FPR', 'TPR', 'Threshold'))
for x, y, z in zip(fpr,tpr,thresholds):
    print(row_format.format(x, y, z))

### Reliability Diagram

In [None]:
from sklearn import calibration

# NOTE(review): these probabilities come from scoring the model `xg` on X itself,
# not from the leave-one-out predictions (Y_proba1) — confirm this is intended.
probs = xg.predict_proba(X)[:,1]
# Bin the predicted probabilities into 10 bins and get, per bin, the observed
# fraction of positives and the mean predicted probability.
fraction_of_positives, mean_predicted_value = calibration.calibration_curve(Y['Target'], probs, n_bins = 10)

# plt.figure() returns a Figure; it is kept under the original name `ax1`
ax1 = plt.figure()

plt.plot(mean_predicted_value, fraction_of_positives, marker = '.', label = 'XGBoost')
plt.xlabel('Mean Predicted Value')
plt.ylabel('Fraction of Positives')
# Draw the legend so the 'XGBoost' label passed to plot() is actually shown
plt.legend()
plt.tight_layout()
plt.show()