In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score

from imblearn.over_sampling import SMOTE

import pickle

In [2]:
# Load the train and test loan tables with identical read options.
df_train, df_test = (
    pd.read_csv(path, low_memory=False) for path in ("df_train", "df_test")
)

In [3]:
# Order both frames by ROI (ascending) and store each row's fractional
# position in that ordering as `Rank`, i.e. position / row count, in [0, 1).
df_test = df_test.sort_values(by='ROI')
df_train = df_train.sort_values(by='ROI')

df_test['Rank'] = np.arange(len(df_test)) / len(df_test)

df_train['Rank'] = np.arange(len(df_train)) / len(df_train)

In [65]:
# Per-ZIP acceptance statistics (the file name suggests 3-digit ZIP prefixes
# for 2014 -- confirm against the data source). The first column is relabelled
# 'ZIP' and the last two get descriptive names; columns in between keep the
# headers they have in the CSV.
zip_df = pd.read_csv("../data/2014_loan_data/pct_accepted_by_zip3_2014_BP.csv", low_memory=False)
zip_df.columns = [
    'ZIP',
    *zip_df.columns[1:zip_df.shape[1] - 2],
    'num_loan_apps_considered',
    'pct_accepted',
]

In [66]:
# Preview the first rows of zip_df to check the renamed columns.
zip_df.head()

Unnamed: 0,ZIP,2014_accpt_loans,2014_rejected_loans,num_loan_apps_considered,pct_accepted
0,7,2,63.0,65.0,0.030769
1,8,1,45.0,46.0,0.021739
2,10,379,2741.0,3120.0,0.121474
3,11,115,1367.0,1482.0,0.077598
4,12,93,803.0,896.0,0.103795


In [67]:
# Acceptance rate averaged over ZIPs, weighting each ZIP by the number of loan
# applications considered there.
apps_considered = zip_df['num_loan_apps_considered']
weighted_accept_total = (zip_df['pct_accepted'] * apps_considered).sum()
mean_accept_rate = weighted_accept_total / apps_considered.sum()
print(mean_accept_rate)

0.10874439151451491


In [68]:
# Cast ZIP to int in both loan frames so it has the same dtype on every frame
# that is later joined on it.
for loans in (df_train, df_test):
    loans['ZIP'] = loans['ZIP'].astype(int)

In [69]:
# Unweighted mean and standard deviation (pandas default, ddof=1) of the
# per-ZIP acceptance rate; clipped_pct() reads these two module-level values.
accept_rates = zip_df['pct_accepted']
pct_acc_mean = accept_rates.mean()
pct_acc_std = accept_rates.std()

In [70]:
# Attach the per-ZIP acceptance columns to every loan. merge() defaults to an
# inner join, so loans whose ZIP does not appear in zip_df are dropped here.
merged_df_train = df_train.merge(zip_df, on='ZIP')
merged_df_test = df_test.merge(zip_df, on='ZIP')

In [98]:
def clipped_pct(x, std_mult=1):
    """Clamp an acceptance rate to a band around the mean ZIP acceptance rate.

    The band is ``pct_acc_mean +/- std_mult * pct_acc_std``, where
    ``pct_acc_mean`` and ``pct_acc_std`` are module-level values that must be
    defined before this function is called.

    Parameters
    ----------
    x : float
        Acceptance rate to clamp.
    std_mult : float, optional
        Half-width of the band in standard deviations. Defaults to 1, the
        value that used to be hard-coded in the body, so existing
        single-argument calls (e.g. ``Series.map(clipped_pct)``) are unchanged.

    Returns
    -------
    float
        The upper band edge if ``x`` is above it, the lower band edge if ``x``
        is below it, otherwise ``x`` itself. NaN fails both comparisons and is
        therefore returned unchanged.
    """
    half_width = std_mult * pct_acc_std
    upper = pct_acc_mean + half_width
    lower = pct_acc_mean - half_width
    if x > upper:
        return upper
    if x < lower:
        return lower
    return x

In [99]:
# For each split: clip every loan's ZIP acceptance rate, then derive a
# multiplicative factor = (split-wide mean of the clipped rate) / (this loan's
# clipped rate). Loans from ZIPs with a below-average clipped rate therefore
# get a factor above 1.
for merged in (merged_df_train, merged_df_test):
    merged['clipped_pct'] = merged['pct_accepted'].map(clipped_pct)
    merged['adjustment_factor'] = merged['clipped_pct'].mean() / merged['clipped_pct']

In [100]:
def adjust_ROI(df):
    """Add an ``adjusted_ROI`` column to ``df`` in place and return ``df``.

    For every row::

        adjustment   = |ROI| * (adjustment_factor - mean(adjustment_factor) + 1)
        adjusted_ROI = ROI + adjustment

    The earlier row loop assigned the *entire* column on each pass using one
    row's scalar adjustment, so every row ended up with the last row's
    adjustment. Computing the adjustment as a Series applies each row's own
    value and avoids the per-row ``iloc`` lookups.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ROI`` and ``adjustment_factor`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with ``adjusted_ROI`` added or
        overwritten.
    """
    mean_adj_factor = df['adjustment_factor'].mean()
    adjustment = df['ROI'].abs() * (df['adjustment_factor'] - mean_adj_factor + 1)
    df['adjusted_ROI'] = df['ROI'] + adjustment
    return df

In [101]:
def _alpha_adjusted_roi(frame, alpha, mean_adj_factor):
    """Return ROI + |ROI| * alpha * (adjustment_factor - mean_adj_factor) per row."""
    return frame['ROI'] + frame['ROI'].abs() * alpha * (frame['adjustment_factor'] - mean_adj_factor)


def _with_adjusted_rank(frame):
    """Return a copy sorted by adjusted_ROI with a 0-100 positional `adjusted_Rank`."""
    ranked = frame.sort_values(by=['adjusted_ROI'])
    n_rows = ranked.shape[0]
    ranked['adjusted_Rank'] = np.arange(n_rows) / n_rows * 100
    return ranked


alphas = [1]
scores_pure_model = []

# The training-set mean factor is used to centre BOTH splits and does not
# depend on alpha, so it is computed once outside the loop.
mean_adj_factor = merged_df_train.adjustment_factor.mean()

for alpha in alphas:
    # Vectorised column arithmetic replaces a row-wise apply(axis=1) closure
    # that was redefined on every iteration under the name `adjust_ROI`.
    merged_df_train['adjusted_ROI'] = _alpha_adjusted_roi(merged_df_train, alpha, mean_adj_factor)
    merged_df_test['adjusted_ROI'] = _alpha_adjusted_roi(merged_df_test, alpha, mean_adj_factor)

    train_final = _with_adjusted_rank(merged_df_train)
    test_final = _with_adjusted_rank(merged_df_test)

    # Per-ZIP loan count and mean ranks. Columns are named explicitly instead
    # of using a nested-dict renamer inside .agg() followed by a positional
    # relabel: the nested form emits a FutureWarning on older pandas and
    # raises SpecificationError from pandas 1.0 onwards.
    zip_groups = train_final.groupby('ZIP')
    train_final_data = zip_groups[['adjusted_Rank', 'Rank']].mean().rename(
        columns={'adjusted_Rank': 'adj_rank_mean', 'Rank': 'rank_mean'}
    )
    train_final_data.insert(0, 'Count', zip_groups.size())
    ranked_df = train_final_data.reset_index()

    # Score the count-weighted mean adjusted rank of each ZIP against a flat
    # prediction of 50 for every ZIP.
    ranked_df['Predicted_mean'] = 50
    adj_score = r2_score((ranked_df.adj_rank_mean*ranked_df.Count), (ranked_df.Predicted_mean*ranked_df.Count))
    scores_pure_model.append(adj_score)

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


In [102]:
# Put the unadjusted rank columns on the same 0-100 scale as adjusted_Rank.
# NOTE: this multiplies in place, so running the cell a second time scales the
# columns by 100 again; re-run the cells that build these frames first.
for frame, column in (
    (ranked_df, 'rank_mean'),
    (train_final, 'Rank'),
    (test_final, 'Rank'),
):
    frame[column] = frame[column] * 100

In [103]:
# Spot-check: training rows for ZIP 7, to compare Rank with adjusted_Rank.
train_final[train_final.ZIP == 7]

Unnamed: 0,proba,true_val,int_rate,amnt,total_pymnt,term,ZIP,ROI,Real_ROI,annualized_amnt,Rank,2014_accpt_loans,2014_rejected_loans,num_loan_apps_considered,pct_accepted,clipped_pct,adjustment_factor,adjusted_ROI,adjusted_Rank
218169,0.664469,0,0.1449,17500.0,19074.72,5.0,7,-0.317661,304.183974,3500.0,10.736706,2,63.0,65.0,0.030769,0.068506,1.643473,-0.125649,51.364754
218170,0.63986,0,0.1824,18000.0,25561.684514,5.0,7,-0.287025,1307.938115,3600.0,15.868328,2,63.0,65.0,0.030769,0.068506,1.643473,-0.113531,54.956935


In [104]:
# Same spot-check on the test split: rows for ZIP 7.
test_final[test_final.ZIP == 7]

Unnamed: 0,proba,true_val,int_rate,amnt,total_pymnt,term,ZIP,ROI,Real_ROI,annualized_amnt,Rank,2014_accpt_loans,2014_rejected_loans,num_loan_apps_considered,pct_accepted,clipped_pct,adjustment_factor,adjusted_ROI,adjusted_Rank
345718,0.43399,1,0.1269,20000,13923.86,3.0,7,-0.167404,-2274.171365,6666.666667,29.964636,2,63.0,65.0,0.030769,0.068506,1.638306,-0.067081,63.66321


In [105]:
# Mean and standard deviation of adjusted ROI, computed separately per split.
train_adj_roi = train_final['adjusted_ROI']
mean_ROI_train, std_ROI_train = train_adj_roi.mean(), train_adj_roi.std()

test_adj_roi = test_final['adjusted_ROI']
mean_ROI_test, std_ROI_test = test_adj_roi.mean(), test_adj_roi.std()

In [106]:
# Buy-list: loans whose adjusted ROI is more than 1.5 standard deviations above
# the mean adjusted ROI of their own split.
buy_cutoff_train = mean_ROI_train + 1.5*std_ROI_train
buy_cutoff_test = mean_ROI_test + 1.5*std_ROI_test
loans_to_buy_train = train_final[train_final['adjusted_ROI'] > buy_cutoff_train]
loans_to_buy_test = test_final[test_final['adjusted_ROI'] > buy_cutoff_test]

In [107]:
# (rows, columns) of the test-split buy-list, i.e. how many loans were selected.
loans_to_buy_test.shape

(1700, 19)

In [108]:
# Preview the selected test loans (lowest adjusted_ROI among those selected,
# since test_final is sorted ascending by adjusted_ROI).
loans_to_buy_test.head()

Unnamed: 0,proba,true_val,int_rate,amnt,total_pymnt,term,ZIP,ROI,Real_ROI,annualized_amnt,Rank,2014_accpt_loans,2014_rejected_loans,num_loan_apps_considered,pct_accepted,clipped_pct,adjustment_factor,adjusted_ROI,adjusted_Rank
283818,0.043623,0,0.0532,15000,16068.421629,3.0,404,0.026832,348.004305,5000.0,98.665347,95,1005.0,1100.0,0.086364,0.086364,1.299544,0.033823,99.508294
7001,0.034754,0,0.0532,12025,13033.110153,3.0,112,0.032194,327.060552,4008.333333,99.472427,2382,20744.0,23126.0,0.103001,0.103001,1.089634,0.033823,99.508583
49874,0.035353,0,0.0532,25000,26989.724449,3.0,190,0.031831,646.384908,8333.333333,99.434352,763,6729.0,7492.0,0.101842,0.101842,1.102034,0.033837,99.508872
189982,0.0414,0,0.0603,24000,26056.256722,3.0,207,0.034982,666.725601,8000.0,99.67088,1091,8695.0,9786.0,0.111486,0.111486,1.006705,0.033852,99.509162
158896,0.060165,0,0.0649,6600,7230.465614,3.0,441,0.02783,203.797491,2200.0,98.836109,810,8254.0,9064.0,0.089365,0.089365,1.255905,0.033866,99.509451


In [109]:
# Training split: total Real_ROI per unit of annualized amount for the
# buy-list, minus the same ratio for the whole training portfolio.
return_ROI = loans_to_buy_train['Real_ROI'].sum() / loans_to_buy_train['annualized_amnt'].sum()
portfolio_ROI = train_final['Real_ROI'].sum() / train_final['annualized_amnt'].sum()
print(return_ROI - portfolio_ROI)

0.026560826235351664


In [110]:
# Test split: total Real_ROI per unit of annualized amount for the buy-list,
# minus the same ratio for the whole test portfolio. Reuses (overwrites) the
# return_ROI / portfolio_ROI names from the training computation.
return_ROI = loans_to_buy_test['Real_ROI'].sum() / loans_to_buy_test['annualized_amnt'].sum()
portfolio_ROI = test_final['Real_ROI'].sum() / test_final['annualized_amnt'].sum()
print(return_ROI - portfolio_ROI)

0.07218479561086673


So a portfolio where we **actively** favour traditionally disfavoured communities still outperforms the overall portfolio by about 2.7% (the 0.0266 printed for the training data above). This is an interesting result, since we might be able to show that we can target disenfranchised communities with investment and still deliver above-average results for our investors. This figure is for the training data; it needs to be replicated on the test data, where the corresponding cell above prints 0.0722.

The selected subsets of loans have been written to CSV; perhaps Anthony can run regressions to see how the loans we buy compare with the overall portfolio in terms of favouring various groups, etc.

In [111]:
# Write both buy-lists to disk without the index column.
for bought, out_path in (
    (loans_to_buy_train, 'loans_bought_2014'),
    (loans_to_buy_test, 'loans_bought_2015'),
):
    bought.to_csv(out_path, index=False)

In [113]:
# Correlation matrix of adjusted rank vs. ZIP acceptance rate, restricted to
# the loans selected for purchase in the test split.
np.corrcoef(loans_to_buy_test.adjusted_Rank, loans_to_buy_test.pct_accepted)

array([[ 1.        , -0.39329985],
       [-0.39329985,  1.        ]])

In [125]:
# Same correlation over the whole test split (adjusted rank vs. acceptance rate).
np.corrcoef(test_final.adjusted_Rank, test_final.pct_accepted)

array([[ 1.        , -0.21186845],
       [-0.21186845,  1.        ]])

In [124]:
# Baseline for comparison: unadjusted rank vs. acceptance rate on the whole
# test split.
np.corrcoef(test_final.Rank, test_final.pct_accepted)

array([[1.       , 0.0171302],
       [0.0171302, 1.       ]])

In [126]:
# First rows of test_final, i.e. the loans with the lowest adjusted_ROI
# (the frame is sorted ascending by that column).
test_final.head()

Unnamed: 0,proba,true_val,int_rate,amnt,total_pymnt,term,ZIP,ROI,Real_ROI,annualized_amnt,Rank,2014_accpt_loans,2014_rejected_loans,num_loan_apps_considered,pct_accepted,clipped_pct,adjustment_factor,adjusted_ROI,adjusted_Rank
727,0.835835,0,0.1333,33600,39519.895741,3.0,79,-0.43886,1867.571125,11200.0,0.000288,184,913.0,1097.0,0.16773,0.139538,0.80432,-0.541859,0.0
1022,0.835118,1,0.1333,28000,26759.92,3.0,600,-0.438369,-419.617104,9333.333333,0.000577,1250,7661.0,8911.0,0.140276,0.139538,0.80432,-0.541253,0.000289
7019,0.829023,0,0.1333,28000,34103.184219,3.0,220,-0.434197,1902.235985,9333.333333,0.001731,454,2502.0,2956.0,0.153586,0.139538,0.80432,-0.536102,0.000578
7632,0.828336,1,0.1333,28000,8621.26,3.0,69,-0.433726,-9092.725251,9333.333333,0.002019,140,800.0,940.0,0.148936,0.139538,0.80432,-0.53552,0.000868
7845,0.827704,0,0.1333,25000,28468.916242,3.0,941,-0.433294,1106.600138,8333.333333,0.002308,1020,4902.0,5922.0,0.172239,0.139538,0.80432,-0.534987,0.001157


In [1]:
# Scatter of unadjusted rank against ZIP acceptance rate for the test split.
# Needs the import cell at the top of the notebook and the cells that build
# test_final to have been run in the current kernel; the recorded NameError
# came from executing this cell on a kernel where `plt` had not been imported.
fig, ax = plt.subplots()
ax.plot(test_final.Rank, test_final.pct_accepted, 'o')
ax.set_xlabel('Rank')
ax.set_ylabel('pct_accepted')
ax.set_title('test_final: Rank vs. pct_accepted')
plt.show()

NameError: name 'plt' is not defined