In [27]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score

from imblearn.over_sampling import SMOTE

import pickle

In [2]:
# Load the prepared train / test loan tables. low_memory=False asks pandas to
# parse each file in one pass rather than in chunks.
df_train = pd.read_csv("df_train", low_memory=False)
df_test = pd.read_csv("df_test", low_memory=False)

In [3]:
# Order each table by ROI (ascending) and record every row's fractional
# position in that ordering as `Rank`, i.e. position / row count, in [0, 1).
df_train = df_train.sort_values(by='ROI')
df_train['Rank'] = np.arange(len(df_train)) / len(df_train)

df_test = df_test.sort_values(by='ROI')
df_test['Rank'] = np.arange(len(df_test)) / len(df_test)

In [7]:
# Load the per-ZIP acceptance table and relabel its first column 'ZIP'
# (whatever it was called in the CSV); the remaining column names are kept.
zip_df = pd.read_csv("../data/2015_loan_data/pct_accepted_by_zip3_2015.csv", low_memory=False)
zip_df.columns = ['ZIP', *zip_df.columns[1:]]

In [8]:
# Acceptance rate averaged over ZIP rows, weighting each row by the number of
# loan applications considered there.
weighted_accepts = (zip_df['pct_accepted'] * zip_df['num_loan_apps_considered']).sum()
mean_accept_rate = weighted_accepts / zip_df['num_loan_apps_considered'].sum()
print(mean_accept_rate)

0.0879457740068084


In [9]:
# Cast the ZIP column to int in both loan tables
# (presumably so the dtype lines up with zip_df.ZIP for the merge — verify).
df_train['ZIP'] = df_train['ZIP'].astype(int)
df_test['ZIP'] = df_test['ZIP'].astype(int)

In [10]:
# Unweighted mean and standard deviation of the per-ZIP acceptance rate;
# clipped_pct() reads these two globals to build its clipping band.
pct_acc_mean = zip_df['pct_accepted'].mean()
pct_acc_std = zip_df['pct_accepted'].std()

In [116]:
# Attach the per-ZIP acceptance columns to every loan. No `how` is given, so
# this is pandas' default inner join: loans whose ZIP is absent from zip_df
# are dropped.
merged_df_train = df_train.merge(zip_df, on='ZIP')
merged_df_test = df_test.merge(zip_df, on='ZIP')

In [125]:
def clipped_pct(x, mean=None, std=None, std_mult=1.5):
    """Clamp an acceptance percentage to the band mean +/- std_mult * std.

    Parameters
    ----------
    x : float
        Value to clamp.
    mean, std : float, optional
        Centre and spread of the band. When omitted they fall back to the
        notebook-level globals ``pct_acc_mean`` / ``pct_acc_std``, so
        ``Series.map(clipped_pct)`` keeps working exactly as before.
    std_mult : float, optional
        Half-width of the band in standard deviations (default 1.5).

    Returns
    -------
    float
        The upper bound if ``x`` is above the band, the lower bound if it is
        below, otherwise ``x`` unchanged. NaN compares false against both
        bounds and is therefore returned as-is.
    """
    # Resolve the defaults at call time so later changes to the globals are
    # picked up, matching the original behaviour.
    if mean is None:
        mean = pct_acc_mean
    if std is None:
        std = pct_acc_std

    upper = mean + std_mult * std
    lower = mean - std_mult * std
    if x > upper:
        return upper
    elif x < lower:
        return lower
    return x

In [127]:
# NOTE: this module-level std_mult is not read below; clipped_pct() defines
# its own std_mult internally.
std_mult = 1.5

# For each merged table: clamp the ZIP acceptance rate, then express every
# row's clamped rate relative to that table's own mean (mean / value), so
# ZIPs with a low acceptance rate get a factor above 1.
for merged in (merged_df_train, merged_df_test):
    merged['clipped_pct'] = merged['pct_accepted'].map(clipped_pct)
    merged['adjustment_factor'] = merged['clipped_pct'].mean() / merged['clipped_pct']

In [128]:
def adjust_ROI(df):
    """Add an ``adjusted_ROI`` column to ``df`` in place and return ``df``.

    For every row:

        adjusted_ROI = ROI + |ROI| * (adjustment_factor - mean(adjustment_factor) + 1)

    where the mean is taken over ``df`` itself.

    The previous implementation looped over the rows but assigned each row's
    scalar adjustment to the *entire* column, so after the loop every row
    carried the last row's adjustment. The computation is now vectorized and
    genuinely per-row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ROI`` and ``adjustment_factor`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with ``adjusted_ROI`` set.
    """
    mean_adj_factor = df.adjustment_factor.mean()
    # NOTE(review): the `+ 1` is kept from the original formula, but the
    # later alpha-sweep cell computes the adjustment without it — confirm
    # which of the two is intended.
    df['adjusted_ROI'] = df.ROI + df.ROI.abs() * (df.adjustment_factor - mean_adj_factor + 1)
    return df

In [133]:
# Sweep over adjustment strengths. For each alpha: shift every loan's ROI in
# proportion to how far its adjustment factor sits from the training mean,
# re-rank the loans on the adjusted ROI, and score the per-ZIP mean adjusted
# rank against a flat prediction of 50.
alphas = [1]
scores_pure_model = []

# Loop-invariant reference point. It is computed on the training table only
# and is applied to the test table as well.
mean_adj_factor = merged_df_train.adjustment_factor.mean()


def _adjusted_roi(frame, alpha):
    """Return ROI + |ROI| * alpha * (adjustment_factor - training mean factor), row by row."""
    return frame.ROI + frame.ROI.abs() * alpha * (frame.adjustment_factor - mean_adj_factor)


def _with_adjusted_rank(frame):
    """Return `frame` sorted by adjusted_ROI with its 0-100 position stored in adjusted_Rank."""
    ranked = frame.sort_values(by=['adjusted_ROI'])
    ranked['adjusted_Rank'] = np.arange(ranked.shape[0]) / ranked.shape[0] * 100
    return ranked


for alpha in alphas:
    # Vectorized; replaces a row-wise DataFrame.apply that also rebound the
    # name `adjust_ROI` to a function with a different signature.
    merged_df_train['adjusted_ROI'] = _adjusted_roi(merged_df_train, alpha)
    merged_df_test['adjusted_ROI'] = _adjusted_roi(merged_df_test, alpha)

    train_final = _with_adjusted_rank(merged_df_train)
    test_final = _with_adjusted_rank(merged_df_test)

    # Per-ZIP loan count and mean ranks, indexed by ZIP. Built column by
    # column instead of the nested-dict .agg() renamer, which pandas
    # deprecated (FutureWarning) and later removed.
    by_zip = train_final.groupby('ZIP')
    train_final_data = pd.DataFrame({
        'Count': by_zip['ZIP'].count(),
        'adj_rank_mean': by_zip['adjusted_Rank'].mean(),
        'rank_mean': by_zip['Rank'].mean(),
    })
    ranked_df = train_final_data.reset_index()[['ZIP', 'Count', 'adj_rank_mean', 'rank_mean']]
    ranked_df['Predicted_mean'] = 50
    adj_score = r2_score(ranked_df.adj_rank_mean * ranked_df.Count,
                         ranked_df.Predicted_mean * ranked_df.Count)
    scores_pure_model.append(adj_score)

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


In [134]:
# Bring the unadjusted ranks from the 0-1 scale onto the same 0-100 scale as
# adjusted_Rank.
# NOTE: this scales in place, so re-running the cell multiplies by 100 again.
ranked_df['rank_mean'] *= 100
train_final['Rank'] *= 100
test_final['Rank'] *= 100

In [135]:
# Spot check: training loans whose ZIP value is 7.
train_final.loc[train_final['ZIP'] == 7]

Unnamed: 0,proba,true_val,int_rate,amnt,total_pymnt,term,ZIP,ROI,Real_ROI,annualized_amnt,Rank,pct_accepted,num_loan_apps_considered,clipped_pct,adjustment_factor,adjusted_ROI,adjusted_Rank
218170,0.63986,0,0.1824,18000.0,25561.684514,5.0,7,-0.287025,1307.938115,3600.0,15.868328,0.022472,91.0,0.037509,2.456851,0.116041,99.986721
218169,0.664469,0,0.1449,17500.0,19074.72,5.0,7,-0.317661,304.183974,3500.0,10.736706,0.022472,91.0,0.037509,2.456851,0.128427,99.990842


In [136]:
# Spot check: test loans whose ZIP value is 7.
test_final.loc[test_final['ZIP'] == 7]

Unnamed: 0,proba,true_val,int_rate,amnt,total_pymnt,term,ZIP,ROI,Real_ROI,annualized_amnt,Rank,pct_accepted,num_loan_apps_considered,clipped_pct,adjustment_factor,adjusted_ROI,adjusted_Rank
345718,0.43399,1,0.1269,20000,13923.86,3.0,7,-0.167404,-2274.171365,6666.666667,29.964636,0.022472,91.0,0.037509,2.446901,0.066014,99.900791


In [138]:
# Mean and standard deviation of the adjusted ROI, computed separately for
# each split; used to set the buy thresholds.
train_adjusted_roi = train_final['adjusted_ROI']
mean_ROI_train, std_ROI_train = train_adjusted_roi.mean(), train_adjusted_roi.std()

test_adjusted_roi = test_final['adjusted_ROI']
mean_ROI_test, std_ROI_test = test_adjusted_roi.mean(), test_adjusted_roi.std()

In [139]:
# "Buy" every loan whose adjusted ROI is more than 1.5 standard deviations
# above its own split's mean.
buy_threshold_train = mean_ROI_train + 1.5 * std_ROI_train
buy_threshold_test = mean_ROI_test + 1.5 * std_ROI_test

loans_to_buy_train = train_final.loc[train_final['adjusted_ROI'] > buy_threshold_train]
loans_to_buy_test = test_final.loc[test_final['adjusted_ROI'] > buy_threshold_test]

In [142]:
# (rows, columns) of the selected test loans, i.e. how many loans were bought.
loans_to_buy_test.shape

(1726, 17)

In [148]:
# Training split: total Real_ROI over total annualized amount for the bought
# loans, minus the same ratio for all training loans.
return_ROI = loans_to_buy_train['Real_ROI'].sum() / loans_to_buy_train['annualized_amnt'].sum()
portfolio_ROI = train_final['Real_ROI'].sum() / train_final['annualized_amnt'].sum()
print(return_ROI - portfolio_ROI)

0.02166518755344178


In [147]:
# Test split: total Real_ROI over total annualized amount for the bought
# loans, minus the same ratio for all test loans.
return_ROI = loans_to_buy_test['Real_ROI'].sum() / loans_to_buy_test['annualized_amnt'].sum()
portfolio_ROI = test_final['Real_ROI'].sum() / test_final['annualized_amnt'].sum()
print(return_ROI - portfolio_ROI)

0.016110064996070168


So a portfolio in which we **actively** favour traditionally disfavoured communities still outperforms the overall portfolio by about 2.2 percentage points (0.0217) on the training data. This is an interesting result: it suggests we may be able to target disenfranchised communities with investment and still deliver above-average results for our investors. The same comparison on the test data, computed above, replicates the effect with a smaller but still positive gap of about 1.6 percentage points (0.0161).

I have written the subset of loans we would buy to CSV (see the cell below). Perhaps Anthony can run regressions to see how the loans we buy compare with the overall portfolio in terms of favouring various groups?

In [149]:
# Persist the selected loans for downstream analysis; the row index is not written.
loans_to_buy_train.to_csv('loans_bought_2014', index=False)
loans_to_buy_test.to_csv('loans_bought_2015', index=False)