In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score

from imblearn.over_sampling import SMOTE

import pickle

In [2]:
# Read the training and test loan tables from the working directory.
# The file names carry no extension; both are parsed as CSV.
df_train, df_test = (
    pd.read_csv(path, low_memory=False) for path in ("df_train", "df_test")
)

In [4]:
def _with_roi_rank(frame):
    """Return ``frame`` sorted by ROI with a ``Rank`` column added.

    ``Rank`` is the row's position in the ascending-ROI ordering divided by
    the number of rows, so it runs from 0 up to (but not including) 1.
    """
    ordered = frame.sort_values(by=['ROI'])
    n_rows = ordered.shape[0]
    ordered['Rank'] = np.arange(0, n_rows) / n_rows
    return ordered


df_test = _with_roi_rank(df_test)
df_train = _with_roi_rank(df_train)

In [5]:
# Per-zip acceptance statistics. '3_dig_zip' is read as str
# (presumably so leading zeros survive -- confirm against the file).
zip_df = pd.read_csv(
    "../data/2014_loan_data/pct_accepted_by_zip3_2014_BP.csv",
    dtype={'3_dig_zip': str},
    low_memory=False,
)
# Relabel the first column as 'ZIP'; every other header keeps its name.
zip_df.columns = ['ZIP', *zip_df.columns[1:]]

In [6]:
# Overall acceptance rate: each zip area's rate weighted by the number of
# loans considered in that area.
_loans_considered = zip_df['2014_Total_loans_considered']
_weighted_accepts = zip_df['2014_pct_accepted'] * _loans_considered
mean_accept_rate = np.sum(_weighted_accepts) / np.sum(_loans_considered)
print(mean_accept_rate)

0.10874439151451491


In [13]:
# Keep only the first three characters of each loan's ZIP string; 'ZIP' is
# the key used to join against zip_df.
df_train['ZIP'] = df_train['ZIP'].str[:3]
df_test['ZIP'] = df_test['ZIP'].str[:3]

In [14]:
# Unweighted mean and standard deviation of the acceptance rate across the
# rows of zip_df; clipped_pct reads these two globals.
_pct_accepted = zip_df['2014_pct_accepted']
pct_acc_mean, pct_acc_std = _pct_accepted.mean(), _pct_accepted.std()

In [15]:
# Attach the zip-level acceptance statistics to every loan. merge() defaults
# to an inner join, so loans whose ZIP has no row in zip_df are dropped.
merged_df_train = df_train.merge(zip_df, on='ZIP')
merged_df_test = df_test.merge(zip_df, on='ZIP')

In [17]:
def clipped_pct(x, std_mult=1.5, mean=None, std=None):
    """Clamp ``x`` to within ``std_mult`` standard deviations of a mean.

    Parameters
    ----------
    x : float
        Value to clamp.
    std_mult : float, default 1.5
        Half-width of the allowed band, in standard deviations.
    mean, std : float, optional
        Centre and spread of the band. When omitted, the notebook-level
        ``pct_acc_mean`` / ``pct_acc_std`` are used, so the one-argument
        call ``series.map(clipped_pct)`` behaves as before.

    Returns
    -------
    float
        ``x`` when it lies inside ``[mean - std_mult*std, mean + std_mult*std]``,
        otherwise the nearer bound. NaN is returned unchanged because both
        comparisons are False for NaN.
    """
    if mean is None:
        mean = pct_acc_mean
    if std is None:
        std = pct_acc_std

    upper = mean + std_mult * std
    lower = mean - std_mult * std
    if x > upper:
        return upper
    if x < lower:
        return lower
    return x

In [18]:
# NOTE: this module-level std_mult is not read by clipped_pct, which carries
# its own 1.5; the assignment is kept so the name stays defined.
std_mult = 1.5

# For each split: clamp the zip acceptance rate, then express it as
# (mean clipped rate of that split) / (clipped rate), so zips with a lower
# acceptance rate get a factor above 1.
for _merged in (merged_df_train, merged_df_test):
    _clipped = _merged['2014_pct_accepted'].map(clipped_pct)
    _merged['clipped_pct'] = _clipped
    _merged['adjustment_factor'] = _clipped.mean() / _clipped

In [19]:
def adjust_ROI(df):
    """Add an ``adjusted_ROI`` column to ``df`` and return ``df``.

    For every row::

        adjusted_ROI = ROI + |ROI| * (adjustment_factor - mean_adj_factor + 1)

    where ``mean_adj_factor`` is the mean of ``df.adjustment_factor``.

    The previous implementation looped over rows but assigned the whole
    column on each iteration using that single row's scalar adjustment, so
    every row ended up with the *last* row's adjustment. The computation is
    now vectorised so each row uses its own values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ROI`` and ``adjustment_factor`` columns. Modified in
        place (the ``adjusted_ROI`` column is created or overwritten).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    mean_adj_factor = df.adjustment_factor.mean()
    adjustment = df.ROI.abs() * (df.adjustment_factor - mean_adj_factor + 1)
    df['adjusted_ROI'] = df.ROI + adjustment
    return df

In [21]:
merged_df_train['adjusted_ROI'] = 0
merged_df_test['adjusted_ROI'] = 0


def _roi_adjusted_for_zip(df, alpha, mean_adj_factor):
    """Return ``ROI + |ROI| * alpha * (adjustment_factor - mean_adj_factor)`` per row."""
    return df.ROI + df.ROI.abs() * alpha * (df.adjustment_factor - mean_adj_factor)


def _ranked_by_adjusted_roi(df):
    """Return ``df`` sorted by adjusted_ROI with ``adjusted_Rank`` on a 0-100 scale."""
    ordered = df.sort_values(by=['adjusted_ROI'])
    n_rows = ordered.shape[0]
    ordered['adjusted_Rank'] = (np.arange(0, n_rows) / n_rows) * 100
    return ordered


alphas = [1]
scores_pure_model = []
for alpha in alphas:
    # The training-set mean is used as the reference for BOTH splits.
    mean_adj_factor = merged_df_train.adjustment_factor.mean()

    # Vectorised instead of a row-wise apply(axis=1); this also avoids
    # re-defining a function named adjust_ROI on every iteration.
    merged_df_train['adjusted_ROI'] = _roi_adjusted_for_zip(
        merged_df_train, alpha, mean_adj_factor)
    merged_df_test['adjusted_ROI'] = _roi_adjusted_for_zip(
        merged_df_test, alpha, mean_adj_factor)

    train_final = _ranked_by_adjusted_roi(merged_df_train)
    test_final = _ranked_by_adjusted_roi(merged_df_test)

    # Per-zip summary. The nested-dict renamer form of agg()
    # ({'ZIP': {'Count': 'count'}}) is deprecated and rejected by pandas >= 1.0,
    # and naming columns positionally after to_records() is fragile, so each
    # output column is built and named explicitly.
    by_zip = train_final.groupby('ZIP')
    train_final_data = pd.DataFrame(
        {
            'Count': by_zip.size(),
            'adj_rank_mean': by_zip['adjusted_Rank'].mean(),
            'rank_mean': by_zip['Rank'].mean(),
        },
        columns=['Count', 'adj_rank_mean', 'rank_mean'],
    )
    ranked_df = train_final_data.rename_axis('ZIP').reset_index()

    # Compare the count-weighted mean adjusted rank of each zip against a
    # constant 50 weighted the same way.
    ranked_df['Predicted_mean'] = 50
    adj_score = r2_score(
        ranked_df.adj_rank_mean * ranked_df.Count,
        ranked_df.Predicted_mean * ranked_df.Count,
    )
    scores_pure_model.append(adj_score)

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


In [22]:
# Put the 0-1 'Rank' values on the same 0-100 scale as adjusted_Rank.
# NOTE: not idempotent -- re-running this cell multiplies by 100 again.
for _frame, _column in (
    (ranked_df, 'rank_mean'),
    (train_final, 'Rank'),
    (test_final, 'Rank'),
):
    _frame[_column] = _frame[_column] * 100

In [25]:
# train_final[train_final.ZIP == 7]

In [26]:
# test_final[test_final.ZIP == 7]

In [27]:
# Location and spread of adjusted ROI in each split; used for the buy cutoff.
_train_adjusted = train_final.adjusted_ROI
mean_ROI_train, std_ROI_train = _train_adjusted.mean(), _train_adjusted.std()

_test_adjusted = test_final.adjusted_ROI
mean_ROI_test, std_ROI_test = _test_adjusted.mean(), _test_adjusted.std()

In [28]:
# Buy only loans whose adjusted ROI is more than 1.5 standard deviations
# above the mean of their own split.
_train_cutoff = mean_ROI_train + 1.5*std_ROI_train
_test_cutoff = mean_ROI_test + 1.5*std_ROI_test

loans_to_buy_train = train_final.loc[train_final.adjusted_ROI > _train_cutoff]
loans_to_buy_test = test_final.loc[test_final.adjusted_ROI > _test_cutoff]

In [29]:
# (rows, columns) of the test-split selection; the row count is the number of loans picked.
loans_to_buy_test.shape

(1588, 19)

In [30]:
# Training split: sum(Real_ROI) / sum(annualized_amnt) for the selected loans
# versus the same ratio over every loan; the printed value is the difference.
return_ROI, portfolio_ROI = (
    np.sum(loans.Real_ROI) / np.sum(loans.annualized_amnt)
    for loans in (loans_to_buy_train, train_final)
)
print(return_ROI - portfolio_ROI)

0.00968128587913257


In [31]:
# Test split: sum(Real_ROI) / sum(annualized_amnt) for the selected loans
# versus the same ratio over every loan; the printed value is the difference.
return_ROI, portfolio_ROI = (
    np.sum(loans.Real_ROI) / np.sum(loans.annualized_amnt)
    for loans in (loans_to_buy_test, test_final)
)
print(return_ROI - portfolio_ROI)

0.046657071660717045


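So a portfolio where we **actively** favour traditionally disfavoured communities still outperforms the overall portfolio by 2.1%. (Note: the 2.1% figure does not match the outputs currently shown above, which print a difference of about 0.0097 on the training data and about 0.0467 on the test data; the figure should be re-checked.) This is an interesting result, since it suggests that we can target disenfranchised communities with investment and still deliver above-average results for our investors. The quoted result is for the training data and needs to be replicated on the test data.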

The subset of loans we buy has been written to CSV (see the next cell). Perhaps Anthony can run regressions to see how the loans we buy compare with the overall portfolio in terms of favouring various groups?

In [32]:
# Persist the selections and the full ranked tables. The 2014 files are
# written from the training split and the 2015 files from the test split;
# file names carry no extension but the content is CSV.
for _out_name, _frame in (
    ('loans_bought_2014', loans_to_buy_train),
    ('loans_bought_2015', loans_to_buy_test),
    ('all_loans_2014', train_final),
    ('all_loans_2015', test_final),
):
    _frame.to_csv(_out_name, index=False)