# In [1]:
import category_encoders as ce
from joblib import dump, load
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor

# Load the raw loan export. The first line of the file is skipped and the last
# two lines are treated as a footer; `skipfooter` is not supported by pandas'
# C parser, which is why engine='python' is requested.
history = pd.read_csv(
    'LoanStats_securev1_2019Q1.csv.zip',
    engine='python',
    skiprows=1,
    skipfooter=2,
)

# Keep only rows graded A-D whose term equals ' 36 months'. The leading space
# is part of the literal being matched, so it must not be trimmed here.
condition = (
    history['grade'].isin(['A', 'B', 'C', 'D'])
    & (history['term'] == ' 36 months')
)

# Take an explicit copy: the statements that follow add columns to `history`,
# and assigning into a boolean-mask slice triggers SettingWithCopyWarning on
# pandas versions without copy-on-write.
history = history[condition].copy()

# Target column: drop the '%' characters from `int_rate` and parse as float.
history['Interest Rate'] = history['int_rate'].str.strip('%').astype(float)

# Monthly debt = (annual income / 12) * dti / 100.
# NOTE(review): the division by 100 implies `dti` is stored as a percentage
# -- confirm against the data dictionary.
history['Monthly Debts'] = history['annual_inc'] / 12 * history['dti'] / 100

# Give the raw columns the display names used from here on.
history = history.rename(columns={
    'annual_inc': 'Annual Income',
    'fico_range_high': 'Credit Score',
    'funded_amnt': 'Loan Amount',
    'title': 'Loan Purpose',
})

# The five features plus the target, in the order they are kept.
columns = [
    'Annual Income',
    'Credit Score',
    'Loan Amount',
    'Loan Purpose',
    'Monthly Debts',
    'Interest Rate',
]

# Restrict to those columns and discard any row with a missing value.
history = history[columns].dropna()

# Split the frame into the target and the remaining feature columns.
y = history['Interest Rate']
X = history.drop(columns=['Interest Rate'])

# The model is trained on log1p(y), so its predictions are on that scale;
# np.expm1 is the inverse transform.
y_log = np.log1p(y)

# One-hot encode the features (encoded columns named after category values),
# then fit a 200-tree gradient-boosted regressor using all available cores.
pipeline = make_pipeline(
    ce.OneHotEncoder(use_cat_names=True),
    XGBRegressor(n_estimators=200, n_jobs=-1),
)
pipeline.fit(X, y_log)

# Persist the fitted pipeline (encoder + model) to disk.
dump(pipeline, 'pipeline.joblib')

# Tidy the frame for export: cast annual income to integers and round monthly
# debts to two decimals. The non-mutating astype/round forms return a new
# frame instead of assigning into `history`, which was produced by a
# column-selection + dropna and can raise SettingWithCopyWarning on in-place
# column assignment in pandas versions without copy-on-write.
history = (
    history
    .astype({'Annual Income': int})
    .round({'Monthly Debts': 2})
)

# Write the cleaned data without the index column.
history.to_csv('lending-club.csv', index=False)