In [254]:
import pandas as pd
import LendingClubFunctions as LC
from sklearn.decomposition import PCA
import re
import sklearn as sk
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import RidgeCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import grid_search
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import nltk
from sklearn import tree
import numpy as np
from sklearn import preprocessing
import numpy as np
import math
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
import itertools
import random
import dill
%pylab inline
pylab.rcParams['figure.figsize']=(8.0,6.0)
from sklearn.ensemble import AdaBoostRegressor

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


In [2]:
#Define data location
file1='./data/LoanStats3a.csv'
file2='./data/LoanStats3b.csv'
file3='./data/LoanStats3c.csv'
file4='./data/LoanStats3d.csv'
files=[file1,file2,file3,file4]

In [3]:
cols=['loan_amnt','funded_amnt','total_pymnt', 'loan_status', 'int_rate', 'term',
      'desc', 'grade', 'annual_inc', 'purpose', 'emp_length', 'earliest_cr_line', 
      'revol_util', 'home_ownership', 'sub_grade', 'addr_state', 'dti', 'revol_bal', 'installment', 'last_pymnt_d', 'issue_d']

In [4]:
#Load data in from files
df1=LC.GetData(cols, files)
Trouble=[ 'Charged Off', 'Default', 'Does not meet the credit policy.  Status:Charged Off']
Success=['Fully Paid', 'Does not meet the credit policy.  Status:Fully Paid']
df1=LC.CleanData(df1, Success=Success, Trouble=Trouble)

  if self.run_code(code, result):


In [27]:
class LendingClubTransformer(sk.base.BaseEstimator, sk.base.TransformerMixin):
    """
    A transformer that takes as input a dataframe and a cut-off for variance explained. Then 
    performs PCA on the relevant columns and returns a matrix explaining the requested amount
    of variance
    """
    def __init__(self, cols=[], varExp=100):
        self.cols=cols
        self.varExp=varExp
        self.means=[]
        self.stds=[]
        self.pca=PCA()
        self.ElVarExp=None
        pass

    def fit(self, X, y=[]):
        return self.fit_transform(X)
    
    def fit_transform(self, X, y=[]):
        
        self.means=X.describe().ix['mean']
        self.stds=X.describe().ix['std']
        Xs=X[self.cols].dropna().copy()
        for col in Xs.columns:
            Xs[col]= (Xs[col]-self.means[col])/self.stds[col]

        self.pca.fit(Xs)
        el=np.argmax(self.pca.explained_variance_ratio_.cumsum()>self.varExp)
        self.ElVarExp=el
        Xs=self.pca.transform(Xs)[:, :(self.ElVarExp+1)]
        return Xs

    def transform(self, X):
        Xs=X[self.cols].dropna().copy()
        for col in Xs.columns:
            Xs[col]= (Xs[col]-self.means[col])/self.stds[col]
        Xs=self.pca.transform(Xs)[:, :(self.ElVarExp+1)]
        return Xs

In [28]:
class yLendingClubTransformer(sk.base.BaseEstimator, sk.base.TransformerMixin):
    """
    A transformer that takes as input a dataframe and the desired columns as features and returns the requests label
    """
    def __init__(self, cols=[], label=None):
        self.cols=[col for col in cols]
        self.cols.append(label)
        self.label=label
        pass
    

    def transform(self, X):
        return (X[self.cols].dropna())[self.label].as_matrix()

In [229]:
class CompositePredictor(sk.base.BaseEstimator, sk.base.TransformerMixin):
    """
    A composite predictor that takes in cats, the number of categorical variables that different models
    will be fit on, features, the features to fit to, and Model, the model to be used. The fit model fits a model 
    to every unique occurance for each category in cats.
    """
    def __init__(self, cats=[], features=[],Model=linear_model.Ridge()):
        self.cats=[cat for cat in cats]
        self.feats=[feat for feat in features]
        self.Models={}
        self.Model=Model
        self.Transform=LendingClubTransformer(cols=self.feats, varExp=.95)
        self.Transforms={}
        pass

    def fit(self, X, y=[]):
        modelcats=[]
        for cat in self.cats:
            modelcats.append([item for item in X[cat].unique()])
        for item in itertools.product(*modelcats):
            self.Models[item]=self.Model
            
        for subcat in self.Models.keys():
            temp=X.copy()
            for c,v in zip(subcat, self.cats):
                temp=temp[temp[v]==c]
            if temp[self.feats].dropna().shape[0]>5:           
                self.Transforms[subcat]=self.Transform
                ytrain=yLendingClubTransformer(cols=self.feats, label='Fraction_Of_Total').transform(temp) 
                Xtrain=temp[self.feats].dropna()#(self.Transforms[subcat]).fit_transform(temp[self.feats].dropna())
                self.Models[subcat].fit(Xtrain, ytrain)
        
        return
    
    def predictsub(self,X):
        subcat=[]
        for cat in self.cats:
            subcat.append(X.ix[cat])
        
        subcat=tuple(subcat)
        #Xtemp=(self.Transforms[subcat]).transform(X)
        return self.Models[subcat].predict(X[self.feats])
        
    
    def predict(self, X):
        
        return (X.T).apply(self.predictsub).as_matrix()[0,:]
        

In [231]:
features=['annual_inc','int_rate']
model=grid_search.GridSearchCV(linear_model.Ridge() , {'alpha':( 0.00005,0.0001,0.001,0.11, .5,5, 10, 20, 40, 60, 90, 100, 150)}, cv=4, verbose=False)
cats=['purpose']

In [242]:
CompModel=CompositePredictor(cats=cats, features=features, Model=model)

In [243]:
dfTrain, dfTest = train_test_split(df1[df1.loan_status!=2].dropna(), test_size=0.4)

In [245]:
CompModel.fit(dfTrain)

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.2s


Fitting 4 folds for each of 13 candidates, totalling 52 fits
Fitting 4 folds for each of 13 candidates, totalling 52 fits

[Parallel(n_jobs=1)]: Done  52 out of  52 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.5s



Fitting 4 folds for each of 13 candidates, totalling 52 fits

[Parallel(n_jobs=1)]: Done  52 out of  52 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.1s



Fitting 4 folds for each of 13 candidates, totalling 52 fits

[Parallel(n_jobs=1)]: Done  52 out of  52 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.1s



Fitting 4 folds for each of 13 candidates, totalling 52 fits

[Parallel(n_jobs=1)]: Done  52 out of  52 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.1s



Fitting 4 folds for each of 13 candidates, totalling 52 fits

[Parallel(n_jobs=1)]: Done  52 out of  52 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.1s



Fitting 4 folds for each of 13 candidates, totalling 52 fits

[Parallel(n_jobs=1)]: Done  52 out of  52 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.1s



Fitting 4 folds for each of 13 candidates, totalling 52 fits

[Parallel(n_jobs=1)]: Done  52 out of  52 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.1s



Fitting 4 folds for each of 13 candidates, totalling 52 fits

[Parallel(n_jobs=1)]: Done  52 out of  52 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.1s



Fitting 4 folds for each of 13 candidates, totalling 52 fits

[Parallel(n_jobs=1)]: Done  52 out of  52 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.1s



Fitting 4 folds for each of 13 candidates, totalling 52 fits

[Parallel(n_jobs=1)]: Done  52 out of  52 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.1s



Fitting 4 folds for each of 13 candidates, totalling 52 fits

[Parallel(n_jobs=1)]: Done  52 out of  52 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.1s



Fitting 4 folds for each of 13 candidates, totalling 52 fits

[Parallel(n_jobs=1)]: Done  52 out of  52 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.1s



Fitting 4 folds for each of 13 candidates, totalling 52 fits

[Parallel(n_jobs=1)]: Done  52 out of  52 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.1s





[Parallel(n_jobs=1)]: Done  52 out of  52 | elapsed:    0.1s finished


In [247]:
predictions=CompModel.predict(dfTest)
yTest=yLendingClubTransformer(cols=features, label='Fraction_Of_Total').transform(dfTest)

In [250]:
predictions

array([ 0.92944821,  0.77891551,  0.84107634, ...,  0.95229975,
        0.89179086,  0.80910077])

In [251]:
yTest

array([ 0.96982456,  0.72403276,  0.97185285, ...,  0.95234493,
        0.95848042,  0.94267356])

In [252]:
math.sqrt(mean_squared_error(yTest, predictions))

0.21966061910694146

In [255]:
f=open('CompositeModel.txt', 'w')
print >> f, dill.dumps(CompModel)
f.close()