In [1]:
import pandas as pd
from sqlalchemy import create_engine

In [2]:
%matplotlib inline

# Querying Data

In [3]:
engine = create_engine("mysql://{}:{}@{}/{}?charset=utf8mb4".format(
    # username
    # password
    # host
    # db
), convert_unicode=True, encoding='utf-8')
con = engine.connect()

query = 'SELECT * FROM submissions'
submissions = pd.read_sql(sql=query, con=con)
query = 'SELECT * FROM user_attribution'
attribution = pd.read_sql(sql=query, con=con)

con.close()

  cursor.execute('SELECT @@tx_isolation')


# Assembling Data

In [4]:
def classify_user(block):
    ret = 'neither'
    for _, r in block.iterrows():
        if r['type'] == 'request' and r['source']:
            ret = 'borrower'
    for _, r in block.iterrows():
        if r['type'] == 'loan' and r['source']:
            if ret == 'borrower':
                ret = 'both'
            else:
                ret = 'lender'
            break
    return ret

def compute_successful_loan(block):
    if 'loan' in block['type'].values:
        return 1
    return 0

def get_first_request(block):
    block = block.sort_values('submission_datetime').reset_index(drop=True)
    return block[block['type'] == 'request']['resource_id'].iloc[0]

In [5]:
user_classes = attribution.groupby('user_id').apply(classify_user)
borrowers = attribution[attribution['user_id'].isin(user_classes[user_classes.isin(['borrower', 'both'])].index)]

scores = borrowers.groupby('user_id').apply(compute_successful_loan)
first_requests = borrowers.groupby('user_id').apply(get_first_request)

data = pd.DataFrame({'request_id': first_requests, 'recieved_loan': scores}).reset_index()
data = data.merge(submissions[['submission_id', 'title']], left_on='request_id', right_on='submission_id')

In [6]:
# Spot checks that computed first request for most borrowers is their first submission ever.

def get_first_submission(block):
    return block.sort_values('creation_datetime').reset_index(drop=True)['submission_id'].iloc[0]

first_submissions = submissions.groupby('author_id').apply(get_first_submission)

spot_check = pd.DataFrame({
    'first_submissions': first_submissions,
    'first_request': data.set_index('user_id')['request_id']}
)
spot_check = spot_check[~spot_check['first_request'].isnull()]
(spot_check['first_request'] == spot_check['first_submissions']).sum() / spot_check.shape[0]

0.8328416912487709

# Training Initial Model

In [9]:
from money_detection_model import run_inference

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [10]:
import plotly.offline as plotly
import plotly.graph_objs as go
import numpy as np
import statsmodels.api as sm
from statistics import median
plotly.init_notebook_mode(connected=True)


The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.



In [20]:
thresh = 0.4

def parse(results):
    values = []
    for r in results:
        try:
            v = float(r)
        except:
            continue
        if v >= 10 and v <= 2500:
            values.append(v)
    if len(values) > 0:
        return median(values)
    return None
    
    
def extract_strings(s):
    v = run_inference(s.lower())
    results = []
    current = None
    for i in range(len(v)):
        if v[i] > thresh and current is None:
            current = s[i]
        elif v[i] > thresh and current is not None:
            current += s[i]
        elif v[i] <= thresh and current is not None:
            results.append(current)
            current = None
        elif v[i] <= thresh and current is None:
            continue
            
    return parse(results)
        

data['request_amt'] = data['title'].apply(extract_strings)

## Logistic Regression

In [12]:
d = data[(~data['request_amt'].isnull()) & (data['request_amt'] < 10000)]
n = d['request_amt'].shape[0]
x = np.concatenate([
    np.ones((n, 1)),
    np.array(d['request_amt']).reshape((n, 1))
], axis=1)

m = sm.Logit(d['recieved_loan'], x)
r = m.fit()
print(r.summary())

Optimization terminated successfully.
         Current function value: 0.643836
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:          recieved_loan   No. Observations:                 9168
Model:                          Logit   Df Residuals:                     9166
Method:                           MLE   Df Model:                            1
Date:                Sun, 23 Jun 2019   Pseudo R-squ.:                 0.01806
Time:                        23:08:02   Log-Likelihood:                -5902.7
converged:                       True   LL-Null:                       -6011.2
                                        LLR p-value:                 3.910e-49
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.2611      0.030     -8.731      0.000      -0.320      -0.202
x1            -0.0009   6.54e

In [13]:
import math

x = -0.2628
math.exp(x) / (math.exp(x) + 1)

0.4346755311745806

In [14]:
x = -0.2628 + 25 * -0.0008
math.exp(x) / (math.exp(x) + 1)

0.4297674528528336

In [15]:
x = -0.2628 + 100 * -0.0008
math.exp(x) / (math.exp(x) + 1)

0.41512948432952357

In [16]:
x = -0.2628 + 400 * -0.0008
math.exp(x) / (math.exp(x) + 1)

0.35828856850539503

## Decision Tree

In [52]:
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor

In [61]:
clf = RandomForestRegressor(100, max_depth=4, min_samples_leaf=128)
clf = clf.fit(np.array(d['request_amt']).reshape(-1, 1), d['recieved_loan'])

In [46]:
for r in data[(data['request_amt'] > 620) & (data['request_amt'] < 700)].iterrows():
    print(r[1]['title'])
    print(r[1]['request_amt'])
    print(r[1]['recieved_loan'])
    print(r[1]['user_id'])
    print()

[REQ](#Winchester, VA, USA)$667 for debt consolidation
667.0
0
t2_10860r

[REQ] ($650) - (#Atlanta, GA, USA), (Half by 3/23, last half paid by or before 4/12), (Paypal or whatever lender is comfortable with)
650.0
1
t2_10m1c0

[REQ] ($600) - (#Las Vegas, NV, USA), ($650 by 11/17/2017) (PayPal/Zelle)
625.0
1
t2_10qecm

[REQ] ($600) (#New York, NY, USA) (700 by 1/21/19) (Cash App)
650.0
0
t2_11ck03

[REQ] ($600) - (#Seattle, WA, USA), ($735 by 10/31/2018 or before), (PayPal, zelle, cashapp)
667.5
1
t2_11kan58z

[REQ] ($650) - (#Staten Island, NY, USA), (April 3rd), (PayPal)
650.0
0
t2_11mo08

[REQ]($600)-(#Mason, MI, USA), ($750 on 11/2/18), (Paypal)
675.0
0
t2_11woba

[REQ] ($620) - (#Springfield, MO, USA), ($730 by July 31, 2017), (PayPal or Bank Transfer)
675.0
1
t2_1288oq

[REQ]($625)(#Phoenix, AZ, USA) ($725 back by December 10th)(PayPal or Venmo)
675.0
1
t2_12c6b7

[REQ] (800 ) - (Texas City, TX 77590), (4/15/2017), (Checking Account Money Transfer)
695.0
0
t2_12rj76

[REQ] ($600) 

In [62]:
plotly.iplot([
    go.Scatter(
        x=np.arange(25, 1000),
        y=clf.predict(np.arange(25, 1000).reshape(-1, 1))
    )
])