In [1]:
import csv

In [2]:
import pandas as pd

In [3]:
import numpy as np

In [4]:
from datetime import datetime
from dateutil.parser import parse

In [5]:
# Load the raw deal-audit export (comma-separated, which is read_csv's default).
# NOTE(review): parse_dates=True only asks pandas to try parsing the *index* as
# dates; no index_col is given here, and created_at is converted explicitly
# further down, so this flag looks like a no-op — confirm before relying on it.
df = pd.read_csv("./dave_deals.csv", parse_dates=True)

In [6]:
# Replace embedded newlines with a space in every string cell.
# The row count is printed before and after to show no rows are lost.
rows_before = len(df)
print(rows_before)
df = df.replace(to_replace=r'\n', value=' ', regex=True)
rows_after = len(df)
print(rows_after)

7775
7775


In [7]:
# Cut each change record at the first "underwriter_id" and then at the first
# "relation": everything from those keywords to the end of the text is dropped.
df['audited_changes'] = (
    df['audited_changes']
    .replace('underwriter_id.*', '', regex=True)
    .replace('relation.*', '', regex=True)
)


In [8]:
# Reduce each audited_changes record to "<old> - <new>" (or just "<new>").
# The replacements run in this order on purpose: the longer "--- status: - "
# prefix must be removed before the shorter "--- status: " one.

# 1. Ruby hash marker followed by the "status: " key.
df['audited_changes'] = df['audited_changes'].replace(r'--- !ruby/hash:ActiveSupport::HashWithIndifferentAccess status: ','',regex=True)
# 2. Plain "--- status: " prefixes, with and without a leading list dash.
df['audited_changes'] = df['audited_changes'].replace("--- status: - ","",regex=True)
df['audited_changes'] = df['audited_changes'].replace("--- status: ","",regex=True)
# 3. A list dash left at the very start of the text.
df['audited_changes'] = df['audited_changes'].replace("^(- )","",regex=True)
# 4. Empty-string placeholders ('') on either side of the " - " separator.
df['audited_changes'] = df['audited_changes'].replace("('' - )|( - '')","",regex=True)
df['audited_changes'] = df['audited_changes'].str.rstrip()

df['created_at'] = pd.to_datetime(df['created_at'])
# Several audit rows share the same created_at second, and the notebook later
# keeps only the last row per deal id (groupby(...).tail(1)). The default
# sort algorithm is not stable, so the order of tied rows would be arbitrary;
# a stable sort keeps tied rows in their original file order.
# NOTE(review): this assumes file order reflects audit order — confirm.
df = df.sort_values(by='created_at', kind='mergesort')

In [9]:
# Drop rows whose whole change record is just the bare Ruby hash marker,
# i.e. nothing is left of it after the cleanup above.
bare_hash_marker = "--- !ruby/hash:ActiveSupport::HashWithIndifferentAccess"
df = df[df['audited_changes'].ne(bare_hash_marker)]

In [10]:
# Sanity check: how many records contain a dash and how many do not.
has_dash = df.audited_changes.str.contains("-")
without_dash_count = len(df.loc[~has_dash])
with_dash_count = len(df.loc[has_dash])
print(without_dash_count)
print(with_dash_count)

# The two subsets must partition the frame.
assert without_dash_count + with_dash_count == len(df)
# The regex "-{1}" must select exactly the same rows as a plain "-".
single_dash_count = len(df.loc[df.audited_changes.str.contains("-{1}")])
assert single_dash_count == with_dash_count

2989
4173


In [11]:
# Raw status names bucketed into the coarse groups used for reporting.
# Any status not listed in one of these lists is treated as a "submission"
# by status_groups().
approved_statuses = [
    'Approved',
    'Contracts Back',
    'Contract Sent',
    'Soft Approval',
    'Ready to Fund',
    'Lost Deal',
    'Contract Returned',
    'Funding Call',
    'Open Approval',
]

declined_statuses = [
    'Negative Balances',
    'Declined',
    'Bad Credit',
    'Too Small',
    'No Room',
    'Previous Default',
    'Merchant Declined',
    'Declined Previously',
    'Too Few Deposits',
    'SIC Code',
    'Declined Bad Iso',
    'Suspected Fraud',
    'Auto-declined',
    'Fraud',
    'Missing Stips',
    'Merchant Declined',  # repeated from above; harmless for membership tests
    'No Logins',
    'No COJ',
    'Negative Banks',
    'New MCA',
    'Poor Landlord',
    'Contracts Back Declined',
]

funded_statuses = ['Funded']

In [12]:
# Split "<old> - <new>" on the first " - " into two columns.
grouped_status_df = df['audited_changes'].str.split(pat=" - ", n=1, expand=True)
grouped_status_df.columns = ['left', 'right']

# Records without a separator hold a single status: treat it as the new
# status (right) and leave the old status (left) empty.
null_idx = grouped_status_df['right'].isna()
grouped_status_df.loc[null_idx, 'right'] = grouped_status_df.loc[null_idx, 'left']
grouped_status_df.loc[null_idx, 'left'] = np.nan


In [13]:
# Attach the split statuses to the main frame and index it by creation time.
# drop=False keeps created_at available as a regular column as well.
df = df.assign(
    old_status=grouped_status_df['left'],
    new_status=grouped_status_df['right'],
).set_index('created_at', drop=False)

In [14]:
def status_groups(g):
    """Map a raw status name to its reporting group.

    The status is looked up in ``approved_statuses``, ``declined_statuses``
    and ``funded_statuses``, in that order; the first list containing it
    decides the result.

    Returns:
        "approved", "declined" or "funded" for a listed status, otherwise
        "submission".
    """
    buckets = (
        ("approved", approved_statuses),
        ("declined", declined_statuses),
        ("funded", funded_statuses),
    )
    for label, members in buckets:
        if g in members:
            return label
    return "submission"
    
# Classify every row by the group of its new status.
df['group_status'] = df['new_status'].map(status_groups)

In [15]:
# Display the frame's index (set from created_at earlier in the notebook)
# as a quick check of its dtype, range and length.
df.index

DatetimeIndex(['2019-07-01 14:41:31', '2019-07-01 14:41:31',
               '2019-07-01 14:45:52', '2019-07-01 14:45:52',
               '2019-07-01 14:45:52', '2019-07-01 14:56:31',
               '2019-07-01 14:56:31', '2019-07-01 14:58:33',
               '2019-07-01 14:58:33', '2019-07-01 15:12:47',
               ...
               '2019-10-23 15:03:18', '2019-10-23 15:04:19',
               '2019-10-23 15:07:00', '2019-10-23 15:07:00',
               '2019-10-23 15:10:09', '2019-10-23 15:16:38',
               '2019-10-23 15:16:38', '2019-10-23 15:35:34',
               '2019-10-23 15:46:35', '2019-10-23 15:47:58'],
              dtype='datetime64[ns]', name='created_at', length=7162, freq=None)

In [16]:
# Group the audit rows by deal id; row order inside each group is unchanged.
grouped_df = df.groupby(by=df['id'])

In [17]:
# Keep only the last row of each deal, i.e. its most recent audit record
# given that the frame was sorted by created_at earlier.
# .copy() makes the result an independent frame: later cells assign into it
# with df.loc[...] = ..., which on a bare tail() result can raise
# SettingWithCopyWarning.
df = grouped_df.tail(1).copy()

In [20]:
# Reconcile declined deals that still carry a funded amount.
is_declined = df.group_status == "declined"
has_amount = df.funded_amount != 0  # NOTE(review): NaN amounts also compare != 0

# Declined deals whose raw status is not "Funded" must not report a funded
# amount: set it to 0. The earlier `.apply(0)` never assigned anything, and
# 0 is not a callable to apply.
df.loc[is_declined & has_amount & (df.status != "Funded"), 'funded_amount'] = 0

# Declined deals that still have a non-zero amount after the step above are
# the ones whose raw status is "Funded": move them to the "funded" group.
df.loc[is_declined & (df.funded_amount != 0), 'group_status'] = "funded"

In [21]:
def _count_group_status(df, group):
    """Return how many rows of ``df`` have ``group_status`` equal to ``group``."""
    return int((df.group_status == group).sum())


def _share_of_total(count, df):
    """Return ``count`` as a fraction of ``len(df)``; 0.0 for an empty frame."""
    total = len(df)
    # Guard against ZeroDivisionError when there are no rows at all.
    return count / total if total else 0.0


def total_in_submissions(df):
    """Number of rows in the 'submission' group."""
    return _count_group_status(df, 'submission')
def total_declined(df):
    """Number of rows in the 'declined' group."""
    return _count_group_status(df, 'declined')
def total_approved(df):
    """Number of rows in the 'approved' group."""
    return _count_group_status(df, 'approved')
def total_funded(df):
    """Number of rows in the 'funded' group."""
    return _count_group_status(df, 'funded')

def submission_metric(df):
    """Fraction of rows in the 'submission' group (0.0 if ``df`` is empty)."""
    return _share_of_total(total_in_submissions(df), df)
def declined_metric(df):
    """Fraction of rows in the 'declined' group (0.0 if ``df`` is empty)."""
    return _share_of_total(total_declined(df), df)
def approved_metric(df):
    """Fraction of rows in the 'approved' group (0.0 if ``df`` is empty)."""
    return _share_of_total(total_approved(df), df)
def funded_metric(df):
    """Fraction of rows in the 'funded' group (0.0 if ``df`` is empty)."""
    return _share_of_total(total_funded(df), df)

In [22]:
# Build the Excel report: a summary sheet plus a sheet with the cleaned data.
import openpyxl as xl
from openpyxl.utils.dataframe import dataframe_to_rows

wb = xl.Workbook()

# --- Summary sheet (the workbook's default sheet) ---
ws = wb.active
# First and last row of the frame give the covered date range.
ws.append(['Data Start Date',df.created_at.iloc[0].strftime("%m/%d")])
ws.append(['Data End Date', df.created_at.iloc[-1].strftime('%m/%d')])
# One line per status group: absolute count and share of all deals.
ws.append(['Submissions', total_in_submissions(df),'% of Total',submission_metric(df)])
ws.append(['Declined', total_declined(df),'% of Total',declined_metric(df)])
ws.append(['Approved',total_approved(df),'% of Total',approved_metric(df)])
ws.append(['Funded',total_funded(df),'% of Total',funded_metric(df)])

# --- Detail sheet ---
wb.create_sheet("cleaned_data")
print(wb.active)
# Make the new sheet (position 1) the active one and write the frame into it.
wb.active = 1
ws = wb.active
print(ws)
for r in dataframe_to_rows(df):
    ws.append(r)

# Save once, after both sheets are filled. The previous extra save before the
# loop wrote an incomplete workbook that was then overwritten anyway.
wb.save('abc1234_report.xlsx')

<Worksheet "Sheet">
<Worksheet "cleaned_data">


In [None]:
# Per-ISO summary: number of deals in each status group, then the summed
# funded_amount of the funded and of the approved deals.
iso_names = df.Iso.unique()
print(len(iso_names))
for iso in sorted(iso_names):
    iso_rows = df.loc[df['Iso'] == iso]
    iso_group = iso_rows['group_status']
    # Count ROWS per group. The earlier sum(frame.count()) added up the
    # non-null cells of every column, which overstates the number of deals.
    print(iso,
          int((iso_group == 'submission').sum()),
          int((iso_group == 'declined').sum()),
          int((iso_group == 'approved').sum()),
          int((iso_group == 'funded').sum()),
          sum(iso_rows.loc[iso_group.isin(['funded']), 'funded_amount']),
          sum(iso_rows.loc[iso_group.isin(['approved']), 'funded_amount']))

In [None]:
# Month/day of the first row's creation timestamp.
df['created_at'].iloc[0].strftime("%m/%d")

In [None]:
# Month/day of the last row's creation timestamp.
df['created_at'].iloc[-1].strftime('%m/%d')

In [None]:
# Summed funded_amount per ISO (rows) and status group (columns).
# pivot_table is called as a method, so the frame must not be passed again:
# a positional `df` lands in the `values` parameter and clashes with the
# `values=` keyword (TypeError: multiple values for argument 'values').
df.pivot_table(values='funded_amount',index=['Iso'],columns='group_status',aggfunc='sum').reset_index()

In [23]:
def groupby_printer(df, freq='M'):
    """Print ``df`` one time bucket after another.

    Args:
        df: DataFrame with a datetime-like index; rows are bucketed on the
            index with ``pd.Grouper(freq=freq)``.
        freq: pandas frequency alias for the bucket size. Defaults to 'M'
            (monthly), the value that was previously hard-coded.

    Returns:
        None. Each non-empty bucket is printed, followed by blank lines.
    """
    # Group once and print the frame the iterator already yields, rather than
    # regrouping the whole frame and calling get_group() on every iteration.
    for key, item in df.groupby(pd.Grouper(freq=freq)):
        # A time bucket without rows can come back as an empty frame; there
        # is nothing to show for it, so skip it.
        if item.empty:
            continue
        print(item, "\n\n")

In [24]:
# Print the deals one time bucket at a time.
groupby_printer(df=df)

                        id                                               name  \
created_at                                                                      
2019-07-01 14:41:31  46014                        XL CONSTRUCTION SUPPLY, LLC   
2019-07-01 14:45:52  46016                                  JOSE LUIS PEREIRA   
2019-07-01 14:56:31  46018                                STONE LEIGH WAY LLC   
2019-07-01 14:58:33  46020                             TROPICAL PET OASIS LLC   
2019-07-01 15:12:47  46023                       TREEFROG DATA SOLUTIONS INC    
2019-07-01 16:04:57  46039                                          XORCO LLC   
2019-07-01 16:13:18  46040                        ART OF GIFTING INCORPORATED   
2019-07-01 16:14:35  46041                              CINEMILLS CORPORATION   
2019-07-01 16:16:16  46042                  PULMONARY AND SLEEP MEDICAL, P.C.   
2019-07-01 16:26:30  46047                          SHILOH TRAVEL & TOURS LLC   
2019-07-01 16:31:25  46050  