# ARIMA Time Series Analysis
##### Ben Wilson #####

1) Import libraries and collect data

In [8]:
import pandas as pd
import numpy as np

# Load the loan statistics CSV; header=1 takes the column names from the file's second line.
LOAN_STATS_URL = 'https://raw.githubusercontent.com/bwilson668/thinkful/master/unit-2/LoanStats3a.csv'
df = pd.read_csv(LOAN_STATS_URL, header=1, low_memory=False)

# Preview the first five rows.
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,10.65%,162.87,B,B2,...,,,,,0.0,0.0,,,,
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,15.27%,59.83,C,C4,...,,,,,0.0,0.0,,,,
2,1077175,1313524.0,2400.0,2400.0,2400.0,36 months,15.96%,84.33,C,C5,...,,,,,0.0,0.0,,,,
3,1076863,1277178.0,10000.0,10000.0,10000.0,36 months,13.49%,339.31,C,C1,...,,,,,0.0,0.0,,,,
4,1075358,1311748.0,3000.0,3000.0,3000.0,60 months,12.69%,67.79,B,B5,...,,,,,0.0,0.0,,,,


2) Convert the date fields from strings to datetimes so they can be used in the time series analysis

In [19]:
# Parse the string-valued "issue_d" column into datetimes and store the result
# on the frame as a new "issue_d_format" column.
issue_dates = pd.to_datetime(df['issue_d'])
df['issue_d_format'] = issue_dates

# Build a new dataframe indexed by the formatted datetime.
dfts = df.set_index('issue_d_format')

It's also possible to parse the datetime fields and set the index automatically when reading the CSV

In [23]:
# Parse "issue_d" as dates and use it as the index directly while reading the CSV.
df2 = pd.read_csv(
    'https://raw.githubusercontent.com/bwilson668/thinkful/master/unit-2/LoanStats3a.csv',
    header=1,
    low_memory=False,
    parse_dates=['issue_d'],
    index_col='issue_d',
)
# index_col moves "issue_d" out of the columns and into the index, so the first
# parsed date has to be read from the index (df2.issue_d would raise AttributeError).
df2.index[0]

Timestamp('2011-12-01 00:00:00')

Continue the transformations

In [22]:


# Group the loans by month: each index timestamp is mapped to a YYYYMM key
# (year * 100 + month), and count() tallies the non-null entries in every column.
# NOTE(review): the keys print as floats (e.g. 200706.0), presumably because some
# rows have a missing issue date — confirm.
year_month_summary = dfts.groupby(lambda x : x.year * 100 + x.month).count()

# The non-null "issue_d" count per key is used as the number of loans for that month.
loan_count_summary = year_month_summary['issue_d']
# Function-call form of print: identical output, but valid on Python 3 as well as Python 2.
print(loan_count_summary)

200706.0      24
200707.0      63
200708.0      74
200709.0      53
200710.0     105
200711.0     112
200712.0     172
200801.0     305
200802.0     306
200803.0     402
200804.0     259
200805.0     115
200806.0     124
200807.0     141
200808.0     100
200809.0      57
200810.0     122
200811.0     209
200812.0     253
200901.0     269
200902.0     302
200903.0     324
200904.0     333
200905.0     359
200906.0     406
200907.0     411
200908.0     446
200909.0     507
200910.0     604
200911.0     662
200912.0     658
201001.0     662
201002.0     682
201003.0     828
201004.0     912
201005.0     989
201006.0    1105
201007.0    1204
201008.0    1175
201009.0    1189
201010.0    1232
201011.0    1224
201012.0    1335
201101.0    1380
201102.0    1298
201103.0    1448
201104.0    1563
201105.0    1704
201106.0    1835
201107.0    1875
201108.0    1934
201109.0    2067
201110.0    2118
201111.0    2232
201112.0    2267
Name: issue_d, dtype: int64


3) Plot out the ACF and PACF

In [18]:
import statsmodels.api as sm
import matplotlib.pyplot as plt


# Stack the ACF (top) and PACF (bottom) of the monthly loan counts in one figure.
MAX_LAGS = 40

# Current statsmodels refuses to compute partial autocorrelations for lags at or
# beyond half the sample size, so the PACF lag count is capped from the length of
# the series instead of being hard-coded.
pacf_lags = max(1, min(MAX_LAGS, len(loan_count_summary) // 2 - 1))

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))
sm.graphics.tsa.plot_acf(loan_count_summary, lags=MAX_LAGS, ax=ax1)
# Assign the returned figure so the notebook does not render it a second time
# as the cell's output value.
fig = sm.graphics.tsa.plot_pacf(loan_count_summary, lags=pacf_lags, ax=ax2)