In [1]:
import pandas as pd
import numpy as np
import sys

sys.path.insert(1, '../../../scripts/')
from s3_support import *
%matplotlib inline

# Load data

In [2]:
# load data
q = '''select
            date_trunc('month', date) as month,
            count(id) as count,
            sum(amount) as volume,
            count(distinct org) as orgs
        from transactions
            where status='A'
            group by date_trunc('month', date)'''
df_all = redshift_query_read(q)

In [3]:
# prep & transform data
df_all['month'] = pd.to_datetime(df_all['month'])
df_all.sort_values('month', ascending=True, inplace=True)
df_all = df_all[df_all['volume']>0.]

df_all['cat_month'] = df_all['month'].dt.month
df_all['count_growth'] = df_all['count'].diff() / df_all['count'].shift()
df_all['volume_growth'] = df_all['volume'].diff() / df_all['volume'].shift()
df_all['orgs_growth'] = df_all['orgs'].diff() / df_all['orgs'].shift()

# Compute means and standard deviations

In [4]:
grpd = df_all.groupby('cat_month')[['count_growth', 'volume_growth', 'orgs_growth']].agg(['std', 'mean'])

In [5]:
grpd

Unnamed: 0_level_0,count_growth,count_growth,volume_growth,volume_growth,orgs_growth,orgs_growth
Unnamed: 0_level_1,std,mean,std,mean,std,mean
cat_month,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,0.107311,-0.250457,0.040923,-0.604653,0.018032,-0.073351
2,0.141149,0.054919,0.139987,0.091278,0.068913,0.039595
3,0.13534,0.166255,0.0819,0.24066,0.021431,0.052825
4,0.088004,0.090812,0.131168,0.057108,0.034943,0.016562
5,0.253644,-0.103402,0.273986,-0.051807,0.107306,-0.017892
6,0.094161,-0.025768,0.119892,-0.021009,0.043644,0.006861
7,0.224533,-0.053495,0.085228,-0.093958,0.029421,0.001263
8,0.211762,0.217915,0.156885,0.227089,0.023337,0.034006
9,0.164393,0.132509,0.122412,0.115492,0.040996,0.050487
10,0.196579,0.059572,0.179282,0.028248,0.033853,0.039987


In [6]:
means = grpd['volume_growth']['mean']
error = grpd['volume_growth']['std']

In [7]:
df_all.iloc[-2:]

Unnamed: 0,month,count,volume,orgs,cat_month,count_growth,volume_growth,orgs_growth
157,2020-04-01,197843,26769412.27,2118,4,0.340954,0.431735,-0.025759
74,2020-05-01,22958,2655697.68,1376,5,-0.883958,-0.900794,-0.350331


# Computing basic projection

In [14]:
def get_projection(r):
    return r['last_volume'] + (r['last_volume'] * means[r['cat_month']])

print("Last 3 months:")
months = df_all.iloc[-5:].copy()
months['last_volume'] = months['volume'].shift(1)
months['projection'] = months.apply(get_projection, axis=1)
months[['cat_month', 'volume', 'projection']]

Last 3 months:


Unnamed: 0,cat_month,volume,projection
156,1,13583707.32,
72,2,15967177.95,14823590.0
73,3,18697187.43,19809830.0
157,4,26769412.27,19764960.0
74,5,2655697.68,25382580.0


In [8]:
current_month = df_all['cat_month'].iloc[-1]
last_volume = df_all['volume'].iloc[-2:-1].iloc[0]
current_volume = df_all['volume'].iloc[-1]
projected_volume = last_volume + (last_volume * means[current_month])

projected_high = projected_volume + (projected_volume * error[current_month])
projected_low = projected_volume - (projected_volume * error[current_month])

print("Current month: {}".format(current_month))
print("Last volume: ${:,}".format(last_volume))
print("Currently at ${:,}".format(current_volume))

print()
print("Expected growth: {:.2f}%".format(means[current_month] * 100.))
print("Projected volume: ${:,.2f}".format(projected_volume))
print("Range: between ${:,.2f} and ${:,.2f}".format(projected_low, projected_high))

Current month: 5
Last volume: $26,769,412.27
Currently at $2,655,697.68

Expected growth: -5.18%
Projected volume: $25,382,578.97
Range: between $18,428,119.72 and $32,337,038.22


# Testing last 12 months

In [10]:
test_data = df_all.tail(12).copy()
train_data = df_all.iloc[:-12]

In [11]:
len(train_data), len(test_data)

(128, 12)

In [12]:
parameters = train_data.groupby('cat_month')['volume_growth'].mean()
test_data['projected'] = test_data['cat_month'].apply(lambda x: parameters[x])
test_data['projection_error'] = test_data['volume_growth'] - test_data['projected']

In [13]:
print("MAE: {:.2f}".format(test_data['projection_error'].abs().mean()))
test_data[['cat_month', 'projection_error']]

MAE: 0.11


Unnamed: 0,cat_month,projection_error
149,5,0.021762
150,6,-0.180478
68,7,-0.066556
151,8,0.044249
69,9,-0.004086
152,10,0.02478
70,11,-0.118718
71,12,0.170564
153,1,-0.014157
72,2,0.091841


A simple mean projection gives us an average error of 11%. This is really good considering the simplicity of the model.

# Random forest modeling

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

  from numpy.core.umath_tests import inner1d


In [15]:
df_all['target'] = df_all['volume_growth'].shift(-1)
X = df_all.replace([np.inf, -np.inf], np.nan).dropna()[['cat_month', 'count_growth', 'volume_growth', 'orgs_growth']]
y = df_all.replace([np.inf, -np.inf], np.nan).dropna()['target']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

rf = RandomForestRegressor()
rf.fit(X_train, y_train)

rf_score = rf.score(X_test, y_test)

print("{} training observations; {} test observations".format(len(X_train), len(X_test)))
print("{:.4f} test error".format(rf_score))

103 training observations; 35 test observations
0.9447 test error


Perhaps the model could do better if it had the mean growth from all prior months as a feature.

In [22]:
row_data = []

for _, r in df_all.iterrows():
    r['prior_years_month_mean'] = df_all[(df_all['cat_month']==r['cat_month'])&(df_all['month']<r['month'])]['volume_growth'].mean()
    row_data.append(r)

In [24]:
df_rolling_mean = pd.DataFrame(row_data)
len(df_rolling_mean), len(df_rolling_mean.dropna())

(140, 126)

In [99]:
test_errors = []
for i in range(0,100):
    df_rolling_mean['target'] = df_rolling_mean['volume_growth'].shift(-1)
    X = df_rolling_mean.replace([np.inf, -np.inf], np.nan).dropna()[['volume_growth', 'orgs_growth', 'prior_years_month_mean']]
    y = df_rolling_mean.replace([np.inf, -np.inf], np.nan).dropna()['target']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)

    test_errors.append(abs(rf.score(X_test, y_test)))

print("{} training observations; {} test observations".format(len(X_train), len(X_test)))
print("{:.4f} test error".format(np.mean(test_errors)))

100 training observations; 26 test observations
0.3326 test error


In [93]:
for i in zip(['volume_growth', 'orgs_growth', 'prior_years_month_mean'], rf.feature_importances_):
    print("{}: {:.4f}".format(i[0], i[1]))

volume_growth: 0.4175
orgs_growth: 0.1622
prior_years_month_mean: 0.4202


In [101]:
df_rolling_mean['target'] = df_rolling_mean['volume_growth'].shift(-1)
df_rolling_mean['forward_prior_years_month_mean'] = df_rolling_mean['prior_years_month_mean'].shift(-1)

test_errors = []
for i in range(0,100):
    X = df_rolling_mean.replace([np.inf, -np.inf], np.nan).dropna()[['volume_growth', 'orgs_growth', 'forward_prior_years_month_mean']]
    y = df_rolling_mean.replace([np.inf, -np.inf], np.nan).dropna()['target']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)

    test_errors.append(abs(rf.score(X_test, y_test)))

print("{} training observations; {} test observations".format(len(X_train), len(X_test)))
print("{:.4f} test error".format(np.mean(test_errors)))

100 training observations; 26 test observations
0.8585 test error


Looking at a variety of feature sets, it is pretty clear that a random forest model cannot reliably beat a mean prediction of prior years' monthly growth. Let's try a more simple, linear model in order to be thorough.

# OLS

In [102]:
from sklearn.linear_model import LinearRegression

In [103]:
df_rolling_mean['target'] = df_rolling_mean['volume_growth'].shift(-1)
df_rolling_mean['forward_prior_years_month_mean'] = df_rolling_mean['prior_years_month_mean'].shift(-1)

test_errors = []
for i in range(0,100):
    X = df_rolling_mean.replace([np.inf, -np.inf], np.nan).dropna()[['volume_growth', 'orgs_growth', 'prior_years_month_mean']]
    y = df_rolling_mean.replace([np.inf, -np.inf], np.nan).dropna()['target']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    lr = LinearRegression()
    lr.fit(X_train, y_train)

    test_errors.append(abs(rf.score(X_test, y_test)))

print("{} training observations; {} test observations".format(len(X_train), len(X_test)))
print("{:.4f} test error".format(np.mean(test_errors)))

100 training observations; 26 test observations
1.6641 test error


In [104]:
df_rolling_mean['target'] = df_rolling_mean['volume_growth'].shift(-1)
df_rolling_mean['forward_prior_years_month_mean'] = df_rolling_mean['prior_years_month_mean'].shift(-1)

test_errors = []
for i in range(0,100):
    X = df_rolling_mean.replace([np.inf, -np.inf], np.nan).dropna()[['volume_growth', 'orgs_growth', 'forward_prior_years_month_mean']]
    y = df_rolling_mean.replace([np.inf, -np.inf], np.nan).dropna()['target']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    lr = LinearRegression()
    lr.fit(X_train, y_train)

    test_errors.append(abs(rf.score(X_test, y_test)))

print("{} training observations; {} test observations".format(len(X_train), len(X_test)))
print("{:.4f} test error".format(np.mean(test_errors)))

100 training observations; 26 test observations
0.9508 test error


Clearly the linear model performs worse than the random forest. I suspect there is simply insufficient training data to support models more complicated than prior year's monthly mean, even when that value is provided to the model as feature.