In [1]:
import os
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sys
# Put the parent directory at the front of the module search path.
# NOTE(review): presumably required so project modules referenced by the
# pickled estimator can be imported when it is loaded — confirm.
sys.path.insert(0, '..')

In [2]:
# Location of the serialized estimator, relative to this notebook.
model_path = os.path.join('..', 'model', 'estimator.pkl')
# joblib.load unpickles the file, which can execute arbitrary code:
# only load artifacts from a trusted source.
model = joblib.load(model_path)

In [6]:
# Read the test split from ../data/training/test.csv into a DataFrame.
test_csv_path = os.path.join('..', 'data', 'training', 'test.csv')
test_df = pd.read_csv(test_csv_path)

In [7]:
# Preview the first five rows of the test split.
test_df.head()

Unnamed: 0,year,home_team,month,day,attend,day_of_week,opponent,temp,skies,day_night,cap,shirt,fireworks,bobblehead
0,2012,Cincinnati Reds,JUN,25,34485,Monday,Milwaukee Brewers,84,Clear,Night,NO,NO,NO,NO
1,2012,Toronto Blue Jays,MAY,16,28915,Wednesday,New York Yankees,68,Dome,Night,NO,NO,NO,NO
2,2012,Arizona Diamondbacks,AUG,29,18451,Wednesday,Cincinnati Reds,76,Dome,Day,NO,NO,NO,NO
3,2012,New York Yankees,AUG,5,45878,Sunday,Seattle Mariners,89,Cloudy,Day,NO,NO,NO,NO
4,2012,Toronto Blue Jays,JUN,28,24668,Thursday,Los Angeles Angels,73,Clear,Night,NO,NO,NO,NO


In [62]:
# Run the loaded model on the test split and store the rounded predictions
# next to the actual attendance.
# NOTE(review): the whole frame (including `attend`) is handed to predict();
# presumably the estimator selects its own feature columns — confirm.
test_predictions = model.predict(test_df)
test_df['predicted_attend'] = np.round(test_predictions)

In [10]:
# Read the training split from ../data/training/train.csv into a DataFrame.
train_csv_path = os.path.join('..', 'data', 'training', 'train.csv')
train_df = pd.read_csv(train_csv_path)

In [63]:
# Run the loaded model on the training split and store the rounded
# predictions next to the actual attendance.
# NOTE(review): the whole frame (including `attend`) is handed to predict();
# presumably the estimator selects its own feature columns — confirm.
train_predictions = model.predict(train_df)
train_df['predicted_attend'] = np.round(train_predictions)

In [64]:
# Stack both splits into one frame with a `data` column that records which
# split ('train' or 'test') each row came from.
full_df = (
    pd.concat([train_df, test_df], keys=['train', 'test'])
    # Keep only the split label from the two-level index built by `keys`.
    .droplevel(1)
    # Move the split label into a regular column and renumber the rows.
    .reset_index()
    .rename(columns={'index': 'data'})
)

In [65]:
# Display the combined frame (pandas truncates the middle rows).
full_df

Unnamed: 0,data,year,home_team,month,day,attend,day_of_week,opponent,temp,skies,day_night,cap,shirt,fireworks,bobblehead,predicted_attend
0,train,2012,New York Mets,SEP,8,25603,Saturday,Atlanta Braves,83,Cloudy,Day,NO,NO,NO,NO,25319.0
1,train,2012,Los Angeles Dodgers,MAY,18,40906,Friday,St. Louis Cardinals,64,Clear,Night,NO,NO,YES,NO,37437.0
2,train,2012,Pittsburgh Pirates,MAY,13,27517,Sunday,Houston Astros,62,Rainy,Day,NO,YES,NO,NO,28241.0
3,train,2012,Chicago White Sox,SEP,5,17336,Wednesday,Minnesota Twins,85,Cloudy,Day,NO,NO,NO,NO,19535.0
4,train,2012,Atlanta Braves,JUN,12,41452,Tuesday,New York Yankees,84,Cloudy,Night,NO,NO,NO,NO,38581.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2416,test,2012,Atlanta Braves,MAY,25,37663,Friday,Washington Nationals,88,Cloudy,Night,NO,NO,YES,NO,36919.0
2417,test,2012,Washington Nationals,SEP,4,17648,Tuesday,Chicago Cubs,82,Cloudy,Night,NO,NO,NO,NO,21351.0
2418,test,2012,Pittsburgh Pirates,JUN,10,25752,Sunday,Kansas City Royals,85,Clear,Day,NO,NO,NO,NO,34625.0
2419,test,2012,St. Louis Cardinals,MAY,1,36345,Tuesday,Pittsburgh Pirates,83,Cloudy,Night,NO,NO,NO,NO,39639.0


In [77]:
import altair as alt

# Predicted-vs-actual scatter: one point per row of `full_df`, colored by
# the `data` column (which split the row belongs to).
scatter = alt.Chart(full_df).mark_circle(size=60).encode(
    x=alt.X('attend', axis=alt.Axis(format="~s", title='Actual Attendance')),
    # The y encoding must be built with the Y channel class (was alt.X).
    y=alt.Y('predicted_attend', axis=alt.Axis(format="~s", title='Predicted Attendance')),
    color=alt.Color(
        'data',
        # Pin the domain so the category-to-color mapping is explicit rather
        # than relying on the default sorted order: test -> red, train -> grey.
        scale=alt.Scale(domain=['test', 'train'], range=['#d62728', '#7f7f7f'])
    ),
    tooltip=['home_team', 'opponent', 'month', 'day',
             'attend', 'predicted_attend',
             'day_of_week', 'temp', 'skies', 'day_night',
             'cap', 'shirt', 'fireworks', 'bobblehead']
).properties(
    title='Predictive Performance'
)

# Points along y = x from 0 to 60,000, used as a "perfect prediction"
# reference line.
overlay_df = pd.DataFrame({
    'x': np.round(np.linspace(0, 60_000, 100))
})

overlay_df['y'] = overlay_df.x

line = alt.Chart(overlay_df).mark_line(color='black', strokeWidth=0.25).encode(
    x='x', y='y'
)

# Layer the reference diagonal over the scatter; as the last expression it
# is rendered as the cell output.
scatter + line