In [2]:
import pandas as pd
from sklearn.preprocessing import Imputer, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import confusion_matrix
import featuretools as ft
from featuretools import primitives as prims
from featuretools.selection.variance_selection import (
    select_high_variance_features,
    select_percent_null)
import os
from ml import (bin_labels,
                TimeSeriesSplitByDate,
                fit_and_score)
from utils import (build_baseline_features,
                   load_entityset,
                   get_feature_importances,
                   plot_confusion_matrix)
from IPython.display import display
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Folder named `olympic_games_data` under the current user's home directory
# (`~` is expanded by os.path.expanduser).
ROOT_DATA_FOLDER = os.path.expanduser("~/olympic_games_data")

# Step 1: Load in data

Check out LoadEntityset.ipynb for a walkthrough of how to set up a dataset for use by Featuretools.

`EntitySet` is the in-memory data structure Featuretools uses to build and calculate features. It essentially consists of a dictionary of [Pandas DataFrames](https://pandas.pydata.org/pandas-docs/stable/dsintro.html) with associated metadata on how they are linked, what semantic types they contain, and how they vary in time. For an in-depth guide, more details, and tutorials, please check out the [documentation](https://docs.featuretools.com/loading_data/using_entitysets.html).

In [4]:
# Build the EntitySet with the `load_entityset` helper imported from the local `utils` module.
es = load_entityset()
# Bare last expression: the notebook renders the EntitySet summary as the cell output.
es

Entityset: Olympic Games
  Entities:
    disciplines (shape = [67, 3])
    countries (shape = [220, 8])
    olympic_games (shape = [27, 3])
    medals_won (shape = [11532, 6])
    sports (shape = [43, 2])
    ...And 3 more
  Relationships:
    medaling_athletes.Athlete -> athletes.Athlete
    medals_won.Country Olympic ID -> countries_at_olympic_games.Country Olympic ID
    countries_at_olympic_games.Olympic Games ID -> olympic_games.Olympic Games ID
    medals_won.Discipline -> disciplines.Discipline
    disciplines.Sport -> sports.Sport
    ...and 2 more

In [5]:
# This data is available from Wikipedia:
# https://en.wikipedia.org/wiki/List_of_participating_nations_at_the_Summer_Olympic_Games#List_of_nations

# I converted the wikitable to a CSV using this online tool: http://wikitable2csv.ggor.de/

# `Last` aggregation of the `Country` variable from `countries_at_olympic_games`,
# with `countries` as the parent entity.
# `Last` and `Feature` are accessed through the `prims` / `ft` aliases imported in the
# first cell; the bare names are never imported there, so using them unqualified
# raises NameError on a fresh kernel.
# NOTE(review): the name suggests a "country is competing" indicator — confirm intent.
is_competing = prims.Last(ft.Feature(es['countries_at_olympic_games']['Country']), es['countries'])
# Show the feature just defined instead of constructing an identical second one.
is_competing

Entity: disciplines
  Variables:
    Discipline (dtype: index)
    Sport (dtype: id)
    Debut Year (dtype: datetime_time_index)
  Shape:
    (67, 3)
Entity: countries
  Variables:
    Subregion ID (dtype: numeric)
    Least Developed Countries (LDC) (dtype: numeric)
    Code (dtype: index)
    Small Island Developing States (SIDS) (dtype: numeric)
    Country (dtype: categorical)
    ...And 3 more
  Shape:
    (220, 8)
Entity: olympic_games
  Variables:
    City (dtype: categorical)
    Olympic Games ID (dtype: index)
    Year (dtype: datetime)
  Shape:
    (27, 3)
Entity: medals_won
  Variables:
    Discipline (dtype: id)
    Country Olympic ID (dtype: id)
    medal_id (dtype: index)
    Year (dtype: datetime_time_index)
    Medal (dtype: categorical)
    ...And 1 more
  Shape:
    (11532, 6)
Entity: sports
  Variables:
    Sport (dtype: index)
    Debut Year (dtype: datetime_time_index)
  Shape:
    (43, 2)
Entity: countries_at_olympic_games
  Variables:
    Country Olympic ID (dtyp