In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [3]:
import statsmodels.api as smi
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

In [3]:
def create_single_game_dimension(original_df):
    """Collapse the per-(game, country) viewer rows into one row per game.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Long-format frame. The columns read here are "Game_ID", "Season",
        "Home_Team", "Away_Team", "Game_Date", "Country" and
        "Rounded Viewers".

    Returns
    -------
    pandas.DataFrame
        One row per distinct "Game_ID" (index = game id, in order of first
        appearance). Columns are "Season", "Home_Team", "Away_Team",
        "Game_Date", one "C-<country>" column per distinct country in
        ``original_df`` and "Aggregate_Total_Viewers_By_Country", the row-wise
        sum of the "C-" columns. A country with no row for a given game is
        left as NaN (and is skipped by the sum).
    """
    meta_cols = ["Season", "Home_Team", "Away_Team", "Game_Date"]
    country_cols = ["C-%s" % country for country in original_df["Country"].unique().tolist()]
    df_index = meta_cols + country_cols

    game_ids = []
    records = []
    # Group once instead of re-filtering the whole frame for every game id.
    for game, game_specific_rows in original_df.groupby("Game_ID", sort=False):
        # Since most of the data for a game is repeated, the descriptive
        # fields are taken from the first row of the game.
        first_row = game_specific_rows.iloc[0]
        record = {col: first_row[col] for col in meta_cols}

        # Only this game's rows may contribute viewer counts; walking the
        # full frame here would give every game identical country values.
        # If a country is repeated within a game, the last row wins.
        for country, viewers in zip(game_specific_rows["Country"],
                                    game_specific_rows["Rounded Viewers"]):
            record["C-%s" % country] = viewers

        game_ids.append(game)
        records.append(record)

    # Build the frame in one go (rows = games, columns = information/data),
    # which is the orientation needed to apply linear regression.
    new_df = pd.DataFrame(records, index=game_ids, columns=df_index)

    # Add the total viewers by summing over the country columns
    new_df["Aggregate_Total_Viewers_By_Country"] = new_df.loc[:, country_cols].sum(axis=1)
    return new_df

In [4]:
def create_team_vector(single_game_df):
    """Function for creating a two-hot vector with bits representing the presence of specific teams in the game.

    Parameters
    ----------
    single_game_df : pandas.DataFrame
        One row per game, with "Home_Team" and "Away_Team" columns.
        Side effect: integer "Home_Team_Code" and "Away_Team_Code" columns
        are added to this frame in place.

    Returns
    -------
    pandas.DataFrame
        ``single_game_df`` (including the two code columns) with one extra
        "T-<team>" column per distinct team. Each row holds 1.0 in the
        columns of its home and away team and 0.0 elsewhere (2.0 if both
        sides name the same team).
    """
    home_teams = np.asarray(single_game_df["Home_Team"])
    away_teams = np.asarray(single_game_df["Away_Team"])

    # Converting teams to integer encoding: a code is the team's position in
    # the sorted list of every team seen on either side.
    team_classes = np.unique(np.concatenate([home_teams, away_teams]))
    team_lookup = pd.Index(team_classes)
    home_codes = team_lookup.get_indexer(home_teams)
    away_codes = team_lookup.get_indexer(away_teams)
    single_game_df["Home_Team_Code"] = home_codes
    single_game_df["Away_Team_Code"] = away_codes

    # Both one-hot matrices share the same width and column order (one column
    # per entry of team_classes), so they can be added safely even when some
    # team only ever appears as home or only as away.
    identity = np.eye(len(team_classes))
    combined_vector = identity[away_codes] + identity[home_codes]
    combined_df = pd.DataFrame(data=combined_vector, index=single_game_df.index,
                               columns=["T-%s" % e for e in team_classes])

    merged_df = pd.concat([single_game_df, combined_df], axis=1)
    return merged_df

In [5]:
# Original training set given by NBA
training_filename = "training_set.csv"
original_df = pd.read_csv(training_filename)
# Reshape to one row per game, using the frame that was just loaded
full_df = create_single_game_dimension(original_df=original_df)
# There are some N/A values (e.g. a country with no row for a game) - we replace them with 0
full_df = full_df.fillna(0)
# Intermediate saving since create_single_game_dimension takes a few seconds to run
full_df.to_csv("training_set_games.csv", index=True)
full_df = pd.read_csv("training_set_games.csv", index_col=0)
# Append the per-team "T-" indicator columns
team_df = create_team_vector(full_df)