In [None]:
import pandas as pd
import numpy as np
import psycopg2

# Show every column when displaying a DataFrame instead of truncating wide frames.
pd.set_option('display.max_columns', None)

In [None]:
# Open a psycopg2 connection to the "bloodmoneydb" database on localhost.
# NOTE(review): the credentials are hard-coded here; consider reading them from
# the environment or a config file rather than keeping them in the notebook.
conn = psycopg2.connect(
        host="localhost",
        database="bloodmoneydb",
        user="postgres",
        password="password"
)

In [None]:
# Select every column and row of joined_fight_event_fighters_data.
sql_query = """
    SELECT *
    FROM joined_fight_event_fighters_data
"""

# Run the query over the open connection and load the result into a DataFrame.
df = pd.read_sql_query(sql_query, con=conn)

Possible avenues for feature engineering and preprocessing:

* Encode whether a given fighter is local (city/state) or from the same country that is hosting an event

* Age at time of the fight

* Win streak (can encode a losing streak as a negative win streak)

* Define a way to measure the typical length of a fighter's fights

In [None]:
def convert_event_date_and_fighter_dob_to_datetime(df, date_col='date', dob_col='dob'):
    '''
    Convert the event date and fighter date-of-birth columns to pandas datetime.
    Input:
        df: pd.DataFrame
        date_col: str (Column name for event date)
        dob_col: str (Column name for date of birth)
    Output:
        df: pd.DataFrame
            The same DataFrame, modified in place, with both columns converted
            by pd.to_datetime
    '''
    # dob_col used to default to 'date', so a call relying on the defaults
    # converted the event date twice and left the dob column untouched.
    for col in (date_col, dob_col):
        df[col] = pd.to_datetime(df[col])

    return df

def calculate_age_of_fighter(df, date_col='date', dob_col='dob'):
    '''
    Calculate the age of the fighter in whole years at the time of the event:
    Input:
        df: pd.DataFrame
        date_col: str (Column name for event date)
        dob_col: str (Column name for date of birth)
    Output:
        df: pd.DataFrame
            'age' column is the number of full years between the date of birth
            and the event date (NaN where either date is missing)
    '''
    # Pass the caller's column names through; they were previously ignored in
    # favour of the hard-coded 'date' and 'dob'.
    df = convert_event_date_and_fighter_dob_to_datetime(df=df, date_col=date_col, dob_col=dob_col)
    event_date = df[date_col]
    birth_date = df[dob_col]

    # A plain year difference overstates the age by one whenever the event
    # falls before the fighter's birthday in that calendar year, so compare
    # (month, day) encoded as month * 100 + day and subtract one in that case.
    event_month_day = event_date.dt.month * 100 + event_date.dt.day
    birth_month_day = birth_date.dt.month * 100 + birth_date.dt.day
    birthday_pending = (event_month_day < birth_month_day).astype(int)
    df['age'] = event_date.dt.year - birth_date.dt.year - birthday_pending

    return df

In [None]:
# Add the 'age' column, relying on the function's default 'date' and 'dob' column names.
df = calculate_age_of_fighter(df=df)

The event data doesn't provide the state an event is held in, for the USA or other countries. Without geocoding the cities to their respective states, it isn't possible to expand the definition of local to include fighters from the same state as the event. It's not clear the juice is worth the squeeze in figuring out how to geocode all the cities (including the international ones). This remains an option to return to, however, depending on how relevant the theoretical hometown effect turns out to be in the modeling.

In [None]:
def fighter_is_local(row):
    '''
    Define whether a given fighter is local, i.e. whether the city of the event
    matches either the city they're fighting out of or the city they were born in.
    Input:
        row: mapping or pd.Series with 'city', 'foo_city' and 'born_city' entries
    Output:
        bool: True when the event city equals 'foo_city' or 'born_city';
              False otherwise, including when the event city is missing
    '''
    event_city = row['city']
    fighter_cities = (row['foo_city'], row['born_city'])

    # A missing event city cannot match anything. Without this guard,
    # None == None would flag fighters with missing city data as local.
    if pd.isna(event_city):
        return False

    return event_city in fighter_cities

    

def fighter_is_national(row):
    '''
    Define whether a given fighter's represented nation is the same nation hosting the event.
    Input:
        row: mapping or pd.Series with 'country', 'foo_country' and 'born_country' entries
    Output:
        bool: True when the event country equals 'foo_country' or 'born_country';
              False otherwise, including when the event country is missing
    '''
    event_country = row['country']
    fighter_countries = (row['foo_country'], row['born_country'])

    # A missing event country cannot match anything. Without this guard,
    # None == None would flag fighters with missing country data as national.
    if pd.isna(event_country):
        return False

    return event_country in fighter_countries

In [None]:
# Evaluate each flag row by row (axis=1) and store the results as new columns.
df['is_local'] = df.apply(fighter_is_local, axis=1)
df['is_national'] = df.apply(fighter_is_national, axis=1)