## Reading and Filtering Data

In [1]:
import pandas as pd
import numpy as np
import datetime

In [2]:
'''
Read in datasets from .csv's saved in same directory
Change all column names to lowercase

Initiation.csv downloaded Jan 5, 2020
from https://datacatalog.cookcountyil.gov/Courts/Initiation/7mck-ehwz

Dispositions.csv downloaded Jan 5, 2020
from https://datacatalog.cookcountyil.gov/Courts/Dispositions/apwk-dzx8

MHI_true.csv created via MHI_building.ipynb notebook
'''

initiation = pd.read_csv('Initiation.csv', low_memory=False)
initiation.columns = [x.lower() for x in initiation.columns]

dispositions = pd.read_csv('Dispositions.csv', low_memory=False)
dispositions.columns = [x.lower() for x in dispositions.columns]

sentencing = pd.read_csv('Dispositions.csv', low_memory=False)
sentencing.columns = [x.lower() for x in sentencing.columns]

MHI_true = pd.read_csv('MHI_true.csv', low_memory=False)
MHI_true.columns = [x.lower() for x in MHI_true.columns]

In [3]:
'''
Filter initiation dataset 
Keep only the initiation records with
case_participant_id's that exist in either sentencing or dispositions
'''

#save cp id values as lists
init_ids = list(initiation.case_participant_id.values)
dispo_ids = list(dispositions.case_participant_id.values)
sent_ids = list(sentencing.case_participant_id.values)

#create list of unique cp ids that are in both init & dispo
ids_to_keep = list(set(init_ids).intersection(set(dispo_ids).union(set(sent_ids))))

#create filtered df with only records from the intersection of cp ids
init_filtered = initiation[initiation['case_participant_id'].isin(ids_to_keep)]

del dispositions
del sentencing

## Data Cleaning

In [4]:
#create new df to work on for processing/cleaning
init_clean = init_filtered.copy()

del init_filtered

In [5]:
'''
Cleaning age_at_incident column
'''

#print unique values of age
print('Unique age values: ', init_clean['age_at_incident'].unique())

#turn nan to '0' in order to process into ints
init_clean.loc[init_clean['age_at_incident'].isnull(), 'age_at_incident'] = '0'

#change type to int
init_clean.age_at_incident = init_clean.age_at_incident.astype(int)

#look at unique values again
print('Unique age values after cleaning: ', init_clean['age_at_incident'].unique())

#how many values do we think are messed up?
print('Number of records with null age: ', len(init_clean[init_clean['age_at_incident']==0]))
print('Number of records with age > 100: ', len(init_clean[init_clean['age_at_incident']>100]))

Unique age values:  [ 22.  nan  29.  34.  27.  41.  17.  20.  25.  59.  19.  43.  31.  23.
  32.  18.  30.  21.  28.  39.  26.  46.  37.  44.  35.  49.  42.  24.
  52.  51.  38.  33.  57.  58.  45.  48.  50.  65.  47.  53.  40.  60.
  36.  56.  55.  67.  64.  54.  63.  70.  62.  61.  71.  66.  74.  75.
  69.  68.  73.  72. 111.  85.  84.  86.  78.  77.  76.  79. 156.  81.
  80.  82.  83. 112. 113. 114. 120. 127. 115. 130. 116.  96.  87. 117.
 125. 118. 119.]
Unique age values after cleaning:  [ 22   0  29  34  27  41  17  20  25  59  19  43  31  23  32  18  30  21
  28  39  26  46  37  44  35  49  42  24  52  51  38  33  57  58  45  48
  50  65  47  53  40  60  36  56  55  67  64  54  63  70  62  61  71  66
  74  75  69  68  73  72 111  85  84  86  78  77  76  79 156  81  80  82
  83 112 113 114 120 127 115 130 116  96  87 117 125 118 119]
Number of records with null age:  10954
Number of records with age > 100:  37


In [6]:
'''
Create 2 new age-related binary features: age > 100, age_ unknown
Use median to impute missing or presumed-incorrect age values
'''

#creating new binary columns for age over 100 and age unknown (1 = true)
init_clean['age_over_100'] = (init_clean.age_at_incident > 100).astype(int) 
init_clean['age_unknown'] = (init_clean.age_at_incident == 0).astype(int)

#replacing > 100 and 0 values with nan, calculating median, then changing nan to mean
init_clean.age_at_incident.replace(0, np.NaN, inplace=True)
a = np.array(init_clean['age_at_incident'].values.tolist())
init_clean['age_at_incident'] = np.where(a > 100, np.nan, a).tolist()
print('Number of records with missing/incorrect age: ', init_clean.age_at_incident.isna().sum())
median = init_clean['age_at_incident'].median()
print('Original median age: ', median)
print('Original mean age: ', init_clean['age_at_incident'].mean())
init_clean.age_at_incident.replace(np.NaN, median, inplace=True) 
print('New median age: ', median)
print('New mean age: ', init_clean['age_at_incident'].mean())

Number of records with missing/incorrect age:  10991
Original median age:  29.0
Original mean age:  32.10448913979419
New median age:  29.0
New mean age:  32.058614903926305


  del sys.path[0]


In [7]:
'''
Convert time features to datetime objects
'''

#list of time features
time_features = ['event_date', 'incident_begin_date', 'arrest_date', 
                 'received_date', 'arraignment_date', 'incident_end_date']

for col in time_features:
    #turn nan to '01/01/1900 12:00:00 AM' so can process them as strings
    init_clean.loc[init_clean[col].isnull(), col] = '01/01/1900 12:00:00 AM'

    #make sure everything is strings
    init_clean[col] = init_clean[col].astype(str)

    #change type to datetime
    init_clean[col] = init_clean[col].map(lambda x: pd.datetime.strptime(x, '%m/%d/%Y %I:%M:%S %p'))

In [8]:
'''
Clean gender column
'''

#look at unique values
print(init_clean['gender'].unique())

#how many are messed up?
print('Number of records with gender=Male name, no gender given: ', len(init_clean[init_clean['gender']=='Male name, no gender given']))
print('Number of records with gender=Unknown: ', len(init_clean[init_clean['gender']=='Unknown']))
print('Number of records with gender=Unknown Gender: ', len(init_clean[init_clean['gender']=='Unknown Gender']))
print('Number of records with null Gender: ', len(init_clean[init_clean['gender'].isnull()]))

#turning all except 'Male' and 'Female' to 'Unknown'
init_clean.loc[init_clean['gender'].isnull(), 'gender'] = 'Unknown'
init_clean.loc[init_clean['gender']=='Male name, no gender given', 'gender'] = 'Unknown'
init_clean.loc[init_clean['gender']=='Unknown Gender', 'gender'] = 'Unknown'

['Male' 'Female' nan 'Male name, no gender given' 'Unknown Gender'
 'Unknown']
Number of records with gender=Male name, no gender given:  4
Number of records with gender=Unknown:  7
Number of records with gender=Unknown Gender:  2
Number of records with null Gender:  2722


In [9]:
'''
Clean race column
'''

#look at unique values
print(init_clean['race'].unique())

#change all to lower, to combine ASIAN and Asian
init_clean['race']=init_clean['race'].str.lower()

#look at unique values again
print(init_clean['race'].unique())

#how many are messed up?
print(init_clean['race'].value_counts())
print('Number of records with null race: ', len(init_clean[init_clean['race'].isnull()]))

#change nan to 'unknown'
init_clean['race'].fillna('unknown', inplace = True)

['Black' 'HISPANIC' 'White [Hispanic or Latino]'
 'White/Black [Hispanic or Latino]' 'White' nan 'Unknown' 'Asian'
 'Biracial' 'American Indian' 'Albino' 'ASIAN']
['black' 'hispanic' 'white [hispanic or latino]'
 'white/black [hispanic or latino]' 'white' nan 'unknown' 'asian'
 'biracial' 'american indian' 'albino']
black                               494039
white [hispanic or latino]          128261
white                                97785
hispanic                              9259
asian                                 5017
white/black [hispanic or latino]      4011
unknown                               1119
american indian                        385
biracial                               102
albino                                   1
Name: race, dtype: int64
Number of records with null race:  3825


In [16]:
'''
Convert charge_count to int
'''

#change type to int
init_clean.charge_count = init_clean.charge_count.astype(int)

In [17]:
'''
Convert all non-id string features to lowercase
'''

string_cols = ['offense_category', 'charge_offense_title', 'chapter', 'act', 
               'section', 'class', 'aoic', 'event', 'gender', 'race', 
               'law_enforcement_agency', 'unit', 'incident_city', 'updated_offense_category']

for col in string_cols:
    init_clean[col]=init_clean[col].str.lower()
    #note that str.lower() automatically ignores null values

In [18]:
'''
Convert all remaining null values to 'unknown'
'''

#where are there still null values?
print(init_clean.columns[init_clean.isna().any()].tolist())

#all null values are in string columns
#replace remaining null values with 'unknown' since this seems to be what Cook County uses
init_clean.fillna('unknown', inplace = True)

['act', 'section', 'event', 'law_enforcement_agency', 'unit', 'incident_city']


In [19]:
'''
FEATURE ENGINEERING: Binary 402 Indicator Feature

If an entry in section contains 402, then 402 column value will be 1. Else, it will be 0.

'''
# initialize empty column 402 
init_clean["402"] = 0

def sectionconverter(x): 
    if "402" in x: 
        new_row = 1
    else: 
        new_row = 0
    return new_row

init_clean['402'] = init_clean['section'].apply(sectionconverter)

## Groupby and Aggregate

In [21]:
'''
Create dummy variables for categorical columns
'''

init_clean_copy = init_clean.copy()

#create list of all categorical columns to get dummies for
#excluding ID numbers, numerical features, datetime features, binary features
cat_cols = ['offense_category', 'charge_offense_title', 'chapter', 'act', 
            'section', 'class', 'aoic', 'event', 'gender', 'race', 
            'law_enforcement_agency', 'unit', 'incident_city', 'updated_offense_category']
dummy_init = pd.get_dummies(init_clean_copy, columns=cat_cols)

del init_clean_copy

In [22]:
'''
Drop datetime features and the ID numbers we aren't using
Datetime features will be reintroduced after aggregation

List of time features defined in cleaning section:
time_features = ['event_date', 'incident_begin_date', 'arrest_date', 
                 'received_date', 'arraignment_date', 'incident_end_date']
'''

#list of id features, including primary_charge flag
id_features = ['case_id', 'primary_charge', 'charge_id', 'charge_version_id']

to_drop = time_features + id_features
dummy_init = dummy_init.drop(columns=to_drop)

print(dummy_init.shape)

(743804, 5609)


In [23]:
'''
Group by CP ID
'''

init_grouped = dummy_init.groupby('case_participant_id')

In [24]:
'''
Preparing for aggregation
Checking which features are the same for every unique CP ID
'''

clean_groups = init_clean.groupby('case_participant_id')
same_within_group = []
diff_within_group = []
for col in init_clean.columns:
    if any(clean_groups[col].nunique()>1):
        diff_within_group.append(col)
    else:
        same_within_group.append(col)

print('same: ', same_within_group)
print('different: ', diff_within_group)

same:  ['case_id', 'case_participant_id', 'offense_category', 'event', 'event_date', 'age_at_incident', 'gender', 'race', 'incident_begin_date', 'incident_end_date', 'arrest_date', 'law_enforcement_agency', 'unit', 'incident_city', 'received_date', 'arraignment_date', 'updated_offense_category', 'age_over_100', 'age_unknown']
different:  ['primary_charge', 'charge_id', 'charge_version_id', 'charge_offense_title', 'chapter', 'act', 'section', 'class', 'aoic', 'charge_count', '402']


In [25]:
'''
Creating dictionary for aggregation

First, create "short" dictionary:
Keys are columns in initiation, values are aggregation methods
'''

agg_dict_short = {}

#we take the median for anything that's always the same within unique CP ID groups
#excluding datetime features and case_id
#note that every individual value should be equal to the median
to_median = ['offense_category', 'event', 'age_at_incident', 
             'gender', 'race','law_enforcement_agency', 'unit', 'incident_city',
             'updated_offense_category', 'age_over_100', 'age_unknown']
agg_dict_short = {x : 'median' for x in to_median}

#we take the highest charge_count to represent the total number of charges
agg_dict_short['charge_count'] = 'max'

#we take the highest "402" to maintain binary feature
agg_dict_short['402'] = 'max'

#list of categorical features that vary within unique case_participant_id groups, excluding id numbers
to_sum = ['charge_offense_title', 'chapter', 'act', 'section', 'class', 'aoic']
#we will sum these features when we aggregrate their dummies
for item in to_sum:
    agg_dict_short[item] = 'sum'

print(agg_dict_short)

{'offense_category': 'median', 'event': 'median', 'age_at_incident': 'median', 'gender': 'median', 'race': 'median', 'law_enforcement_agency': 'median', 'unit': 'median', 'incident_city': 'median', 'updated_offense_category': 'median', 'age_over_100': 'median', 'age_unknown': 'median', 'charge_count': 'max', '402': 'max', 'charge_offense_title': 'sum', 'chapter': 'sum', 'act': 'sum', 'section': 'sum', 'class': 'sum', 'aoic': 'sum'}


In [26]:
'''
Creating dictionary for aggregation

Use "short" dictionary to create "long" dictionary:
Keys are columns in dummy_init, values are aggregation methods
'''

agg_dict_long = {}

#loop through every key, val pair in agg_dict_short
for key, val in agg_dict_short.items():

    #check if key is not in the list of cat_cols that got turned into dummy variables
    if key not in cat_cols:

        #add key, val pair to agg_dict_long
        agg_dict_long[key] = val

    else:

        #loop through every column in dummy_init
        for col in dummy_init:

            #check if the key is a substring of the dummy_init column name
            if key+'_' in col:

                #add val to agg_dict_long with the dummy_init column name as new key
                agg_dict_long[col] = val

del dummy_init

In [27]:
'''
Aggregate the post-dummy groupby object
'''

init_squish = init_grouped.aggregate(agg_dict_long)
# note that index is now cp id

In [28]:
'''
Append MHI column to aggregated dataframe
'''

#initialize MHI column to false
init_squish['MHI'] = 0

#change MHI to 1 for the cp ids which appear in MHI_true
init_squish.loc[init_squish.index.isin(MHI_true.case_participant_id.values), 'MHI']=1

print('Number of positive cases in aggregated dataset: ', len(init_squish[init_squish['MHI']==1]))
print('Number of negative cases in aggregated dataset: ', len(init_squish[init_squish['MHI']==0]))

Number of positive cases in aggregated dataset:  2212
Number of negative cases in aggregated dataset:  286882


In [29]:
'''
Investigate discrepancy between number of positive cases 
in MHI_true vs. init_squish
'''

missing_positives = list(set(MHI_true.case_participant_id.values).difference(set(init_squish.index.values)))
print(len(missing_positives))
print(any(initiation.case_participant_id.isin(missing_positives)))
#'False' confirms that none of the 'missing positives' are present in initiation

227
False


In [30]:
'''
Adding datetime features to aggregated dataframe
Could not be aggregated because they are non-numeric
Easily reintroduced because they are always the same within unique CP ID groups

List of time features defined in cleaning section:
time_features = ['event_date', 'incident_begin_date', 'arrest_date', 
                 'received_date', 'arraignment_date', 'incident_end_date']
                 
Also adding location-related categorical features
As with time features, these are always the same within unique CP ID groups
Currently duplicates the information from dummy variables
Will be used for feature engineering, then removed
'''

#making df copies to work on
total_df = init_squish.copy()
time_df = init_clean.copy()

#make list of columns to keep/delete from init_clean
#keep time features, cp id, non-dummy location features
to_keep = time_features + ['case_participant_id', 'incident_city', 'unit']
to_delete  = list(set(time_df.columns) - set(to_keep))

#drop columns in to_delete list
time_df.drop(columns=to_delete, inplace=True)

#drop rows with with duplicate cp id's
time_df.drop_duplicates(subset='case_participant_id', keep='first', inplace=True)

#confirm that time_df now has as many rows as init_squish
print(len(init_squish)==len(time_df))
#test works, prints True

del init_squish
del init_clean

#set cp id to index of time_df
time_df.set_index('case_participant_id', inplace=True)

#join our time and location columns from time_df onto total_df
total_df = total_df.join(time_df, on='case_participant_id')

del time_df

print(total_df.shape)

True
(289094, 5617)


## Feature Engineering

In [31]:
''' 
Creating a binary 'weekday' feature: 0 = Sat/Sun, 1 = M/T/W/Th/F
'''

#converting to datetime and replacing 1900 with NaT
arrest_date_dt = pd.to_datetime(total_df.arrest_date) 
arrest_date_dt = arrest_date_dt.replace(datetime.datetime(1900, 1, 1, 0, 00), np.nan)

#instantiating new column, converting NaT to np.nan, checking if weekday 
total_df['weekday'] = arrest_date_dt 
total_df['weekday'] = np.where(total_df.weekday.isnull(), np.nan, total_df.weekday.dt.dayofweek < 5)

total_df.weekday.head()

case_participant_id
260122253823    0.0
272161011760    1.0
864286527653    1.0
882206007016    1.0
882242005211    0.0
Name: weekday, dtype: float64

In [32]:
'''
Creating a 'season' feature: spring, summer, fall, winter

'''
from datetime import date, datetime

arrest_date_dt = pd.to_datetime(total_df.arrest_date) #converting to datetime
total_df['day_of_year'] = arrest_date_dt.dt.dayofyear #making day of year column with ordinal date

#checking for season
total_df['season'] = np.where(total_df['day_of_year'] <= 80, 'winter', total_df['day_of_year'])
total_df['season'] = np.where((173 > total_df['day_of_year']) &  (total_df['day_of_year'] > 80), 'spring', total_df['season'])
total_df['season'] = np.where((267 > total_df['day_of_year']) &  (total_df['day_of_year'] >= 173), 'summer', total_df['season'])
total_df['season'] = np.where((356 > total_df['day_of_year']) &  (total_df['day_of_year'] >= 267), 'fall', total_df['season'])
total_df['season'] = np.where(356 <= total_df['day_of_year'], 'winter', total_df['season'])

total_df = total_df.drop(columns = 'day_of_year') #dropping day of year column  
total_df = pd.get_dummies(total_df, columns = ['season']) #create dummy variables
total_df.head()

Unnamed: 0_level_0,offense_category_aggravated assault police officer,offense_category_aggravated assault police officer firearm,offense_category_aggravated battery,offense_category_aggravated battery police officer,offense_category_aggravated battery police officer firearm,offense_category_aggravated battery with a firearm,offense_category_aggravated discharge firearm,offense_category_aggravated dui,offense_category_aggravated fleeing and eluding,offense_category_aggravated identity theft,...,arrest_date,unit,incident_city,received_date,arraignment_date,weekday,season_fall,season_spring,season_summer,season_winter
case_participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
260122253823,0,0,0,0,0,0,0,0,0,0,...,2011-05-22 18:51:00,unknown,unknown,2011-05-24,2011-07-11 00:00:00,0.0,0,1,0,0
272161011760,0,0,0,0,0,0,0,0,0,0,...,2009-07-14 14:34:00,unknown,unknown,2012-01-27,1900-01-01 00:00:00,1.0,0,0,1,0
864286527653,0,0,0,0,0,0,0,0,0,0,...,2010-04-07 20:44:00,unknown,unknown,2011-01-31,2010-05-19 00:00:00,1.0,0,1,0,0
882206007016,0,0,0,0,0,0,0,0,0,0,...,2010-12-15 20:40:00,unknown,chicago,2011-01-31,2011-01-31 00:00:00,1.0,1,0,0,0
882242005211,0,0,0,0,0,0,0,0,0,0,...,2011-07-17 19:05:00,unknown,chicago,2011-07-17,1900-01-01 00:00:00,0.0,0,0,1,0


In [33]:
'''
Creating an 'incident length' feature which is the difference between the incident begin and end date. 
If either is null, then incident length = 0.
'''

#convert begin and end dates to datetime
begin_date_dt = pd.to_datetime(total_df.incident_begin_date)
begin_date_dt = begin_date_dt.replace(datetime(1900, 1, 1, 0, 00), np.nan) #replacing 1900 with NaT
end_date_dt = pd.to_datetime(total_df.incident_end_date)
end_date_dt = end_date_dt.replace(datetime(1900, 1, 1, 0, 00), np.nan) #replacing 1900 with NaT

#create incident_length column by subtracting, this will be NaT if either is NaT
total_df['incident_length'] = (end_date_dt - begin_date_dt)

#replace NaT with 0 
total_df['incident_length'] = np.where(total_df.incident_length.isnull(), 0, total_df.incident_length)

#convert all timestamps to days 
total_df['incident_length'] = total_df.incident_length.dt.days

total_df.incident_length.value_counts().head()

0    279416
1      1998
2       394
3       245
4       197
Name: incident_length, dtype: int64

In [None]:
'''
Geoencoding incident city to create latitude and longitude columns.
'''

from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim

#creating array of unique cities and adding state to city names in list
uniqueCities = total_df['incident_city'].unique()
uniquePlaces =  [(i + ", Illinois") for i in uniqueCities]
#drop Unknown and convert to df
uniquePlaces_df = pd.DataFrame(uniquePlaces[1:])

#geoencode unique list
#1 - convenient function to delay between geocoding calls
locator = Nominatim(user_agent='myGeocoder')
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)
#2 - create location column
uniquePlaces_df['location'] = uniquePlaces_df[0].apply(geocode)
#3 - create longitude, laatitude and altitude from location column (returns tuple)
uniquePlaces_df['point'] = uniquePlaces_df['location'].apply(lambda loc: tuple(loc.point) if loc else None)
#4 - split tuple into longitude and latitude (and altitude)
uniquePlaces_df[['latitude', 'longitude', 'altitude']] = pd.DataFrame(uniquePlaces_df['point'].tolist(), index=uniquePlaces_df.index)

#remove state from city names
uniquePlaces_df[0]=uniquePlaces_df[0].str[:-10]

#set default values (will remain 0 if incident_city is unknown)
total_df['latitude'] = 0
total_df['longitude'] = 0

#assign appropriate long/lat
for i in range(len(uniquePlaces_df)):
    samePlace = uniquePlaces_df[0][i]
    lat = uniquePlaces_df['latitude'][i]
    lon = uniquePlaces_df['longitude'][i]
    
    #update lat and lon if df['incident_city']==uniquePlaces_df['0']
    total_df['latitude'] = np.where((total_df.incident_city == samePlace), lat, total_df.latitude)
    total_df['longitude'] = np.where((total_df.incident_city == samePlace), lon, total_df.longitude)

del uniquePlaces_df

total_df.head(20)

RateLimiter caught an error, retrying (0/2 tries). Called with (*('palatine, Illinois',), **{}).
Traceback (most recent call last):
  File "/Users/kelseymarkey/anaconda3/lib/python3.7/urllib/request.py", line 1317, in do_open
    encode_chunked=req.has_header('Transfer-encoding'))
  File "/Users/kelseymarkey/anaconda3/lib/python3.7/http/client.py", line 1229, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "/Users/kelseymarkey/anaconda3/lib/python3.7/http/client.py", line 1275, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
  File "/Users/kelseymarkey/anaconda3/lib/python3.7/http/client.py", line 1224, in endheaders
    self._send_output(message_body, encode_chunked=encode_chunked)
  File "/Users/kelseymarkey/anaconda3/lib/python3.7/http/client.py", line 1016, in _send_output
    self.send(msg)
  File "/Users/kelseymarkey/anaconda3/lib/python3.7/http/client.py", line 956, in send
    self.connect()
  File "/Users/kelseymar

In [None]:
total_df.to_csv('total_df.csv')

### KM Notes: 
- Does it look weird that we create new df and then delete? I know this is an artifact of before when we exported datasets at intermediary stages, but now that its all combined should we fix. Perhaps not because the dataframes get renamed to more applicable things at each step? 
- Check later if we reintroduce primary_charge?? Could be interesting to keep in but we delete in second cell in groupby/aggregate section
- In first cell of Feature Engineering the Weekday column is floats. Do they need to be ints? Might be hard with NaNs


*** RERUN FROM START ONCE DONE!!! CHECK SEASON HEAD!! 