# Augmenting music
The music is the performance history of a horse before a race. For each race run by the horse, the music provides the result position and the type of the race. To give more information on the performances to the prediction models, we want to *augment the music* by manually recomposing the music with the races data, and adding new fields. 

In [32]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import re

In [33]:
df = pd.read_csv("../data/interim/performances.csv", index_col=0)
df.head()

  df = pd.read_csv("../data/interim/performances.csv", index_col=0)


Unnamed: 0,raceId,date,horse.genyId,musique,results.position,priceFirst
0,774350,1451650500000,2410353.0,3aDmDmDa4mDaDa8a7m6a,,16200.0
1,774350,1451650500000,2392855.0,5mDm4mDmDm3m(14)5m1m3m,,16200.0
2,774350,1451650500000,2368443.0,1m6mDa0a1m5m3mDm(14)Da,6.0,16200.0
3,774350,1451650500000,2375355.0,5m4m2m0m4m7a3m3mDm4m,1.0,16200.0
4,774350,1451650500000,2374477.0,0m6a7mDa4aDaDa8a7aDa,5.0,16200.0


## Preprocessing


### Date extraction from string data

In [34]:
def extract_date(raceId):
    print(raceId)
    date_word = re.search(r'(?<=\:)(.*?)(?=\/)', raceId).group(0)
    result = pd.to_datetime(date_word, format="%m%d%Y")
    
    return result
extract_date("pmu_race:01012016/R1/C1")

pmu_race:01012016/R1/C1


Timestamp('2016-01-01 00:00:00')

Date ms to datetime

In [35]:
df["date"] = pd.to_datetime(df["date"], unit="ms")

### Results position
The result positions of the performances are biaised by the number of the competing horses in the race.  

In [36]:
def clean_results_position(results_position):
    if pd.isnull(results_position):
        return 10
    else:
        return (max(int(results_position), 10))

df["results.position"] = df["results.position"].apply(clean_results_position)

##  Music augmentation

## Music cleaning

In [37]:
import re
def clean_music_to_list(musique):
    '''Nettoie la musique (retire les lettres et supprime les nombres entre parenthèses)
    '''
    if pd.isnull(musique):
      return []
    
    musique = re.sub(r'\([^)]*\)', "", musique)
    musique = re.sub("[^1-9]", " ", musique)
    musique = musique.split()
    return musique

df["cleaned_music"] = df["musique"].apply(clean_music_to_list)

In [38]:
df.head()

Unnamed: 0,raceId,date,horse.genyId,musique,results.position,priceFirst,cleaned_music
0,774350,2016-01-01 12:15:00,2410353.0,3aDmDmDa4mDaDa8a7m6a,10,16200.0,"[3, 4, 8, 7, 6]"
1,774350,2016-01-01 12:15:00,2392855.0,5mDm4mDmDm3m(14)5m1m3m,10,16200.0,"[5, 4, 3, 5, 1, 3]"
2,774350,2016-01-01 12:15:00,2368443.0,1m6mDa0a1m5m3mDm(14)Da,10,16200.0,"[1, 6, 1, 5, 3]"
3,774350,2016-01-01 12:15:00,2375355.0,5m4m2m0m4m7a3m3mDm4m,10,16200.0,"[5, 4, 2, 4, 7, 3, 3, 4]"
4,774350,2016-01-01 12:15:00,2374477.0,0m6a7mDa4aDaDa8a7aDa,10,16200.0,"[6, 7, 4, 8, 7]"


In [39]:
musics = pd.DataFrame()
selected_columns = ['results.position', 'priceFirst', 'date']
def get_music(horse):
  first_music = horse['cleaned_music'].iloc[0]
  augmented_music = horse[['results.position', 'priceFirst', 'date']].values
  if first_music:
    augmented_music = np.vstack([[[float(result), np.NaN , np.NaN] for result in first_music], augmented_music])
  return augmented_music

In [40]:
musique = df.groupby(['horse.genyId']).apply(get_music)

In [42]:
musique.to_csv("../data/interim/musique.csv")