In [218]:
# Data handling and categorical-encoding imports used by the cells below.
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
# Raise pandas' display limits (line width, number of columns, number of rows).
pd.set_option('display.width', 260)
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 80)
from category_encoders import TargetEncoder
from IPython.display import display

# Relative path prefix for data files.
# NOTE(review): the visible cells pass the literal '../' to v75() instead of using this variable.
pref='../'

In [219]:

class v75():
    """Load, clean and split the race data stored in a CSV file.

    Attributes:
        pref: Path prefix that was prepended to the file name.
        filnamn: Full path (``pref + filnamn``) of the CSV file.
        df: The data as read from disk (plus anything added via ``concat``).
        work_df: Working copy of ``df`` that ``prepare_data`` cleans and
            ``train_test_split`` splits.
    """

    # Columns dropped by ``_remove_features`` when no explicit list is given.
    _DEFAULT_REMOVE = ('startnr', 'vodds', 'podds', 'bins', 'h1_dat',
                       'h2_dat', 'h3_dat', 'h4_dat', 'h5_dat')

    def __init__(self, filnamn='all_data.csv', pref=''):
        """Read ``pref + filnamn`` into ``self.df`` and create the working copy."""
        self.pref = pref
        self.filnamn = pref + filnamn
        self.df = self.load_df()
        self.work_df = self.df.copy()

    def _remove_features(self, remove=None):                 # use in prepare_data()
        """Drop the columns in ``remove`` from ``self.work_df`` and return it.

        Args:
            remove: Column label(s) to drop. ``None`` (the default) drops the
                columns listed in ``_DEFAULT_REMOVE``.

        Raises:
            KeyError: If a requested column is not present (e.g. on a second call).
        """
        columns = list(self._DEFAULT_REMOVE) if remove is None else remove
        # Re-assign rather than drop in place so a filtered slice is never mutated.
        self.work_df = self.work_df.drop(columns, axis=1)

        return self.work_df

    def prepare_data(self): # This is the general preparation of data
        """Apply the general clean-up to ``self.work_df`` and return it.

        Steps: drop the listed dates, strip the ``-N`` suffix from the
        ``hX_bana`` columns, lower-case the name columns, and drop the
        default feature columns. Calling it twice on the same working frame
        raises ``KeyError`` because the dropped columns are already gone.
        """
        # Drop rounds (omgångar) that are missing divisions (avdelningar).
        saknas = ['2015-08-15', '2016-08-13', '2017-08-12']
        # .copy() gives an independent frame, so the column assignments below
        # do not write into a slice of the previous frame.
        self.work_df = self.work_df[~self.work_df.datum.isin(saknas)].copy()

        hx_bana = [f'h{i}_bana' for i in range(1, 6)]
        hx_kusk = [f'h{i}_kusk' for i in range(1, 6)]

        # Remove the number from hX_bana (i.e. Åby-1 -> Åby, etc).
        for col in hx_bana:
            self.work_df[col] = self.work_df[col].str.split('-').str[0]

        # Lower case for häst, bana, kusk, hX_kusk and hX_bana.
        for col in ['häst', 'bana', 'kusk'] + hx_kusk + hx_bana:
            self.work_df[col] = self.work_df[col].str.lower()

        self._remove_features()

        return self.work_df

    def concat(self, ny_df):
        """Append the rows of ``ny_df`` to ``self.df`` and reset ``self.work_df``.

        ``ny_df`` must have exactly the same columns, in the same order, as
        ``self.df``. The working copy is replaced by a fresh copy of the
        combined frame, so any earlier preparation of ``work_df`` is discarded.
        The row index is not reset by the concatenation.

        Returns:
            The combined ``self.df``.
        """
        features = list(self.df.columns)
        assert set(features) == set(list(ny_df.columns)), 'Features in ny_df is not the same as in self.df'
        assert features == list(ny_df.columns), 'Features in ny_df and self.df are not equal'

        self.df = pd.concat([self.df, ny_df], axis=0)
        self.work_df = self.df.copy()
        return self.df

    def train_test_split(self, test_size=0.2):
        """Split ``self.work_df`` into train and test sets by date.

        The distinct ``datum`` values are taken in order of first appearance;
        the first ``int(n_dates * (1 - test_size))`` of them form the training
        set and the rest the test set.
        NOTE(review): this presumably relies on the rows being ordered
        chronologically - confirm against the data file.

        Args:
            test_size: Share of the distinct dates to put in the test set.

        Returns:
            ``(X_train, X_test, y_train, y_test)`` where the ``plac`` column has
            been removed from the X frames and y is 1 where ``plac == 1``, else 0.

        Raises:
            AssertionError: If the share of test rows differs from ``test_size``
                by more than 0.005.
        """
        datumar = self.work_df.datum.unique()
        antal = len(datumar)
        antal_train = int(antal*(1-test_size))
        datum_train = datumar[:antal_train]
        print('Antal train datum: ', antal_train)
        print('Antal test datum: ', antal-antal_train)
        print('Antal datum totalt: ', antal)

        is_train = self.work_df.datum.isin(datum_train)
        # Copies, so that popping 'plac' below works on independent frames.
        X_train = self.work_df[is_train].copy()
        X_test = self.work_df[~is_train].copy()

        y_train = (X_train.pop('plac')==1) * 1   # make plac=(0,1) instead of true/false (for catboost)
        y_test = (X_test.pop('plac')==1) * 1      # make plac=(0,1) instead of true/false (for catboost)

        # Tolerance check equivalent to "equal after rounding to two decimals",
        # but it also accepts test sizes such as 0.125 or 1/3 that can never
        # equal a two-decimal rounded value.
        andel_test = len(X_test)/(len(X_train)+len(X_test))
        assert abs(andel_test - test_size) <= 0.005 + 1e-12, f'{round(andel_test,2)} Test size is not correct'

        return X_train, X_test, y_train, y_test

    def load_df(self):
        """Read ``self.filnamn`` as CSV into ``self.df`` and return it."""
        self.df = pd.read_csv(self.filnamn)
        return self.df

    def save_df(self):
        """Write ``self.df`` (not ``work_df``) to ``self.filnamn`` without the index."""
        self.df.to_csv(self.filnamn, index=False)
    

In [220]:
# Build the data object (reads '../all_data.csv'), run the general clean-up on its
# working frame, then split it into train/test sets by date (datum).
v75_obj = v75(pref='../')
v75_obj.prepare_data()
X_train, X_test, y_train, y_test = v75_obj.train_test_split(test_size=0.2)


Antal train datum:  440
Antal test datum:  110
Antal datum totalt:  550


In [227]:
# Column names by dtype: int64/float64 columns vs. object (string) columns.
numeric_features = list(X_train.select_dtypes(include=['int64', 'float64']).columns)
cat_features = list(X_train.select_dtypes(include=['object']).columns)
print(numeric_features)
print(cat_features)

# Number of distinct values per categorical column of interest.
# Series.nunique() is used for every column so missing values are treated the
# same way everywhere (excluded); the earlier mix of len(unique()) and
# len(value_counts()) counted NaN for some columns but not for others.
for col in ['bana', 'kusk', 'häst',
            'h1_bana', 'h2_bana', 'h3_bana', 'h4_bana', 'h5_bana',
            'h1_kusk', 'h2_kusk', 'h3_kusk', 'h4_kusk', 'h5_kusk']:
    print(col, X_train[col].nunique())


['avd', 'streck', 'kr', 'spår', 'dist', 'lopp_dist', 'start', 'ålder', 'pris', 'h1_spår', 'h1_plac', 'h1_pris', 'h1_odds', 'h1_kmtid', 'h2_spår', 'h2_plac', 'h2_pris', 'h2_odds', 'h2_kmtid', 'h3_spår', 'h3_plac', 'h3_pris', 'h3_odds', 'h3_kmtid', 'h4_spår', 'h4_plac', 'h4_pris', 'h4_odds', 'h4_kmtid', 'h5_spår', 'h5_plac', 'h5_pris', 'h5_odds', 'h5_kmtid', 'h1_dist', 'h2_dist', 'h3_dist', 'h4_dist', 'h5_dist', 'h1_auto', 'h2_auto', 'h3_auto', 'h4_auto', 'h5_auto', 'h1_perf', 'h2_perf', 'h3_perf', 'h4_perf', 'h5_perf', 'senast', 'delta1', 'delta2', 'delta3', 'delta4']
['datum', 'bana', 'häst', 'kusk', 'kön', 'h1_kusk', 'h1_bana', 'h2_kusk', 'h2_bana', 'h3_kusk', 'h3_bana', 'h4_kusk', 'h4_bana', 'h5_kusk', 'h5_bana']
bana 32
kusk 1032
häst 8744
h1_bana 119
h2_bana 119
h3_bana 122
h4_bana 125
h5_bana 136
h1_kusk 1524
h2_kusk 1594
h3_kusk 1665
h4_kusk 1771
h5_kusk 1820


In [231]:
from category_encoders import TargetEncoder

# Target-encode the categorical columns listed below.
# NOTE(review): 'häst' and 'datum' are object columns too but are not in this
# list, so they stay un-encoded - confirm that this is intentional.
target_encoded_cols = ['bana', 'kusk', 'kön',
                       'h1_kusk', 'h2_kusk', 'h3_kusk', 'h4_kusk', 'h5_kusk',
                       'h1_bana', 'h2_bana', 'h3_bana', 'h4_bana', 'h5_bana']
enc = TargetEncoder(cols=target_encoded_cols)

# Fit once, on the training data only, then apply the fitted encoding to the
# test set. (fit_transform already fits, so a separate .fit() call is not needed.)
# NOTE(review): X_train / X_test are overwritten here, so re-running this cell
# without re-running the split cell would encode already-encoded values.
X_train = enc.fit_transform(X_train, y_train)
X_test = enc.transform(X_test)
# X_train.info()

# Number of distinct values per column after encoding, counted the same way
# (Series.nunique()) for every column.
for col in ['bana', 'kusk', 'häst',
            'h1_bana', 'h2_bana', 'h3_bana', 'h4_bana', 'h5_bana',
            'h1_kusk', 'h2_kusk', 'h3_kusk', 'h4_kusk', 'h5_kusk']:
    print(col, X_train[col].nunique())




bana 31
kusk 188
häst 8744
h1_bana 87
h2_bana 93
h3_bana 91
h4_bana 96
h5_bana 95
h1_kusk 242
h2_kusk 249
h3_kusk 259
h4_kusk 253
h5_kusk 257
