#### Importing required libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
pd.set_option('display.max_columns', None)
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold,KFold

#### Loading the data

In [2]:
train = pd.read_csv("Train.csv")
test  = pd.read_csv("Test.csv")
sub = pd.read_excel("Sample_submission.xlsx")

In [3]:
train.head()

Unnamed: 0,Team1,Team2,Stadium,HostCountry,Team1_Venue,Team2_Venue,Team1_Innings,Team2_Innings,MonthOfMatch,MatchWinner
0,5,4,37,4,Home,Away,Second,First,Dec,4
1,1,14,84,7,Neutral,Neutral,First,Second,Sep,1
2,9,15,47,9,Home,Away,First,Second,Feb,9
3,7,2,102,6,Home,Away,First,Second,Aug,2
4,6,8,46,5,Home,Away,First,Second,Aug,6


In [4]:
test.head()

Unnamed: 0,Team1,Team2,Stadium,HostCountry,Team1_Venue,Team2_Venue,Team1_Innings,Team2_Innings,MonthOfMatch
0,2,4,34,1,Home,Away,First,Second,Oct
1,14,1,19,15,Home,Away,First,Second,Mar
2,9,10,130,14,Neutral,Neutral,Second,First,Dec
3,9,10,8,9,Home,Away,First,Second,Dec
4,5,15,130,14,Neutral,Neutral,First,Second,Oct


In [5]:
train1=train.copy()

#### First important learning from this hackathon is to convert this multi class classification to binary classification. Because predicting for other teams when team-1 is playing against team-2 doesn't make any sense. This helped me to get average cross validation score 0.61 from 0.68

In [6]:
train.loc[train.MatchWinner == train.Team1 ,'MatchWinner']=0
train.loc[train.MatchWinner == train.Team2,'MatchWinner']=1

#### Concating data

In [7]:
train['train_or_test']='train'
test['train_or_test']='test'
df=pd.concat([train,test])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


In [8]:
train1['MatchWinner'].value_counts()

1     379
5     318
10    316
14    262
13    246
12    245
4     237
9     223
15     87
2      70
6      33
0      27
7      24
11     18
8      15
3       8
Name: MatchWinner, dtype: int64

#### Created Team Rank based on Team Winning Numbers.

In [9]:
data = {'Team1':[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15], 'Team1_Rank':['Rank_12','Rank_1','Rank_10','Rank_16','Rank_7','Rank_2','Rank_11','Rank_13','Rank_15','Rank_8','Rank_3','Rank_14','Rank_6','Rank_5','Rank_4','Rank_9']} 
team1rank=pd.DataFrame(data)
data = {'Team2':[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15], 'Team2_Rank':['Rank_12','Rank_1','Rank_10','Rank_16','Rank_7','Rank_2','Rank_11','Rank_13','Rank_15','Rank_8','Rank_3','Rank_14','Rank_6','Rank_5','Rank_4','Rank_9']} 
team2rank=pd.DataFrame(data)

In [10]:
df=df.merge(team1rank,on='Team1',how='left')
df=df.merge(team2rank,on='Team2',how='left')

#### Got this useful code snippet from :https://www.kaggle.com/willkoehrsen/introduction-to-manual-feature-engineering ,which calculates aggregate features based on numerical columns

In [11]:
def agg_numeric(df, parent_var, df_name):
    """
    Groups and aggregates the numeric values in a child dataframe
    by the parent variable.
    
    Parameters
    --------
        df (dataframe): 
            the child dataframe to calculate the statistics on
        parent_var (string): 
            the parent variable used for grouping and aggregating
        df_name (string): 
            the variable used to rename the columns
        
    Return
    --------
        agg (dataframe): 
            a dataframe with the statistics aggregated by the `parent_var` for 
            all numeric columns. Each observation of the parent variable will have 
            one row in the dataframe with the parent variable as the index. 
            The columns are also renamed using the `df_name`. Columns with all duplicate
            values are removed. 
    
    """
    
            
    # Only want the numeric variables
    parent_ids = df[parent_var].copy()
    numeric_df = df.select_dtypes('number').drop(columns={'MatchWinner'}).copy()
    numeric_df[parent_var] = parent_ids

    # Group by the specified variable and calculate the statistics
    agg = numeric_df.groupby(parent_var).agg(['count', 'mean', 'max', 'min', 'sum'])

    # Need to create new column names
    columns = []

    # Iterate through the variables names
    for var in agg.columns.levels[0]:
        if var != parent_var:
            # Iterate through the stat names
            for stat in agg.columns.levels[1]:
                # Make a new column name for the variable and stat
                columns.append('%s_%s_%s' % (df_name, var, stat))
    
    agg.columns = columns
    
    # Remove the columns with all redundant values
    _, idx = np.unique(agg, axis = 1, return_index=True)
    agg = agg.iloc[:, idx]
    
    return agg

#### Fetaure selection technique is recomended after this. I had no time to work on itensive feature selection.All other features are tested individually with validation set .Removing/Adding

In [12]:
previous_agg = agg_numeric(df, 'Team1', 'previous')
print('Previous aggregation shape: ', previous_agg.shape)
df=df.merge(previous_agg, on ='Team1', how = 'left')

Previous aggregation shape:  (16, 13)


In [13]:
previous_agg = agg_numeric(df, 'Team2', 'previous')
print('Previous aggregation shape: ', previous_agg.shape)
df=df.merge(previous_agg, on ='Team2', how = 'left')

Previous aggregation shape:  (16, 63)


#### This generates aggregate features on categorical column.

In [14]:
def agg_categorical(df, parent_var, df_name):
    """
    Aggregates the categorical features in a child dataframe
    for each observation of the parent variable.
    
    Parameters
    --------
    df : dataframe 
        The dataframe to calculate the value counts for.
        
    parent_var : string
        The variable by which to group and aggregate the dataframe. For each unique
        value of this variable, the final dataframe will have one row
        
    df_name : string
        Variable added to the front of column names to keep track of columns

    
    Return
    --------
    categorical : dataframe
        A dataframe with aggregated statistics for each observation of the parent_var
        The columns are also renamed and columns with duplicate values are removed.
        
    """
    
    # Select the categorical columns
    categorical = pd.get_dummies(df.select_dtypes('object'))

    # Make sure to put the identifying id on the column
    categorical[parent_var] = df[parent_var]

    # Groupby the group var and calculate the sum and mean
    categorical = categorical.groupby(parent_var).agg(['sum', 'count', 'mean'])
    
    column_names = []
    
    # Iterate through the columns in level 0
    for var in categorical.columns.levels[0]:
        # Iterate through the stats in level 1
        for stat in ['sum', 'count', 'mean']:
            # Make a new column name
            column_names.append('%s_%s_%s' % (df_name, var, stat))
    
    categorical.columns = column_names
    
    # Remove duplicate columns by values
    _, idx = np.unique(categorical, axis = 1, return_index = True)
    categorical = categorical.iloc[:, idx]
    
    return categorical

#### Feature selection technique is recomended after this

In [15]:
previous_counts = agg_categorical(df, 'Team1', 'previous')
print('Previous counts shape: ', previous_counts.shape)
previous_counts.head()
df=df.merge(previous_counts, on ='Team1', how = 'left')

previous_counts = agg_categorical(df, 'Team2', 'previous')
print('Previous counts shape: ', previous_counts.shape)
previous_counts.head()
df=df.merge(previous_counts, on ='Team2', how = 'left')


Previous counts shape:  (16, 102)
Previous counts shape:  (16, 102)


#### Coverting numerical columns as str to use it for inetraction features. LGBM works well with interaction features.
#### Useful feature engineering tips from the winner:https://www.kaggle.com/c/ieee-fraud-detection/discussion/108575

In [16]:
df['Team1']=df['Team1'].astype(str)
df['Team2']=df['Team2'].astype(str)
df['Stadium']=df['Stadium'].astype(str)
df['HostCountry']=df['HostCountry'].astype(str)

In [17]:
interaction1=df['Team1']+'_'+df['Team2']
interaction2=df['Team1']+'_'+df['HostCountry']

interaction3=df['Team2']+'_'+df['Team1_Venue']
interaction4=df['Team2']+'_'+df['Team2_Venue']

df['T1_T2']=interaction1
df['T1_H']=interaction2
df['T2_T1V']=interaction3
df['T2_T2V']=interaction4

In [18]:
df.columns

Index(['HostCountry', 'MatchWinner', 'MonthOfMatch', 'Stadium', 'Team1',
       'Team1_Innings', 'Team1_Venue', 'Team2', 'Team2_Innings', 'Team2_Venue',
       ...
       'previous_Team1_Innings_First_sum_y',
       'previous_Team1_Innings_Second_sum_y',
       'previous_train_or_test_train_sum_y',
       'previous_Team2_Rank_Rank_12_sum_y', 'previous_Team1_Venue_Home_sum_y',
       'previous_MonthOfMatch_Apr_count_y', 'T1_T2', 'T1_H', 'T2_T1V',
       'T2_T2V'],
      dtype='object', length=297)

#### Label Encoding

In [19]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

for col in ['Team1_Innings','Team2_Innings', 'T1_T2','T2_T1V','T2_T2V']:
    df[col]=  df[col].astype('str')
    df[col]= le.fit_transform(df[col])    

#### Mapping

In [20]:
venue_dict = {"Home":2,"Away":0,"Neutral":1}

df["Team1_Venue"] = df["Team1_Venue"].map(venue_dict)
df["Team2_Venue"] = df["Team2_Venue"].map(venue_dict)


#### Frequency Encoding

In [21]:
fe_pol = (df.groupby('Team1_Rank').size()) / len(df)
df['Team1_Rank'] = df['Team1_Rank'].apply(lambda x : fe_pol[x])

In [22]:
fe_pol = (df.groupby('Team2_Rank').size()) / len(df)
df['Team2_Rank'] = df['Team2_Rank'].apply(lambda x : fe_pol[x])

In [23]:
fe_pol = (df.groupby('MonthOfMatch').size()) / len(df)
df['MonthOfMatch'] = df['MonthOfMatch'].apply(lambda x : fe_pol[x])

In [24]:
fe_pol = (df.groupby('Team1').size()) / len(df)
df['Team1_fe'] = df['Team1'].apply(lambda x : fe_pol[x])

In [25]:
fe_pol = (df.groupby('Team2').size()) / len(df)
df['Team2_fe'] = df['Team2'].apply(lambda x : fe_pol[x])

In [26]:
fe_pol = (df.groupby('HostCountry').size()) / len(df)
df['HostCountry_fe'] = df['HostCountry'].apply(lambda x : fe_pol[x])

In [27]:
fe_pol = (df.groupby('T1_H').size()) / len(df)
df['T1_H'] = df['T1_H'].apply(lambda x : fe_pol[x])

In [28]:
fe_pol = (df.groupby('T2_T2V').size()) / len(df)
df['T2_T2V_fe'] = df['T2_T2V'].apply(lambda x : fe_pol[x])

In [29]:
df['Team1']=df['Team1'].astype(int)
df['Team2']=df['Team2'].astype(int)
df['Stadium']=df['Stadium'].astype(int)
df['HostCountry']=df['HostCountry'].astype(int)

#### Group by features 

In [30]:
df['Unique_Stadium_team1']=df.groupby(['Team1'])['Stadium'].transform('nunique')
df['Unique_Stadium_team2']=df.groupby(['Team2'])['Stadium'].transform('nunique')

df['Team1_unique_Team2']=df.groupby(['Team1'])['Team2'].transform('nunique')
df['Team2_unique_Team1']=df.groupby(['Team2'])['Team1'].transform('nunique')

df['Unique_Stadium_team1_team2']=df.groupby(['Team1','Team2'])['Stadium'].transform('nunique')

#Noteuseful
#df['Unique_Stadium_team1_team2']=df.groupby(['Team1'])['MonthOfMatch'].transform('nunique')
#df['Unique_Country_team1']=df.groupby(['Team1'])['HostCountry'].transform('nunique')
#df['Unique_Country_team2']=df.groupby(['Team2'])['HostCountry'].transform('nunique')

#### Based on the number of Unique stadium a team played ,Genearted mapping as below. Generally Experienced team would have played in many stadium and new team would have  played in less stadium

In [31]:
df['Unique_Stadium_team1'].value_counts()

67    551
78    513
53    370
55    359
45    359
38    262
33    259
28    246
35    205
3     188
21    115
17     61
14     36
8      32
11     27
Name: Unique_Stadium_team1, dtype: int64

In [32]:
map={3:'VeryNewTeam',8:'NewTeam',11:'NewTeam',14:'NewTeam',17:'NewTeam',21:'Moderate',28:'Moderate',33:'Moderate',35:'Moderate',38:'Moderate',45:'Experienced',53:'Experienced',55:'Experienced',67:'HighlyExperienced',78:'HighlyExperienced'}

In [33]:
df['Team1_Experience']=df['Unique_Stadium_team1'].map(map)

In [34]:
df['Unique_Stadium_team2'].value_counts()

87    477
92    475
81    467
67    335
88    315
76    297
71    287
70    281
89    277
49    116
21     92
41     68
17     61
24     35
Name: Unique_Stadium_team2, dtype: int64

In [35]:
map={17:'NewTeamT2',21:'NewTeamT2',24:'NewTeamT2',41:'ModerateT2',49:'ExperiencedT2',67:'HighlyExperiencedT2',70:'HighlyExperiencedT2',71:'HighlyExperiencedT2',76:'HighlyExperiencedT2',81:'HighlyExperiencedT2',87:'HighlyExperiencedT2',88:'HighlyExperiencedT2',89:'HighlyExperiencedT2',92:'HighlyExperiencedT2'}

In [36]:
df['Team2_Experience']=df['Unique_Stadium_team2'].map(map)

#### OHE above 2 features

In [37]:
for col in ['Team1_Experience','Team2_Experience']:
    df = pd.get_dummies(df, columns=[col])

#### Getting back train and test data

In [38]:
train=df.loc[df.train_or_test.isin(['train'])]
test=df.loc[df.train_or_test.isin(['test'])]
train.drop(columns={'train_or_test'},axis=1,inplace=True)
test.drop(columns={'train_or_test'},axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


#### All the below fetaures are target based features, which i thought was working well initially. One has to be careful while generating target based Feature.
#### This was the most importatnt learning from this hackathon. One has to validate this with validation set ,by setting validating target as NA ,macthing to the test data.
#### Since validation data already knows the actual target ,  features  generated will be based on its own target and your model will predict good and so the accuracy increases.
#### On the other side ,test set doesnt have the target feature ,and target based feature is from the aggregate of trian data.
#### When i set validation target as NA,I got the clear picture that cze of data leakage these features were performing well, so i ignored all the below target based features[This validation is not shown in the code]

#### [Learning] https://www.kaggle.com/abhilashawasthi/feature-engineering-lgb-model - this is a time series problem where the author fills validation target as NA before generating target based feature

In [39]:
#gdf = train.groupby(["Team1","Team2"])["MatchWinner"].count().reset_index()
#gdf.columns = ["Team1", "Team2","Team1_against_Team2"]
#train = train.merge(gdf, on=["Team1","Team2"], how="left")
#test = test.merge(gdf, on=["Team1","Team2"], how="left")

In [40]:
#train['Team1_won']=0
#for i in range(len(train)):
    #if train.loc[i,'Team1']==train.loc[i,'MatchWinner']:
        #train.loc[i,'Team1_won']=1
    

In [41]:
##P.S: Name convention are #not proper

In [42]:
# Team1win = train.groupby(['Team1', 'Team2','Stadium']).agg({'Team1_won': ['mean']})
# Team1win.columns = ['grpd_by_Product_Brand_Day_' + '_'.join(c).strip('_') for c in Team1win.columns] 
# train = train.merge(Team1win, on=["Team1","Team2","Stadium"], how="left")
# test = test.merge(Team1win, on=["Team1","Team2","Stadium"], how="left")

In [43]:
# Team1win = train.groupby(['Team1', 'Team2','HostCountry']).agg({'Team1_won': ['mean']})
# Team1win.columns = ['grpd_by_team1_' + '_'.join(c).strip('_') for c in Team1win.columns]
# train = train.merge(Team1win, on=["Team1","Team2",'HostCountry'], how="left")
# test = test.merge(Team1win, on=["Team1","Team2",'HostCountry'], how="left")

In [44]:
# Team1win = train.groupby(['Team1', 'Team2','Team1_Innings']).agg({'Team1_won': ['mean']})
# Team1win.columns = ['grpd_by_team1_venue' + '_'.join(c).strip('_') for c in Team1win.columns]
# #train = train.merge(Team1win, on=["Team1","Team2",'Team1_Innings'], how="left")
# #test = test.merge(Team1win, on=["Team1","Team2",'Team1_Innings'], how="left")

In [45]:
# Team1win = train.groupby(['Team1', 'Team2','MonthOfMatch']).agg({'Team1_won': ['mean']})
# Team1win.columns = ['grpd_by_team1_month' + '_'.join(c).strip('_') for c in Team1win.columns]
# train = train.merge(Team1win, on=["Team1","Team2",'MonthOfMatch'], how="left")
# test = test.merge(Team1win, on=["Team1","Team2",'MonthOfMatch'], how="left")

In [46]:
#initially since numeric feat were overfitting ,tried to map it as categorical col

In [47]:
# train.loc[ train['grpd_by_team1_monthTeam1_won_mean'] == 0.0, 'Team1_Winning_against_Team2_month'] = 'Very_Less_Chance'
# train.loc[(train['grpd_by_team1_monthTeam1_won_mean'] > 0.0) & (train['grpd_by_team1_monthTeam1_won_mean'] <0.50), 'Team1_Winning_against_Team2_month'] = 'less_chance'
# train.loc[train['grpd_by_team1_monthTeam1_won_mean'] == 0.50, 'Team1_Winning_against_Team2_month'] = 'Equal_Chance'
# train.loc[(train['grpd_by_team1_monthTeam1_won_mean'] > 0.50) & (train['grpd_by_team1_monthTeam1_won_mean'] <= 0.99), 'Team1_Winning_against_Team2_month'] = 'Good_Chance'
# train.loc[train['grpd_by_team1_monthTeam1_won_mean'] ==1.0, 'Team1_Winning_against_Team2_month'] = 'High_Chance'

In [48]:
# test.loc[ test['grpd_by_team1_monthTeam1_won_mean'] == 0.0, 'Team1_Winning_against_Team2_month'] = 'Very_Less_Chance'
# test.loc[(test['grpd_by_team1_monthTeam1_won_mean'] > 0.0) & (test['grpd_by_team1_monthTeam1_won_mean'] <0.50), 'Team1_Winning_against_Team2_month'] = 'less_chance'
# test.loc[test['grpd_by_team1_monthTeam1_won_mean'] == 0.50, 'Team1_Winning_against_Team2_month'] = 'Equal_Chance'
# test.loc[(test['grpd_by_team1_monthTeam1_won_mean'] > 0.50) & (test['grpd_by_team1_monthTeam1_won_mean'] <= 0.99), 'Team1_Winning_against_Team2_month'] = 'Good_Chance'
# test.loc[test['grpd_by_team1_monthTeam1_won_mean'] ==1.0, 'Team1_Winning_against_Team2_month'] = 'High_Chance'

In [49]:
# test['Team1_Winning_against_Team2_month'].fillna(-99,inplace=True)
# fe_pol = (train.groupby('Team1_Winning_against_Team2_month').size()) / len(train)
# train['Team1_Winning_against_Team2_month'] = train['Team1_Winning_against_Team2_month'].apply(lambda x : fe_pol[x])
# fe_pol = (test.groupby('Team1_Winning_against_Team2_month').size()) / len(test)
# test['Team1_Winning_against_Team2_month'] = test['Team1_Winning_against_Team2_month'].apply(lambda x : fe_pol[x])

In [50]:
# train.loc[ train['grpd_by_team1_venueTeam1_won_mean'] == 0.0, 'Team1_Winning_against_Team2_venue'] = 'Very_Less_Chance'
# train.loc[(train['grpd_by_team1_venueTeam1_won_mean'] > 0.0) & (train['grpd_by_team1_venueTeam1_won_mean'] <0.50), 'Team1_Winning_against_Team2_venue'] = 'less_chance'
# train.loc[train['grpd_by_team1_venueTeam1_won_mean'] == 0.50, 'Team1_Winning_against_Team2_venue'] = 'Equal_Chance'
# train.loc[(train['grpd_by_team1_venueTeam1_won_mean'] > 0.50) & (train['grpd_by_team1_venueTeam1_won_mean'] <= 0.99), 'Team1_Winning_against_Team2_venue'] = 'Good_Chance'
# train.loc[train['grpd_by_team1_venueTeam1_won_mean'] ==1.0, 'Team1_Winning_against_Team2_venue'] = 'High_Chance'

In [51]:
# test.loc[ test['grpd_by_team1_venueTeam1_won_mean'] == 0.0, 'Team1_Winning_against_Team2_venue'] = 'Very_Less_Chance'
# test.loc[(test['grpd_by_team1_venueTeam1_won_mean'] > 0.0) & (test['grpd_by_team1_venueTeam1_won_mean'] <0.50), 'Team1_Winning_against_Team2_venue'] = 'less_chance'
# test.loc[test['grpd_by_team1_venueTeam1_won_mean'] == 0.50, 'Team1_Winning_against_Team2_venue'] = 'Equal_Chance'
# test.loc[(test['grpd_by_team1_venueTeam1_won_mean'] > 0.50) & (test['grpd_by_team1_venueTeam1_won_mean'] <= 0.99), 'Team1_Winning_against_Team2_venue'] = 'Good_Chance'
# test.loc[test['grpd_by_team1_venueTeam1_won_mean'] ==1.0, 'Team1_Winning_against_Team2_venue'] = 'High_Chance'

In [52]:
# test['Team1_Winning_against_Team2_venue'].fillna(-99,inplace=True)
# fe_pol = (train.groupby('Team1_Winning_against_Team2_venue').size()) / len(train)
# train['Team1_Winning_against_Team2_venue'] = train['Team1_Winning_against_Team2_venue'].apply(lambda x : fe_pol[x])
# fe_pol = (test.groupby('Team1_Winning_against_Team2_venue').size()) / len(test)
# test['Team1_Winning_against_Team2_venue'] = test['Team1_Winning_against_Team2_venue'].apply(lambda x : fe_pol[x])

In [53]:
# train.loc[ train['grpd_by_team1_Team1_won_mean'] == 0.0, 'Team1_Winning_against_Team2'] = 'Very_Less_Chance'
# train.loc[(train['grpd_by_team1_Team1_won_mean'] > 0.0) & (train['grpd_by_team1_Team1_won_mean'] <0.50), 'Team1_Winning_against_Team2'] = 'less_chance'
# train.loc[train['grpd_by_team1_Team1_won_mean'] == 0.50, 'Team1_Winning_against_Team2'] = 'Equal_Chance'
# train.loc[(train['grpd_by_team1_Team1_won_mean'] > 0.50) & (train['grpd_by_team1_Team1_won_mean'] <= 0.99), 'Team1_Winning_against_Team2'] = 'Good_Chance'
# train.loc[train['grpd_by_team1_Team1_won_mean'] ==1.0, 'Team1_Winning_against_Team2'] = 'High_Chance'

In [54]:
# test.loc[ test['grpd_by_team1_Team1_won_mean'] == 0.0, 'Team1_Winning_against_Team2'] = 'Very_Less_Chance'
# test.loc[(test['grpd_by_team1_Team1_won_mean'] > 0.0) & (test['grpd_by_team1_Team1_won_mean'] <0.50), 'Team1_Winning_against_Team2'] = 'less_chance'
# test.loc[test['grpd_by_team1_Team1_won_mean'] == 0.50, 'Team1_Winning_against_Team2'] = 'Equal_Chance'
# test.loc[(test['grpd_by_team1_Team1_won_mean'] > 0.50) & (test['grpd_by_team1_Team1_won_mean'] <= 0.99), 'Team1_Winning_against_Team2'] = 'Good_Chance'
# test.loc[test['grpd_by_team1_Team1_won_mean'] ==1.0, 'Team1_Winning_against_Team2'] = 'High_Chance'

In [55]:
# test['Team1_Winning_against_Team2'].fillna(-99,inplace=True)
# fe_pol = (train.groupby('Team1_Winning_against_Team2').size()) / len(train)
# train['Team1_Winning_against_Team2'] = train['Team1_Winning_against_Team2'].apply(lambda x : fe_pol[x])
# fe_pol = (test.groupby('Team1_Winning_against_Team2').size()) / len(test)
# test['Team1_Winning_against_Team2'] = test['Team1_Winning_against_Team2'].apply(lambda x : fe_pol[x])

In [56]:
# train.loc[ train['grpd_by_Product_Brand_Day_Team1_won_mean'] == 0.0, 'Team1_Winning_against_Team2_in_Stadium'] = 'Very_Less_Chance'
# train.loc[(train['grpd_by_Product_Brand_Day_Team1_won_mean'] > 0.0) & (train['grpd_by_Product_Brand_Day_Team1_won_mean'] <0.50), 'Team1_Winning_against_Team2_in_Stadium'] = 'less_chance'
# train.loc[train['grpd_by_Product_Brand_Day_Team1_won_mean'] == 0.50, 'Team1_Winning_against_Team2_in_Stadium'] = 'Equal_Chance'
# train.loc[(train['grpd_by_Product_Brand_Day_Team1_won_mean'] > 0.50) & (train['grpd_by_Product_Brand_Day_Team1_won_mean'] <= 0.99), 'Team1_Winning_against_Team2_in_Stadium'] = 'Good_Chance'
# train.loc[train['grpd_by_Product_Brand_Day_Team1_won_mean'] ==1.0, 'Team1_Winning_against_Team2_in_Stadium'] = 'High_Chance'

In [57]:
# test.loc[ test['grpd_by_Product_Brand_Day_Team1_won_mean'] == 0.0, 'Team1_Winning_against_Team2_in_Stadium'] = 'Very_Less_Chance'
# test.loc[(test['grpd_by_Product_Brand_Day_Team1_won_mean'] > 0.0) & (test['grpd_by_Product_Brand_Day_Team1_won_mean'] <0.50), 'Team1_Winning_against_Team2_in_Stadium'] = 'less_chance'
# test.loc[test['grpd_by_Product_Brand_Day_Team1_won_mean'] == 0.50, 'Team1_Winning_against_Team2_in_Stadium'] = 'Equal_Chance'
# test.loc[(test['grpd_by_Product_Brand_Day_Team1_won_mean'] > 0.50) & (test['grpd_by_Product_Brand_Day_Team1_won_mean'] <= 0.99), 'Team1_Winning_against_Team2_in_Stadium'] = 'Good_Chance'
# test.loc[test['grpd_by_Product_Brand_Day_Team1_won_mean'] ==1.0, 'Team1_Winning_against_Team2_in_Stadium'] = 'High_Chance'

In [58]:
# test['Team1_Winning_against_Team2_in_Stadium'].fillna(-99,inplace=True)

In [59]:
# fe_pol = (train.groupby('Team1_Winning_against_Team2_in_Stadium').size()) / len(train)
# train['Team1_Winning_against_Team2_in_Stadium'] = train['Team1_Winning_against_Team2_in_Stadium'].apply(lambda x : fe_pol[x])

In [60]:
# fe_pol = (test.groupby('Team1_Winning_against_Team2_in_Stadium').size()) / len(test)
# test['Team1_Winning_against_Team2_in_Stadium'] = test['Team1_Winning_against_Team2_in_Stadium'].apply(lambda x : fe_pol[x])

In [61]:
# Team1winstadium = train.groupby(['Team1', 'Team2','Stadium']).agg({'Team1_won': ['mean']})
# Team1winstadium.columns = ['grpd_by_Product_Brand_Day_' + '_'.join(c).strip('_') for c in Team1win.columns]
# train = train.merge(Team1winstadium, on=["Team1","Team2",'Stadium'], how="left")
# test = test.merge(Team1winstadium, on=["Team1","Team2",'Stadium'], how="left")

In [62]:
#del train['Team1_won']

In [63]:
x=train.drop(columns={'MatchWinner',#'T1_T2'
                     },axis=1)
y=train.loc[:,['MatchWinner']]
test=test.drop(columns={'MatchWinner',#'T1_T2'
                       },axis=1)
testmain=test.copy()

In [64]:
from sklearn.metrics import log_loss

err = [] 
y_pred_tot_lgb = []
l=[]


fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)
i = 1

for train_index, test_index in fold.split(x, y):
    x_train, x_val = x.iloc[train_index], x.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    m = LGBMClassifier(
                       max_depth=2,
                       num_leaves=5,
                       learning_rate=0.08,
                       n_estimators=3000,
                       min_data_in_leaf = 95,
                       feature_fraction = 0.05,
                       min_child_weight=5,
                       min_child_samples=15,
                       colsample_bytree=0.15,
                       reg_alpha=0.5,
                       reg_lambda=2,
                       random_state=2020
                       )
    m.fit(x_train, y_train,eval_set=[(x_train,y_train),(x_val, y_val)],early_stopping_rounds=100, verbose=200)
    pred_y = m.predict_proba(x_val)
    pred_y=np.clip(pred_y,0.025,0.975)
    feature_importances = pd.DataFrame(m.feature_importances_,
                                       index = x.columns,
                                        columns=['importance'])
    sum=feature_importances.values
    l.append(sum)
    print(i, " err_lgm: ", log_loss(y_val,pred_y))
    err.append(log_loss(y_val,pred_y))
    pred_test = m.predict_proba(test)
    i = i + 1
    y_pred_tot_lgb.append(pred_test)
(err[0]+err[1]+err[2]+err[3]+err[4])/5 

  return f(**kwargs)


Training until validation scores don't improve for 100 rounds
[200]	training's binary_logloss: 0.575725	valid_1's binary_logloss: 0.598383
Early stopping, best iteration is:
[210]	training's binary_logloss: 0.574664	valid_1's binary_logloss: 0.597642
1  err_lgm:  0.5976420505710122


  return f(**kwargs)


Training until validation scores don't improve for 100 rounds
[200]	training's binary_logloss: 0.577205	valid_1's binary_logloss: 0.588539
Early stopping, best iteration is:
[242]	training's binary_logloss: 0.57398	valid_1's binary_logloss: 0.58779
2  err_lgm:  0.5877902128186612


  return f(**kwargs)


Training until validation scores don't improve for 100 rounds
[200]	training's binary_logloss: 0.574764	valid_1's binary_logloss: 0.597979
[400]	training's binary_logloss: 0.562449	valid_1's binary_logloss: 0.599005
Early stopping, best iteration is:
[310]	training's binary_logloss: 0.567589	valid_1's binary_logloss: 0.597026
3  err_lgm:  0.597026090352419


  return f(**kwargs)


Training until validation scores don't improve for 100 rounds
[200]	training's binary_logloss: 0.574061	valid_1's binary_logloss: 0.60403
[400]	training's binary_logloss: 0.561903	valid_1's binary_logloss: 0.599476
Early stopping, best iteration is:
[445]	training's binary_logloss: 0.560118	valid_1's binary_logloss: 0.599021
4  err_lgm:  0.5990227103811417


  return f(**kwargs)


Training until validation scores don't improve for 100 rounds
[200]	training's binary_logloss: 0.574895	valid_1's binary_logloss: 0.612228
Early stopping, best iteration is:
[266]	training's binary_logloss: 0.569948	valid_1's binary_logloss: 0.610585
5  err_lgm:  0.6105849158938431


0.5984131960034154

#### Finalmodel

In [65]:
m = LGBMClassifier(max_depth=2,
                   num_leaves=5,
                   learning_rate=0.08,
                   n_estimators=300,
                   min_data_in_leaf = 95,
                   feature_fraction = 0.05,
                   min_child_weight=5,
                   min_child_samples=15,
                   colsample_bytree=0.15,
                   reg_alpha=0.5,
                   reg_lambda=2,
                   random_state=2020)
m.fit(x,y)
pred =m.predict_proba(test)

  return f(**kwargs)


In [66]:
pred=pd.DataFrame(pred)

In [67]:
pred=pred.clip(0.025,0.975) #https://medium.com/@egor_vorobiev/how-to-improve-log-loss-score-kaggle-trick-3f95577839f1

In [68]:
sub=sub.clip(0,0)

In [69]:
test1=test.loc[:,['Team1','Team2']]
test1.reset_index(drop=True,inplace=True)
test1=test1.join(pred)

In [70]:
for i in range(len(test1)):
    pos=test1.loc[i,'Team1']
    pos1=test1.loc[i,'Team2']
    sub.loc[i,pos]=test1.loc[i,0]
    sub.loc[i,pos1]=test1.loc[i,1]

In [71]:
sub.to_excel('lgbmbinary.xlsx',index=False)