In [358]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

## Load Data

In [359]:
# Load every raw table from the local `data/` directory into its own DataFrame.
# Only account.csv is read with an explicit ISO-8859-1 encoding; the other
# files use the pandas default.
account = pd.read_csv('data/account.csv', encoding='ISO-8859-1')
concerts_1415=pd.read_csv('data/concerts_2014-15.csv')
concerts=pd.read_csv('data/concerts.csv')
sample_submission=pd.read_csv('data/sample_submission.csv')
subscriptions=pd.read_csv('data/subscriptions.csv')
test=pd.read_csv('data/test.csv')
tickets_all=pd.read_csv('data/tickets_all.csv')
train=pd.read_csv('data/train.csv')
zipcodes=pd.read_csv('data/zipcodes.csv')

#### Merge Subscription and Construct Train Data

In [360]:
# Handle subscription data
# Names of the per-account subscription aggregates, in output column order.
# Used to zero-fill accounts that have no subscription rows.
_SUBSCRIPTION_FEATURES = (
    'total_season',
    'full_package', 'quartet_package', 'trio_package', 'cyo_package', 'full_upgrade_package',
    'total_seats_sub',
    'location_num_sub',
    'section_type_numbers', 'premium_orchestra_number', 'orchestra_number', 'balcony_number',
    'dress_circle_number', 'Gallery_number', 'Box_number', 'floor_number',
    'mean_price_level',
    'non_price_level_subscription',
    'mean_subscription_tier', 'multiple_subs_number',
)


def _subscription_aggregates(group):
    """Aggregate one account's subscription rows into a dict of scalar features."""
    package = group['package']
    section = group['section']
    known_price_levels = group['price.level'].dropna()

    return {
        # Total season: one merged row per subscribed season
        'total_season': group.shape[0],

        # Package features
        'full_package': (package == 'Full').sum(),
        'quartet_package': package.isin(['Quartet', 'Quartet A', 'Quartet B', 'Quartet CC']).sum(),
        'trio_package': package.isin(['Trio', 'Trio A', 'Trio B']).sum(),
        'cyo_package': (package == 'CYO').sum(),
        'full_upgrade_package': (package == 'Full upgrade').sum(),

        # Seat features
        'total_seats_sub': group['no.seats'].sum(),

        # Location features
        'location_num_sub': group['location'].nunique(),

        # Section features
        'section_type_numbers': section.nunique(),
        'premium_orchestra_number': (section == 'Premium Orchestra').sum(),
        'orchestra_number': section.isin(['Orchestra', 'Orchestra Front', 'Orchestra Rear']).sum(),
        'balcony_number': section.isin(['Balcony Front', 'Balcony Rear', 'Balcony', 'Santa Rosa']).sum(),
        'dress_circle_number': (section == 'Dress Circle').sum(),
        'Gallery_number': (section == 'Gallery').sum(),
        'Box_number': section.isin(['Box', 'Box House Left', 'Box House Right']).sum(),
        'floor_number': (section == 'Floor').sum(),

        # Price features: mean over the rows that have a price level,
        # or -1 when none of the account's rows has one.
        'mean_price_level': -1 if len(known_price_levels) == 0 else known_price_levels.mean(),

        # Number of subscriptions in the three listed seasons (the original
        # author notes these seasons carry no price-level data).
        'non_price_level_subscription': group['season'].isin(['2002-2003', '2003-2004', '2004-2005']).sum(),

        # Subscription tier features
        'mean_subscription_tier': group['subscription_tier'].mean(),
        'multiple_subs_number': (group['multiple.subs'] == 'yes').sum(),
    }


def _account_attributes(group, shipping_city, billing_city):
    """Return the account-level columns, each taken from the group's first row."""
    return {
        'account.id': group['account.id'].iloc[0],
        # -1 marks "no label available" (e.g. when the group has no label column)
        'label': group['label'].iloc[0] if 'label' in group.columns else -1,
        'shipping.zip.code': group['shipping.zip.code'].iloc[0],
        'billing.zip.code': group['billing.zip.code'].iloc[0],
        'shipping_city': shipping_city.iloc[0],
        'billing_city': billing_city.iloc[0],
        'relationship': group['relationship'].iloc[0],
        'amount.donated.2013': group['amount.donated.2013'].iloc[0],
        'amount.donated.lifetime': group['amount.donated.lifetime'].iloc[0],
        'no.donations.lifetime': group['no.donations.lifetime'].iloc[0],
        'first.donated': group['first.donated'].iloc[0],
    }


def handle_subscription(group):
    """Collapse one account's merged account/subscription rows into one feature row.

    Intended for ``DataFrame.groupby('account.id').apply``.

    Parameters
    ----------
    group : pandas.DataFrame
        All rows of a single account after merging with the account and
        subscription tables. A ``label`` column is optional.

    Returns
    -------
    pandas.DataFrame
        One row: the subscription aggregates listed in
        ``_SUBSCRIPTION_FEATURES`` followed by the account-level columns.
        If no row of the group has a ``season`` value, every aggregate is 0.
    """
    # Missing city names become empty strings.
    shipping_city = group['shipping.city'].fillna("")
    billing_city = group['billing.city'].fillna("")

    if group['season'].notna().any():
        features = _subscription_aggregates(group)
    else:
        # No subscription data for this account: every aggregate is 0.
        features = dict.fromkeys(_SUBSCRIPTION_FEATURES, 0)

    features.update(_account_attributes(group, shipping_city, billing_city))

    return pd.DataFrame({name: [value] for name, value in features.items()})


In [361]:
# Left-join account attributes and subscription history onto the training
# labels, then collapse to one feature row per account.
train_merged = (
    train
    .merge(account, on='account.id', how='left')
    .merge(subscriptions, on='account.id', how='left')
    .groupby('account.id', group_keys=False)
    .apply(handle_subscription)
    .reset_index(drop=True)
)


  train_merged = train_merged.groupby('account.id',group_keys=False).apply(handle_subscription)


#### Merge Tickets 

In [362]:
# Count missing `multiple.tickets` values. This is not the cell's last
# expression, so the notebook does not display the result.
tickets_all['multiple.tickets'].isna().sum()

# List the distinct raw price-level codes.
print(tickets_all['price.level'].unique())

['4' '1' '3' '2' nan '0' 'Adult' 'Youth' 'GA' '4.0']


In [363]:
# Performer names for the 2014-2015 season; handle_tickets() searches for them
# in the `who` column of `concerts`.
# NOTE(review): ' Ted Huffman' starts with a space, so it only matches when the
# name is preceded by a space in the searched text — confirm this is intended.
player_1415=['Nicholas McGegan','Steven Isserlis','Julian Wachner','Andreas Scholl','Dominique Labelle','Christopher Ainslie','Thomas Cooley','Dashon Burton',\
             'Bruce Lamott','Sherezade Panthaki','Clifton Massey','Brian Thorsett','Jeffrey Fields',\
                'Rachel Podger',' Ted Huffman']

# Programme keywords; handle_tickets() searches for them in the `what` column
# of `concerts`.
# NOTE(review): 'LÕestro armonico' looks like a mis-encoded "L'estro armonico";
# presumably it mirrors the spelling in the data file — verify.
key_content_1415 = ['LÕestro armonico','VIVALDI','HAYDN','HANDEL','BACH','CANTATA','TELEMANN']


In [364]:
import re

def handle_tickets(group):
    """Collapse one account's merged ticket rows into a single feature row.

    Intended for ``DataFrame.groupby('account.id').apply``. Reads the notebook
    globals ``concerts``, ``player_1415`` and ``key_content_1415``.

    Parameters
    ----------
    group : pandas.DataFrame
        All rows of one account after merging with ``tickets_all``. The ticket
        columns are ``no.seats``, ``price.level``, ``location``, ``set``,
        ``multiple.tickets``, ``season`` and ``marketing.source``; every other
        column is carried over from the group's first row (presumably constant
        within an account — verify against the merge).

    Returns
    -------
    pandas.DataFrame
        One row: the ticket aggregates followed by the carried-over columns.
        If no row has a ``season`` value, every aggregate is 0.
    """
    other_columns = group.columns.difference([
        'no.seats', 'price.level', 'location', 'set', 'multiple.tickets', 'season','marketing.source'
    ])

    if group['season'].notna().any():
        # The search patterns do not depend on the ticket row, so build them once.
        pattern_player = "|".join(map(re.escape, player_1415))
        pattern_content = "|".join(map(re.escape, key_content_1415))

        previous_player_ticket_number = 0
        previous_key_content_ticket_number = 0

        # A plain for-loop keeps `season`/`location` as function locals,
        # which the `@name` references inside DataFrame.query rely on.
        for season, location in zip(group['season'], group['location']):
            matching_concerts = concerts.query('season == @season and location == @location')

            # na=False: a concert with a missing `who`/`what` never counts as a match.
            if matching_concerts['who'].str.contains(pattern_player, regex=True, na=False).any():
                previous_player_ticket_number += 1

            if matching_concerts['what'].str.contains(pattern_content, regex=True, na=False).any():
                previous_key_content_ticket_number += 1

        aggregated_data = pd.DataFrame({
            # tickets sum
            'sum_tickets': [group['no.seats'].sum()],

            # price features
            'average_price_level': [group['price.level'].mean()],

            # seats number features (same value as sum_tickets; both columns are kept)
            'total_seats_ticket': [group['no.seats'].sum()],

            # location features
            'location_num_ticket': [group['location'].nunique()],

            # set features
            'set sum': [group['set'].sum()],

            # multiple_tickets features
            'multiple_tickets_num': [(group['multiple.tickets'] == 'yes').sum()],

            # number of ticket rows whose concert matched a 2014-15 performer / keyword
            'previous_player_ticket_number': [previous_player_ticket_number],
            'previous_key_content_ticket_number': [previous_key_content_ticket_number],
        })

    else:
        # No ticket history for this account: every aggregate is 0.
        aggregated_data = pd.DataFrame({
            'sum_tickets': [0],
            'average_price_level': [0],
            'total_seats_ticket': [0],
            'location_num_ticket': [0],
            'set sum': [0],
            'multiple_tickets_num': [0],
            'previous_player_ticket_number': [0],
            'previous_key_content_ticket_number': [0],
        })

    other_features = group.iloc[0][other_columns].to_frame().T.reset_index(drop=True)

    # Row extraction yields object columns; restore each column's original dtype.
    for col in other_columns:
        other_features[col] = other_features[col].astype(group[col].dtype)

    final_result = pd.concat([aggregated_data, other_features], axis=1)

    return final_result

In [365]:
# Impute missing `set` values with the mean `set` of the same account.
# Accounts whose `set` values are all missing stay NaN (their mean is NaN).
tickets_all['set']=tickets_all.groupby('account.id')['set'].transform(lambda x: x.fillna(x.mean()))

def handle_price_level(x):
    """Convert a raw price-level code to a float.

    The labels "Adult", "Youth" and "GA" map to NaN; anything else is passed
    through ``float()`` (so a missing value stays NaN and '4.0' becomes 4.0).
    """
    non_numeric_labels = ("Adult", "Youth", "GA")
    return float('nan') if x in non_numeric_labels else float(x)

# Turn the raw price-level codes into floats; "Adult", "Youth" and "GA" become NaN.
tickets_all['price.level'] = tickets_all['price.level'].apply(handle_price_level)

# Per account: replace missing price levels with that account's mean,
# or with -1 when the account has no known price level at all.
tickets_all['price.level'] = (
    tickets_all
    .groupby('account.id')['price.level']
    .transform(
        lambda levels: levels.fillna(levels.mean()) if levels.notna().any() else levels.fillna(-1)
    )
)

# Attach the ticket history to the training features and collapse it back
# to one row per account.
train_merged = (
    train_merged
    .merge(tickets_all, on='account.id', how='left')
    .groupby('account.id', group_keys=False)
    .apply(handle_tickets)
    .reset_index(drop=True)
)

  train_merged = train_merged.groupby('account.id',group_keys=False).apply(handle_tickets)


#### Missing Values, Duplicates and Outliers



##### Description

In [366]:
# Accounts whose mean subscription price level is still missing (expected: none).
train_merged.loc[train_merged['mean_price_level'].isna(), 'account.id']

Series([], Name: account.id, dtype: object)

In [367]:
# Column dtypes and non-null counts of the merged training frame.
train_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6941 entries, 0 to 6940
Data columns (total 39 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   sum_tickets                         6941 non-null   float64
 1   average_price_level                 6941 non-null   float64
 2   total_seats_ticket                  6941 non-null   float64
 3   location_num_ticket                 6941 non-null   int64  
 4   set sum                             6941 non-null   float64
 5   multiple_tickets_num                6941 non-null   int64  
 6   previous_player_ticket_number       6941 non-null   int64  
 7   previous_key_content_ticket_number  6941 non-null   int64  
 8   Box_number                          6941 non-null   int64  
 9   Gallery_number                      6941 non-null   int64  
 10  account.id                          6941 non-null   object 
 11  amount.donated.2013                 6941 no

##### Duplicates

In [368]:
# Remove exact duplicate rows, then drop the city columns: the original
# author treats them as redundant with the shipping/billing zip codes.
# Note that drop_duplicates() keeps the surviving rows' original index labels.
train_merged = (
    train_merged
    .drop_duplicates()
    .drop(columns=['shipping_city', 'billing_city'])
)

##### Missing Values


In [369]:
# Missing-value count per column before imputation.
train_merged.isnull().sum()

sum_tickets                              0
average_price_level                      0
total_seats_ticket                       0
location_num_ticket                      0
set sum                                  0
multiple_tickets_num                     0
previous_player_ticket_number            0
previous_key_content_ticket_number       0
Box_number                               0
Gallery_number                           0
account.id                               0
amount.donated.2013                      0
amount.donated.lifetime                  0
balcony_number                           0
billing.zip.code                       980
cyo_package                              0
dress_circle_number                      0
first.donated                         5007
floor_number                             0
full_package                             0
full_upgrade_package                     0
label                                    0
location_num_sub                         0
mean_price_

In [370]:
# Fill missing zip codes and relationship values with an empty string.
train_merged['shipping.zip.code']=train_merged['shipping.zip.code'].fillna("")
train_merged['billing.zip.code']=train_merged['billing.zip.code'].fillna("")
train_merged['relationship']=train_merged['relationship'].fillna("")
# first.donated, decided row by row:
#   - accounts with zero lifetime donations get the sentinel '1800-01-01 00:00:00';
#   - any other missing value becomes the integer 0;
#   - everything else is kept unchanged.
# NOTE(review): the integer 0 is presumably parsed by pd.to_datetime as the
# Unix epoch, i.e. a second, different sentinel — confirm this is intended.
train_merged['first.donated']=train_merged.apply(lambda x: '1800-01-01 00:00:00' if x['no.donations.lifetime']==0 else (0 if pd.isna(x['first.donated']) else x['first.donated']),axis=1)

train_merged.head(5)

Unnamed: 0,sum_tickets,average_price_level,total_seats_ticket,location_num_ticket,set sum,multiple_tickets_num,previous_player_ticket_number,previous_key_content_ticket_number,Box_number,Gallery_number,...,non_price_level_subscription,orchestra_number,premium_orchestra_number,quartet_package,relationship,section_type_numbers,shipping.zip.code,total_season,total_seats_sub,trio_package
0,0.0,0.0,0.0,0,0.0,0,0,0,0,0,...,0,0,0,0,,0,94102.0,0,0.0,0
1,0.0,0.0,0.0,0,0.0,0,0,0,0,0,...,2,4,4,0,,3,,11,20.0,0
2,0.0,0.0,0.0,0,0.0,0,0,0,0,0,...,3,13,4,0,,3,,21,39.0,0
3,0.0,0.0,0.0,0,0.0,0,0,0,0,0,...,1,0,2,3,,3,,13,25.0,0
4,0.0,0.0,0.0,0,0.0,0,0,0,0,0,...,3,6,4,0,,4,,19,35.0,0


Convert the `first.donated` column to timestamp (datetime) values.

In [371]:
# Parse first.donated into datetimes. After the previous cell the column holds
# the original values, the '1800-01-01 00:00:00' sentinel, or the integer 0.
train_merged['first.donated'] = pd.to_datetime(train_merged['first.donated'])

train_merged.head(5)

Unnamed: 0,sum_tickets,average_price_level,total_seats_ticket,location_num_ticket,set sum,multiple_tickets_num,previous_player_ticket_number,previous_key_content_ticket_number,Box_number,Gallery_number,...,non_price_level_subscription,orchestra_number,premium_orchestra_number,quartet_package,relationship,section_type_numbers,shipping.zip.code,total_season,total_seats_sub,trio_package
0,0.0,0.0,0.0,0,0.0,0,0,0,0,0,...,0,0,0,0,,0,94102.0,0,0.0,0
1,0.0,0.0,0.0,0,0.0,0,0,0,0,0,...,2,4,4,0,,3,,11,20.0,0
2,0.0,0.0,0.0,0,0.0,0,0,0,0,0,...,3,13,4,0,,3,,21,39.0,0
3,0.0,0.0,0.0,0,0.0,0,0,0,0,0,...,1,0,2,3,,3,,13,25.0,0
4,0.0,0.0,0.0,0,0.0,0,0,0,0,0,...,3,6,4,0,,4,,19,35.0,0


In [372]:
# Re-check missing values after imputation; every count should now be 0.
train_merged.isnull().sum()

sum_tickets                           0
average_price_level                   0
total_seats_ticket                    0
location_num_ticket                   0
set sum                               0
multiple_tickets_num                  0
previous_player_ticket_number         0
previous_key_content_ticket_number    0
Box_number                            0
Gallery_number                        0
account.id                            0
amount.donated.2013                   0
amount.donated.lifetime               0
balcony_number                        0
billing.zip.code                      0
cyo_package                           0
dress_circle_number                   0
first.donated                         0
floor_number                          0
full_package                          0
full_upgrade_package                  0
label                                 0
location_num_sub                      0
mean_price_level                      0
mean_subscription_tier                0


#### Feature Encoding

In [373]:
# ## WOE Encoding

import category_encoders as ce

# Weight-of-Evidence encode the two zip-code columns and `relationship`.
woe_columns = ['shipping.zip.code', 'billing.zip.code', 'relationship']

# Zip codes are cast to strings so the encoder treats them as categories.
train_merged['shipping.zip.code'] = train_merged['shipping.zip.code'].astype(str)
train_merged['billing.zip.code'] = train_merged['billing.zip.code'].astype(str)

# Pass the columns by keyword: the first positional parameter of WOEEncoder is
# `verbose`, so a positional list would not set `cols`.
woe = ce.WOEEncoder(cols=woe_columns)

# NOTE(review): the encoder is fitted on the labels of the full training set,
# and these encoded features are later used inside K-fold validation — the
# validation folds' labels are therefore already baked into the features.
woe.fit(train_merged[woe_columns], train_merged['label'])

train_merged[woe_columns] = woe.transform(train_merged[woe_columns])

#### Construct More Features

In [374]:
# Reference date for measuring how long ago the first donation happened.
now = pd.to_datetime('2014-09-01')

# Replace the raw first.donated timestamp with its age in whole days.
train_merged = (
    train_merged
    .assign(days_since_first_donation=lambda frame: (now - frame['first.donated']).dt.days)
    .drop(columns=['first.donated'])
)

train_merged.head(5)

Unnamed: 0,sum_tickets,average_price_level,total_seats_ticket,location_num_ticket,set sum,multiple_tickets_num,previous_player_ticket_number,previous_key_content_ticket_number,Box_number,Gallery_number,...,orchestra_number,premium_orchestra_number,quartet_package,relationship,section_type_numbers,shipping.zip.code,total_season,total_seats_sub,trio_package,days_since_first_donation
0,0.0,0.0,0.0,0,0.0,0,0,0,0,0,...,0,0,0,0.031785,0,1.375911,0,0.0,0,78405
1,0.0,0.0,0.0,0,0.0,0,0,0,0,0,...,4,4,0,0.031785,3,-0.013762,11,20.0,0,78405
2,0.0,0.0,0.0,0,0.0,0,0,0,0,0,...,13,4,0,0.031785,3,-0.013762,21,39.0,0,8460
3,0.0,0.0,0.0,0,0.0,0,0,0,0,0,...,0,2,3,0.031785,3,-0.013762,13,25.0,0,6346
4,0.0,0.0,0.0,0,0.0,0,0,0,0,0,...,6,4,0,0.031785,4,-0.013762,19,35.0,0,9190


#### Check TrainSet

In [375]:
# Final schema check of the training frame (dtypes and non-null counts).
train_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6941 entries, 0 to 6940
Data columns (total 37 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   sum_tickets                         6941 non-null   float64
 1   average_price_level                 6941 non-null   float64
 2   total_seats_ticket                  6941 non-null   float64
 3   location_num_ticket                 6941 non-null   int64  
 4   set sum                             6941 non-null   float64
 5   multiple_tickets_num                6941 non-null   int64  
 6   previous_player_ticket_number       6941 non-null   int64  
 7   previous_key_content_ticket_number  6941 non-null   int64  
 8   Box_number                          6941 non-null   int64  
 9   Gallery_number                      6941 non-null   int64  
 10  account.id                          6941 non-null   object 
 11  amount.donated.2013                 6941 no

In [376]:
# train_merged['has_previous_key_content'].sum()

#### TestSet

In [377]:
# Rename the test key column `ID` to `account.id` so it matches the other tables.
# rename() leaves the frame unchanged when `ID` is already gone, so re-running
# this cell is harmless (the previous copy-then-drop raised KeyError on a re-run).
test = test.rename(columns={'ID': 'account.id'})

In [378]:
# Same pipeline as for the training data: join account attributes and
# subscription history, collapse to one row per account, then discard the
# placeholder `label` column that handle_subscription adds.
test_merged = (
    test
    .merge(account, on='account.id', how='left')
    .merge(subscriptions, on='account.id', how='left')
    .groupby('account.id', group_keys=False)
    .apply(handle_subscription)
    .reset_index(drop=True)
    .drop(columns=['label'])
)

  test_merged = test_merged.groupby('account.id',group_keys=False).apply(handle_subscription)


In [379]:
# Attach the ticket history to the test features and collapse it back to one
# row per account.
test_merged = (
    test_merged
    .merge(tickets_all, on='account.id', how='left')
    .groupby('account.id', group_keys=False)
    .apply(handle_tickets)
    .reset_index(drop=True)
)

  test_merged = test_merged.groupby('account.id',group_keys=False).apply(handle_tickets)


In [380]:
# Remove exact duplicate rows, then drop the city columns: the original
# author treats them as redundant with the shipping/billing zip codes.
test_merged = (
    test_merged
    .drop_duplicates()
    .drop(columns=['shipping_city', 'billing_city'])
)


In [381]:
# Fill missing zip codes and relationship values with an empty string.
test_merged['shipping.zip.code']=test_merged['shipping.zip.code'].fillna("")
test_merged['billing.zip.code']=test_merged['billing.zip.code'].fillna("")
test_merged['relationship']=test_merged['relationship'].fillna("")
# first.donated, decided row by row:
#   - accounts with zero lifetime donations get the sentinel '1800-01-01 00:00:00';
#   - any other missing value becomes the integer 0;
#   - everything else is kept unchanged.
# NOTE(review): the integer 0 is presumably parsed by pd.to_datetime as the
# Unix epoch, i.e. a second, different sentinel — confirm this is intended.
test_merged['first.donated']=test_merged.apply(lambda x: '1800-01-01 00:00:00' if x['no.donations.lifetime']==0 else (0 if pd.isna(x['first.donated']) else x['first.donated']),axis=1)

# Parse first.donated into datetimes.
test_merged['first.donated'] = pd.to_datetime(test_merged['first.donated'])


In [382]:
# encode
# The encoder was fitted on zip codes cast to `str`, so the test zip codes must
# be cast the same way; otherwise non-string values cannot match any fitted
# category and are encoded as unknown.
test_merged['shipping.zip.code'] = test_merged['shipping.zip.code'].astype(str)
test_merged['billing.zip.code'] = test_merged['billing.zip.code'].astype(str)

test_merged[['shipping.zip.code', 'billing.zip.code','relationship']] = woe.transform(test_merged[['shipping.zip.code', 'billing.zip.code','relationship']])

In [383]:
# Reference date for measuring how long ago the first donation happened.
now = pd.to_datetime('2014-09-01')

# Replace the raw first.donated timestamp with its age in whole days.
test_merged = (
    test_merged
    .assign(days_since_first_donation=lambda frame: (now - frame['first.donated']).dt.days)
    .drop(columns=['first.donated'])
)

test_merged.head(5)



Unnamed: 0,sum_tickets,average_price_level,total_seats_ticket,location_num_ticket,set sum,multiple_tickets_num,previous_player_ticket_number,previous_key_content_ticket_number,Box_number,Gallery_number,...,orchestra_number,premium_orchestra_number,quartet_package,relationship,section_type_numbers,shipping.zip.code,total_season,total_seats_sub,trio_package,days_since_first_donation
0,0.0,0.0,0.0,0,0.0,0,0,0,0,0,...,11,4,0,0.031785,3,-0.013762,19,35.0,0,10286
1,0.0,0.0,0.0,0,0.0,0,0,0,0,0,...,9,4,0,0.031785,3,-0.013762,16,30.0,1,6356
2,0.0,0.0,0.0,0,0.0,0,0,0,0,0,...,7,4,0,0.031785,4,-0.013762,16,29.0,0,5752
3,0.0,0.0,0.0,0,0.0,0,0,0,0,0,...,3,3,3,0.031785,2,-0.013762,7,14.0,4,2668
4,0.0,0.0,0.0,0,0.0,0,0,0,0,0,...,9,3,4,0.031785,4,-0.013762,21,39.0,0,9555


## Train and Test Model

#### Data Processing

In [384]:
# Peek at the first few test account ids.
test_merged['account.id'].head()

0    001i000000LhyPF
1    001i000000LhyPG
2    001i000000LhyPP
3    001i000000LhyPb
4    001i000000LhyPg
Name: account.id, dtype: object

In [385]:
# Separate features from the target; the account id is an identifier, not a feature.
X_train = train_merged.drop(columns=['label', 'account.id'])
y_train = train_merged['label']

X_test = test_merged.drop(columns=['account.id'])



In [386]:
# X_train.drop(['shipping.zip.code','billing.zip.code'],axis=1,inplace=True)
# X_test.drop(['shipping.zip.code','billing.zip.code'],axis=1,inplace=True)

In [387]:
# Standardise the numeric features. The scaler is fitted on the training data
# only and then reused, unchanged, for the test data.
scaler = StandardScaler()

numerical_cols = X_train.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Scale train data (work on a copy so X_train itself stays unscaled)
train_scaled = X_train.copy()
train_scaled[numerical_cols] = scaler.fit_transform(train_scaled[numerical_cols])

# Scale test data with the statistics learned from the training data
test_scaled = X_test.copy()
test_scaled[numerical_cols] = scaler.transform(test_scaled[numerical_cols])


#### Model Training And Prediction

In [388]:
# import modules

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor as cat

from sklearn.metrics import roc_auc_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [389]:
# Multi-Layer Perceptron

# Hold out 20% of the training rows for validation (fixed seed for repeatability).
# NOTE(review): this splits the unscaled X_train, not train_scaled — confirm
# that feeding unscaled features to the MLP is intended.
X_train_torch, X_val_torch, y_train_torch, y_val_torch = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Define the dataset

class TabularDataset(Dataset):
    """Torch dataset over a pandas feature table and optional target.

    ``X`` and ``y`` must expose ``.values`` (e.g. DataFrame / Series); both are
    stored as float32 tensors. Without ``y`` each item is the feature tensor
    alone; with ``y`` each item is a ``(features, target)`` pair.
    """

    def __init__(self, X, y=None):
        self.X = torch.tensor(X.values, dtype=torch.float32)
        if y is None:
            self.y = None
        else:
            self.y = torch.tensor(y.values, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        features = self.X[idx]
        if self.y is None:
            return features
        return features, self.y[idx]
    
# Wrap the train/validation splits as torch datasets.
train_dataset = TabularDataset(X_train_torch, y_train_torch)

val_dataset = TabularDataset(X_val_torch, y_val_torch)

# Define the Loader
# Training batches are reshuffled every epoch; validation keeps its order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# Define the MLP model
class MLP(nn.Module):
    """Feed-forward binary classifier: input -> 128 -> 64 -> 1.

    Each hidden layer is followed by ReLU and Dropout(0.3); the single output
    unit passes through a sigmoid, so the forward pass returns values in (0, 1).
    """

    def __init__(self, input_size):
        super().__init__()

        layers = []
        in_features = input_size
        for width in (128, 64):
            layers.extend([nn.Linear(in_features, width), nn.ReLU(), nn.Dropout(0.3)])
            in_features = width
        layers.extend([nn.Linear(in_features, 1), nn.Sigmoid()])

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)



In [390]:
# Initialize the model, loss function, and optimizer
input_size = X_train_torch.shape[1]  # one input unit per feature column
model = MLP(input_size)
criterion = nn.BCELoss()  # binary cross-entropy on the model's sigmoid outputs
optimizer = optim.Adam(model.parameters(), lr=0.001)


# Training function
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=2000):
    """Train ``model`` in place and report validation metrics every 10th epoch.

    Parameters
    ----------
    model : torch module mapping a feature batch to one prediction per row.
    train_loader, val_loader : iterables yielding ``(X_batch, y_batch)`` pairs.
    criterion : loss called as ``criterion(predictions, targets)``.
    optimizer : optimizer over ``model``'s parameters.
    epochs : number of full passes over ``train_loader``.

    Returns
    -------
    None. Progress (mean train loss, mean validation loss, validation AUROC)
    is printed for epochs 1, 11, 21, ...
    """
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            # reshape(-1) instead of a bare squeeze(): a final batch holding a
            # single row must stay 1-D, otherwise its shape no longer matches
            # y_batch.
            outputs = model(X_batch).reshape(-1)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        model.eval()
        val_loss = 0
        val_preds = []
        val_targets = []
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                # Kept 1-D so that .tolist() always yields a list for extend().
                outputs = model(X_batch).reshape(-1)
                loss = criterion(outputs, y_batch)
                val_loss += loss.item()
                val_preds.extend(outputs.tolist())
                val_targets.extend(y_batch.tolist())

        # Calculate AUROC for validation set
        val_auroc = roc_auc_score(val_targets, val_preds)
        if epoch%10==0:
            print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss/len(train_loader):.4f}, "
                f"Val Loss: {val_loss/len(val_loader):.4f}, Val AUROC: {val_auroc:.4f}")

def mlp():
    """Train the global MLP and return its predictions for ``X_test``.

    Uses the notebook globals ``model``, ``train_loader``, ``val_loader``,
    ``criterion``, ``optimizer`` and ``X_test``.

    Returns
    -------
    list of float
        One sigmoid output per row of ``X_test``, in row order.
    """
    # Train the model
    train_model(model, train_loader, val_loader, criterion, optimizer)

    # Make predictions on the test set
    test_dataset = TabularDataset(X_test)
    test_loader = DataLoader(test_dataset, batch_size=32)
    model.eval()
    test_preds = []
    with torch.no_grad():
        for X_batch in test_loader:
            # reshape(-1) keeps a single-row final batch 1-D, so .tolist()
            # always returns a list that extend() can consume.
            outputs = model(X_batch).reshape(-1)
            test_preds.extend(outputs.tolist())

    # Hand the predictions back to the caller instead of discarding them.
    return test_preds

In [391]:
# Kfold cross validation

# Number of folds.
splits = 5

# Shuffled splitter with a fixed seed, so the folds are the same on every run.
kf = KFold(n_splits=splits, shuffle=True, random_state=42)

# NOTE(review): not referenced by the code shown in this notebook (cv_model
# passes literal empty lists to the libraries) — possibly a leftover.
cat_features = []

def cv_model(clf, x_train, y_train, x_test, clf_name , kf):
    """Cross-validate one boosting backend and average its test predictions.

    Parameters
    ----------
    clf : backend to drive — the ``lightgbm`` module for "lgb", the
        ``xgboost`` module for "xgb", or a CatBoost estimator class for "cat".
    x_train : pandas.DataFrame of training features (rows selected via ``.iloc``).
    y_train : training labels; a pandas Series (any index) or a NumPy array.
    x_test : features that every fold's model predicts for.
    clf_name : one of "lgb", "xgb", "cat".
    kf : splitter whose ``split(X, y)`` yields positional
        ``(train_index, valid_index)`` pairs.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the per-fold predictions for ``x_test``.

    Raises
    ------
    ValueError
        If ``clf_name`` is not one of the supported backends.
    """
    if clf_name not in ("lgb", "xgb", "cat"):
        raise ValueError(
            "Unknown clf_name {!r}; expected 'lgb', 'xgb' or 'cat'".format(clf_name)
        )

    cv_scores = []

    test_all = []

    # The fold indices are positions, not index labels. Select labels by
    # position so the function also works when y_train does not carry a
    # default 0..n-1 index; plain arrays are indexed directly.
    y_by_position = y_train.iloc if hasattr(y_train, "iloc") else y_train

    for i, (train_index, valid_index) in enumerate(kf.split(x_train, y_train)):

        print('************************************ {} ************************************'.format(str(i+1)))

        trn_x, val_x = x_train.iloc[train_index], x_train.iloc[valid_index]
        trn_y, val_y = y_by_position[train_index], y_by_position[valid_index]

        # LightGBM
        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            # training parameters
            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 4,
                'num_leaves': 2 ** 4,
                'lambda_l2': 10,
                'feature_fraction': 0.7,
                'bagging_fraction': 0.7,
                'bagging_freq': 10,
                'learning_rate': 0.15,
                'seed': 2022,
                'n_jobs':-1,
                'verbose':-1
            }
            # model training
            # NOTE(review): no early-stopping argument is passed here, so all
            # 30000 rounds appear to be trained — confirm this is intended.
            model = clf.train(params, train_matrix, 30000, valid_sets=[train_matrix, valid_matrix],
                              categorical_feature=[])
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(x_test, num_iteration=model.best_iteration)

        # XGBoost
        elif clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            test_matrix = clf.DMatrix(x_test)

            # training parameters
            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 7,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.125,
                      'tree_method': 'exact',
                      'seed': 2020,
                      'nthread': 36,
                      "silent": True,
                      }

            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]

            # model training
            # NOTE(review): no early-stopping argument is passed here, so all
            # 50000 rounds appear to be trained — confirm this is intended.
            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist,verbose_eval=True)
            val_pred  = model.predict(valid_matrix)
            test_pred = model.predict(test_matrix)

        # Catboost
        else:

            # training parameters
            params = {'learning_rate': 0.134,
            'depth': 10 ,
            'l2_leaf_reg': 5,
            'bootstrap_type': 'Bernoulli',
            'od_type': 'Iter',
            'od_wait': 2000,
            'random_seed': 164,
            'allow_writing_files': False
            }

            # model training; use_best_model keeps the iteration that scored
            # best on the validation fold
            model = clf(iterations=30000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=600)

            val_pred  = model.predict(val_x)
            test_pred = model.predict(x_test)

        test_all.append(test_pred)

        cv_scores.append(roc_auc_score(val_y, val_pred))

        print(cv_scores)

    # output
    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))

    # Stack the per-fold test predictions: one row per fold
    test_all_array = np.array(test_all)

    # Average across folds (axis=0), giving one prediction per test row
    mean_output = np.mean(test_all_array, axis=0)

    return mean_output
        
def lgb_model(x_train, y_train, x_test):
    """Run the shared ``cv_model`` routine with the ``lgb`` backend.

    The data is forwarded unchanged to ``cv_model`` together with the
    module-level ``kf`` object and the name tag ``"lgb"``.

    Args:
        x_train: Training features, passed straight through.
        y_train: Training targets, passed straight through.
        x_test: Test features, passed straight through.

    Returns:
        Whatever ``cv_model`` returns; its visible tail returns the
        element-wise mean of the per-fold test predictions.
    """
    # presumably `lgb` is the LightGBM module and `kf` a KFold splitter;
    # both are defined outside this function -- TODO confirm.
    return cv_model(lgb, x_train, y_train, x_test, "lgb", kf)

def xgb_model(x_train, y_train, x_test):
    """Run the shared ``cv_model`` routine with the ``xgb`` backend.

    The data is forwarded unchanged to ``cv_model`` together with the
    module-level ``kf`` object and the name tag ``"xgb"``.

    Args:
        x_train: Training features, passed straight through.
        y_train: Training targets, passed straight through.
        x_test: Test features, passed straight through.

    Returns:
        Whatever ``cv_model`` returns; its visible tail returns the
        element-wise mean of the per-fold test predictions.
    """
    # presumably `xgb` is the XGBoost module and `kf` a KFold splitter;
    # both are defined outside this function -- TODO confirm.
    return cv_model(xgb, x_train, y_train, x_test, "xgb", kf)

def cat_model(x_train, y_train, x_test):
    """Run the shared ``cv_model`` routine with the ``cat`` backend.

    The data is forwarded unchanged to ``cv_model`` together with the
    module-level ``kf`` object and the name tag ``"cat"``, which selects
    the ``clf_name == "cat"`` branch inside ``cv_model``.

    Args:
        x_train: Training features, passed straight through.
        y_train: Training targets, passed straight through.
        x_test: Test features, passed straight through.

    Returns:
        Whatever ``cv_model`` returns; its visible tail returns the
        element-wise mean of the per-fold test predictions.
    """
    # `cat` is instantiated inside cv_model as `clf(iterations=..., **params)`,
    # so it must be a callable model class -- presumably a CatBoost estimator.
    return cv_model(cat, x_train, y_train, x_test, "cat", kf)

def mlp_model(x_train, y_train, x_test):
    """Return the result of calling ``mlp()`` with no arguments.

    Args:
        x_train: Accepted but not used by this function.
        y_train: Accepted but not used by this function.
        x_test: Accepted but not used by this function.

    Returns:
        Whatever ``mlp()`` returns.
    """
    # NOTE(review): none of the arguments are forwarded, so `mlp` must get its
    # data from somewhere else (presumably notebook globals) -- confirm this
    # is intended rather than an unfinished wrapper.
    return mlp()

In [392]:
# Preview the first five rows of the test features before they are passed to the models.
X_test.head(n=5)

Unnamed: 0,sum_tickets,average_price_level,total_seats_ticket,location_num_ticket,set sum,multiple_tickets_num,previous_player_ticket_number,previous_key_content_ticket_number,Box_number,Gallery_number,...,orchestra_number,premium_orchestra_number,quartet_package,relationship,section_type_numbers,shipping.zip.code,total_season,total_seats_sub,trio_package,days_since_first_donation
0,0.0,0.0,0.0,0,0.0,0,0,0,0,0,...,11,4,0,0.031785,3,-0.013762,19,35.0,0,10286
1,0.0,0.0,0.0,0,0.0,0,0,0,0,0,...,9,4,0,0.031785,3,-0.013762,16,30.0,1,6356
2,0.0,0.0,0.0,0,0.0,0,0,0,0,0,...,7,4,0,0.031785,4,-0.013762,16,29.0,0,5752
3,0.0,0.0,0.0,0,0.0,0,0,0,0,0,...,3,3,3,0.031785,2,-0.013762,7,14.0,4,2668
4,0.0,0.0,0.0,0,0.0,0,0,0,0,0,...,9,3,4,0.031785,4,-0.013762,21,39.0,0,9555


In [393]:
# lgb_test=lgb_model(X_train, y_train, X_test)

In [394]:
# xgb_test=xgb_model(X_train, y_train, X_test)

In [395]:
# Train the "cat" backend through cv_model and keep the fold-averaged
# test predictions for the submission cell below.
cat_test = cat_model(x_train=X_train, y_train=y_train, x_test=X_test)

************************************ 1 ************************************
0:	learn: 0.1960506	test: 0.2281617	best: 0.2281617 (0)	total: 9.1ms	remaining: 4m 32s
600:	learn: 0.0403618	test: 0.1442335	best: 0.1419598 (155)	total: 5.03s	remaining: 4m 6s
1200:	learn: 0.0336510	test: 0.1458242	best: 0.1419598 (155)	total: 10.1s	remaining: 4m 2s
1800:	learn: 0.0327626	test: 0.1461151	best: 0.1419598 (155)	total: 15.1s	remaining: 3m 56s
Stopped by overfitting detector  (2000 iterations wait)

bestTest = 0.1419598138
bestIteration = 155

Shrink model to first 156 iterations.
[0.9614805692391899]
************************************ 2 ************************************
0:	learn: 0.2025163	test: 0.2032929	best: 0.2032929 (0)	total: 9.24ms	remaining: 4m 37s
600:	learn: 0.0481144	test: 0.1403415	best: 0.1376448 (140)	total: 4.95s	remaining: 4m 2s
1200:	learn: 0.0426420	test: 0.1416438	best: 0.1376448 (140)	total: 9.95s	remaining: 3m 58s
1800:	learn: 0.0420255	test: 0.1419870	best: 0.1376448 (1

In [396]:
# mlp_test=mlp_model(X_train, y_train, X_test)

## Output Data to Final Submission

In [397]:
# Pair each test account id with its prediction.
# The CatBoost predictions (cat_test) are used as the final model output.
output = pd.DataFrame(
    data={
        'ID': test_merged['account.id'],
        'Predicted': cat_test,
    }
)

# Write the submission file without the DataFrame index column.
output.to_csv(path_or_buf='submission.csv', index=False)

