In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

## Load Data

In [20]:
account = pd.read_csv('data/account.csv', encoding='ISO-8859-1')
concerts_1415=pd.read_csv('data/concerts_2014-15.csv')
concerts=pd.read_csv('data/concerts.csv')
sample_submission=pd.read_csv('data/sample_submission.csv')
subscriptions=pd.read_csv('data/subscriptions.csv')
test=pd.read_csv('data/test.csv')
tickets_all=pd.read_csv('data/tickets_all.csv')
train=pd.read_csv('data/train.csv')
zipcodes=pd.read_csv('data/zipcodes.csv')

In [30]:
subscriptions['section'].unique()

array(['Premium Orchestra', 'Orchestra', 'Balcony Front', 'Balcony Rear',
       'Orchestra Front', nan, 'Dress Circle', 'Balcony', 'Gallery',
       'Box', 'Orchestra Rear', 'Santa Rosa', 'Boxes House Right',
       'Boxes House Left', 'Floor'], dtype=object)

#### Merge Data and Construct Train and Test Data

In [45]:
# Merge train with account
train_merged = pd.merge(train, account, on='account.id', how='left')

# Merge train with subscriptions
train_merged = pd.merge(train_merged, subscriptions, on='account.id', how='left')

# Handle subscription data
def handle_subscription(group):

    # Safely handle NaN values in cities
    shipping_city = group['shipping.city'].fillna("")
    billing_city = group['billing.city'].fillna("")

    return pd.Series({
        # Total season:
        'total_season': group.shape[0] if group['season'].notna().any() else 0,

        # Package features
        'full_package': (group['package'] == 'Full').sum(),
        'quartet_package': (group['package'].isin(['Quartet', 'Quartet A', 'Quartet B', 'Quartet CC'])).sum(),
        'trio_package': (group['package'].isin(['Trio', 'Trio A', 'Trio B'])).sum(),
        'cyo_package': (group['package'] == 'CYO').sum(),
        'full_upgrade_package': (group['package'] == 'Full upgrade').sum(),

        # Seat features
        'total_seats': group['no.seats'].sum(),

        # Location features
        'location_num': group['location'].nunique() if group['season'].notna().any() else 0,
        'location_near_resident_sum': (group['location'] == shipping_city).sum() if shipping_city.any() else (group['location'] == billing_city).sum(),
        'location_not_resident_num': group.shape[0] - (group['location'] == shipping_city).sum() if shipping_city.any() else group.shape[0] - (group['location'] == billing_city).sum(),

        # Section features
        'section_type_numbers': group['section'].nunique() if group['season'].notna().any() else 0,
        'premium_orchestra_number': (group['section'] == 'Premium Orchestra').sum(),
        'orchestra_number': (group['section'].isin(['Orchestra', 'Orchestra Front', 'Orchestra Rear'])).sum(),
        'balcony_number': (group['section'].isin(['Balcony Front', 'Balcony Rear', 'Balcony', 'Santa Rosa'])).sum(),
        'dress_circle_number': (group['section'] == 'Dress Circle').sum(),
        'Gallery_number': (group['section'] == 'Gallery').sum(),
        'Box_number': (group['section'].isin(['Box', 'Box House Left', 'Box House Right'])).sum(),
        'Floor': (group['section'] == 'Floor').sum(),

        # Price features
        'mean price level': group['price.level'].mean() if group['season'].notna().any() else 0,

        # Subscription tier features
        'mean subscription tier': group['subscription_tier'].mean() if group['season'].notna().any() else 0,
        'multiple subs number': (group['multiple.subs'] == 'yes').sum()
    })

# Apply the function using groupby
train_merged = train_merged.groupby('account.id').apply(handle_subscription)

train_merged.head(5)

  train_merged = train_merged.groupby('account.id').apply(handle_subscription)


Unnamed: 0_level_0,total_season,full_package,quartet_package,trio_package,cyo_package,full_upgrade_package,total_seats,location_num,location_near_resident_sum,location_not_resident_num,...,premium_orchestra_number,orchestra_number,balcony_number,dress_circle_number,Gallery_number,Box_number,Floor,mean price level,mean subscription tier,multiple subs number
account.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
001i000000LhSrQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
001i000000LhyPH,11.0,11.0,0.0,0.0,0.0,0.0,20.0,1.0,11.0,0.0,...,4.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,1.818182,0.0
001i000000LhyPI,21.0,21.0,0.0,0.0,0.0,0.0,39.0,1.0,0.0,21.0,...,4.0,13.0,0.0,0.0,0.0,0.0,0.0,1.5,1.857143,0.0
001i000000LhyPS,13.0,10.0,3.0,0.0,0.0,0.0,25.0,2.0,0.0,13.0,...,2.0,0.0,8.0,1.0,0.0,0.0,0.0,2.416667,1.692308,0.0
001i000000LhyPT,19.0,16.0,0.0,0.0,3.0,0.0,35.0,1.0,0.0,19.0,...,4.0,6.0,5.0,0.0,0.0,0.0,0.0,1.8125,1.684211,0.0
