In [None]:
import numpy as np, pandas as pd
pd.options.display.max_columns = 100
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Directory holding the input CSV files. The trailing slash is required:
# file names are appended to this string by plain concatenation.
base_dir = "../input/beyond-analysis/"

In [None]:
# Read the train and test tables from the input directory.
_read_split = lambda split: pd.read_csv(f"{base_dir}{split}.csv")
train = _read_split("train")
test = _read_split("test")

In [None]:
# Report (rows, columns) for both splits.
for label, frame in (("Train", train), ("Test", test)):
    print(f"{label} shape:  {frame.shape}")

In [None]:
print("Total customers in train set:")
# dropna=False keeps a missing identifier counted as its own value,
# matching len(Series.unique()).
train['UNIQUE_IDENTIFIER'].nunique(dropna=False)

In [None]:
print("Total customers in test set:")
# dropna=False keeps a missing identifier counted as its own value,
# matching len(Series.unique()).
test['UNIQUE_IDENTIFIER'].nunique(dropna=False)

In [None]:
# Summarise the training frame: column dtypes, non-null counts and memory usage.
train.info()

In [None]:
# Frequency of each STATUS_CHECK value in the train set.
# The vectors are passed by keyword: seaborn >= 0.12 raises a TypeError when
# the data vectors are given positionally.
status_counts = train['STATUS_CHECK'].value_counts()
ax = sns.barplot(x=status_counts.index, y=status_counts.values)
_ = ax.set(xlabel='STATUS_CHECK', ylabel='count')

In [None]:
# Frequency of each ACTIVE_YN value in the train set.
# The vectors are passed by keyword: seaborn >= 0.12 raises a TypeError when
# the data vectors are given positionally.
active_counts = train['ACTIVE_YN'].value_counts()
ax = sns.barplot(x=active_counts.index, y=active_counts.values)
_ = ax.set(xlabel='ACTIVE_YN', ylabel='count')

In [None]:
# Box plot of the number of train rows belonging to each customer.
fig, ax = plt.subplots(figsize=(12, 6))
rows_per_customer = train['UNIQUE_IDENTIFIER'].value_counts()
sns.boxplot(y=rows_per_customer, ax=ax)
_ = ax.set_title("Customer distribution", fontsize=18)

### Features that are not constant for a single customer

In [None]:
# Number of distinct values each column takes per customer.
# The built-in groupby nunique replaces a per-group Python lambda;
# dropna=False keeps NaN counted as a value, as len(x.unique()) did.
train_grp = train.groupby("UNIQUE_IDENTIFIER").nunique(dropna=False)

In [None]:
# Keep only the per-customer distinct-value counts that differ from 1
# (the rest become NaN and are skipped by sum), total them per column and
# divide by the number of train rows.
# NOTE(review): the numerator sums distinct-value counts and the denominator
# is the row count, not the customer count - confirm this is the intended metric.
varying_counts = train_grp.where(train_grp != 1)
(varying_counts.sum() / len(train)).round(2)

# Customer Statistics

In [None]:
def calc_customer_stats(df, stats=("mean",)):
    """Aggregate the activity columns to one row per customer.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``UNIQUE_IDENTIFIER`` and every column listed in
        ``stat_cols`` below; a missing column raises ``KeyError``.
    stats : str or sequence, default ("mean",)
        Aggregation(s) applied to every statistic column, in any form
        accepted by ``GroupBy.agg`` (e.g. ``"sum"``, ``"max"``, ``np.std``).
        The default reproduces the mean-only output.

    Returns
    -------
    pandas.DataFrame
        One row per ``UNIQUE_IDENTIFIER`` with flat column names of the form
        ``<column>_<stat>`` (e.g. ``ENTRY_mean``), identifier as a column.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(stats, str):
        stats = [stats]
    stats = list(stats)

    stat_cols = [
        "ENTRY",
        "REVENUE",
        "WINNINGS_1",
        "WINNINGS_2",
        "DISCOUNT",
        "DEPOSIT",
        "DEPOSIT_NUMBER",
        "DEPOSIT_2",
        "WITHDRAW",
        "WITHDRAW_NUMBER",
        "DEPOSIT_TRAILS",
        "ENTRY_NUMBER",
        "WINNINGS_NUMBER",
        "PRACTICE_ENTRY",
        "PRACTICE_WINNINGS",
        "PRACTICE_ENTRY_NUMBER",
        "PRACTICE_WINNINGS_NUMBER",
    ]
    agg_dict = {col: stats for col in stat_cols}

    df_grp = df.groupby("UNIQUE_IDENTIFIER").agg(agg_dict)
    # Flatten the (column, stat) MultiIndex into "column_stat" names.
    df_grp.columns = ["_".join(col) for col in df_grp.columns]
    return df_grp.reset_index()

In [None]:
# Build the per-customer training frame: the aggregated statistics joined
# with the first observed value of each customer-level column.
label_cols = ["CATEGORY_1", "CATEGORY_2", "STATUS_CHECK", "Y1", "Y2"]
temp = (
    train.groupby("UNIQUE_IDENTIFIER")[label_cols]
    .agg(lambda values: values.unique()[0])
    .reset_index()
)
train_ = calc_customer_stats(train).merge(
    temp,
    on="UNIQUE_IDENTIFIER",
    how="left",
)

In [None]:
# Preview the first rows of the per-customer frame.
train_.head()

In [None]:
# Pairwise relationships between the columns of train_, with histograms on the diagonal.
sns.pairplot(train_, diag_kind="hist")