In [26]:
import numpy as np
import pandas as pd
import os

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [27]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amex-default-prediction/sample_submission.csv
/kaggle/input/amex-default-prediction/train_data.csv
/kaggle/input/amex-default-prediction/test_data.csv
/kaggle/input/amex-default-prediction/train_labels.csv


In [28]:
train_data = pd.read_csv("/kaggle/input/amex-default-prediction/train_data.csv", nrows=10000)
print(train_data.shape)

(10000, 190)


In [29]:
train_labels = pd.read_csv("/kaggle/input/amex-default-prediction/train_labels.csv", nrows=10000)
print(train_labels.shape)

(10000, 2)


In [30]:
def derive_col_formats(df):

    """
    objective
    =========
    search through dataframe and return cat and numeric cols
    """
    categorical_feature_mask = df.dtypes==object
    categorical_features = df.columns[categorical_feature_mask].tolist()
    
    numeric_feature_mask = df.dtypes!=object
    numeric_features = df.columns[numeric_feature_mask].tolist()
    
    return categorical_features, numeric_features

In [31]:
categorical_features, numeric_features = derive_col_formats(train_data)

In [32]:
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [33]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [34]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [35]:
clf = Pipeline([
     ('preprocessor', preprocessor),
     ('clf', RandomForestClassifier())
])

In [36]:
X=train_data
y=train_labels['target']

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.33, 
                                                    random_state=4)

In [38]:
clf.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['P_2', 'D_39', 'B_1', 'B_2',
                                                   'R_1', 'S_3', 'D_41', 'B_3',
                                                   'D_42', 'D_43', 'D_44',
                                                   'B_4', 'D_45', 'B_5', 'R_2',
                                                   'D_46', 'D_47', 'D_48',
                                                   'D_49', 'B_6', 'B_7', 'B_8',
                                                   'D_50', 'D_51', 'B_9', 'R_3',
              

In [40]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

In [41]:
y_true=pd.DataFrame(y_test)
y_pred=pd.DataFrame(clf.predict(X_test), columns=['prediction'])
amex_metric(y_true, y_pred)

0.008345105430024988

In [None]:
test_data=pd.read_csv("/kaggle/input/amex-default-prediction/test_data.csv")

In [None]:
clf.predict(test_data)

In [None]:
my_submission.to_csv('submission.csv', index=False)