## Conjunto de datos

In [1]:
import pandas as pd
import os
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Remote CSV files holding the training and testing samples.
URL_TRAIN_DATA = "https://www.famaf.unc.edu.ar/~nocampo043/travel_insurance_prediction_train.csv"
URL_TEST_DATA = "https://www.famaf.unc.edu.ar/~nocampo043/travel_insurance_prediction_test.csv"

# Download both files (training first, then testing) into DataFrames.
train_df, test_df = (pd.read_csv(url)
                     for url in (URL_TRAIN_DATA, URL_TEST_DATA))

def save_predictions(model, test_id, X_test, filename):
    """Predict labels for ``X_test`` and write them to ``filename`` as CSV.

    Parameters
    ----------
    model
        Any object exposing ``predict(X_test)``.
    test_id
        Iterable of customer identifiers, paired positionally with the
        predictions.
    X_test
        Feature matrix handed to ``model.predict``.
    filename
        Destination passed to ``DataFrame.to_csv``.

    The written file has a header row with the columns ``Customer`` and
    ``TravelInsurance`` and no index column. Nothing is returned.
    """
    predicted_labels = model.predict(X_test)

    # zip pairs ids and predictions by position and stops at the shorter one.
    rows = [(customer, label)
            for customer, label in zip(test_id, predicted_labels)]

    submission = pd.DataFrame(rows, columns=["Customer", "TravelInsurance"])
    submission.to_csv(filename, header=True, index=False)

In [2]:
# Inspect the training samples loaded from URL_TRAIN_DATA.
train_df

Unnamed: 0,Customer,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance
0,1,33,Private Sector/Self Employed,Yes,550000,6,0,No,No,1
1,2,28,Private Sector/Self Employed,Yes,800000,7,0,Yes,No,0
2,3,31,Private Sector/Self Employed,Yes,1250000,4,0,No,No,0
3,4,31,Government Sector,No,300000,7,0,No,No,0
4,5,28,Private Sector/Self Employed,Yes,1250000,3,0,No,No,0
...,...,...,...,...,...,...,...,...,...,...
1485,1486,31,Government Sector,No,300000,5,0,No,No,0
1486,1487,31,Private Sector/Self Employed,Yes,950000,3,0,Yes,No,0
1487,1488,28,Private Sector/Self Employed,Yes,1250000,5,0,No,No,0
1488,1489,31,Government Sector,Yes,1300000,5,0,No,No,0


In [3]:
# Inspect the testing samples loaded from URL_TEST_DATA; unlike the training
# set, the displayed columns do not include TravelInsurance.
test_df

Unnamed: 0,Customer,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad
0,1491,29,Private Sector/Self Employed,Yes,1100000,4,0,No,No
1,1492,28,Private Sector/Self Employed,Yes,750000,5,1,Yes,No
2,1493,31,Government Sector,Yes,1500000,4,0,Yes,Yes
3,1494,28,Private Sector/Self Employed,Yes,1400000,3,0,No,Yes
4,1495,33,Private Sector/Self Employed,Yes,1500000,4,0,Yes,Yes
...,...,...,...,...,...,...,...,...,...
492,1983,33,Government Sector,Yes,1750000,6,0,No,No
493,1984,25,Private Sector/Self Employed,No,1150000,3,1,No,No
494,1985,29,Private Sector/Self Employed,Yes,1050000,5,0,No,No
495,1986,25,Government Sector,Yes,750000,2,0,No,Yes


In [4]:
# Seed used for the train/validation split below.
seed = 0

# Features are every column except the customer id and the target label.
# NOTE(review): the frames are displayed with an "Unnamed: 0" column; if that
# is a saved row index in the CSV it stays in the feature matrix -- confirm.
X_train_total = train_df.drop(columns=["Customer", "TravelInsurance"])
y_train_total = train_df["TravelInsurance"]

# Hold out 20% of the labelled samples for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_total,
    y_train_total,
    test_size=0.2,
    random_state=seed,
)

# The test set only needs its customer id removed.
X_test = test_df.drop(columns=["Customer"])

In [5]:
# Preprocessing: Age and AnnualIncome are discretized into ordinal quantile
# bins, the listed categorical columns are one-hot encoded, and every column
# not listed is passed through unchanged (remainder="passthrough").
binned_columns = ["Age", "AnnualIncome"]
one_hot_columns = [
    "Employment Type", "GraduateOrNot", "FamilyMembers", "FrequentFlyer",
    "EverTravelledAbroad"
]

preprocessor = ColumnTransformer(
    [
        ("discretizer",
         KBinsDiscretizer(n_bins=16, encode="ordinal", strategy="quantile"),
         binned_columns),
        ("encoder",
         OneHotEncoder(categories="auto", dtype="int",
                       handle_unknown="ignore"),
         one_hot_columns),
    ],
    remainder="passthrough",
)

# Fit on the training split only, then apply the fitted transformer to the
# validation and test sets.
# NOTE: X_train / X_val / X_test are rebound to the transformed output, so
# re-run the split cell before re-running this one.
X_train = preprocessor.fit_transform(X_train)
X_val = preprocessor.transform(X_val)
X_test = preprocessor.transform(X_test)




## Modelo con AutoML



In [6]:
from autoPyTorch.api.tabular_classification import TabularClassificationTask

# AutoPyTorch task with its own fixed seed.
api = TabularClassificationTask(seed=42)

# Options for the search: optimize F1 within the given time limits.
search_options = {
    "dataset_name": 'travel_insurance',
    "optimize_metric": 'f1',
    "total_walltime_limit": 500,
    "func_eval_time_limit_secs": 100,
}

# The validation split is handed to the search as X_test / y_test; copies are
# passed, presumably so the search cannot modify the originals.
api.search(
    X_train=X_train,
    y_train=y_train,
    X_test=X_val.copy(),
    y_test=y_val.copy(),
    **search_options
)

# Score the result on the validation split, then report the selected models
# and the search statistics.
y_pred = api.predict(X_val)
score = api.score(y_pred, y_val)
print(score)
print(api.show_models())
print(api.sprint_statistics())





{'f1': 0.6349206349206349}
|    | Preprocessing                                                 | Estimator                                                       |   Weight |
|---:|:--------------------------------------------------------------|:----------------------------------------------------------------|---------:|
|  0 | SimpleImputer,NoEncoder,MinMaxScaler,KernelPCA                | no embedding,ShapedMLPBackbone,FullyConnectedHead,nn.Sequential |     0.22 |
|  1 | SimpleImputer,NoEncoder,Normalizer,TruncSVD                   | no embedding,MLPBackbone,FullyConnectedHead,nn.Sequential       |     0.18 |
|  2 | None                                                          | RFLearner                                                       |     0.18 |
|  3 | SimpleImputer,NoEncoder,NoScaler,KitchenSink                  | no embedding,MLPBackbone,FullyConnectedHead,nn.Sequential       |     0.12 |
|  4 | SimpleImputer,NoEncoder,StandardScaler,Nystroem               | no embedding,M

In [7]:
# Per-class precision / recall / F1 of the AutoPyTorch model on the
# validation split.
y_pred_val_automl = api.predict(X_val)
print(classification_report(y_val, y_pred_val_automl))

              precision    recall  f1-score   support

           0       0.79      0.88      0.83       192
           1       0.72      0.57      0.63       106

    accuracy                           0.77       298
   macro avg       0.75      0.72      0.73       298
weighted avg       0.76      0.77      0.76       298



In [8]:
# Predict labels for the preprocessed test set with the fitted AutoPyTorch task.
# NOTE(review): this variable is not used by the visible cells --
# save_predictions() calls predict() again on X_test.
y_pred_test_automl = api.predict(X_test)

In [9]:
# Write the AutoPyTorch test predictions, keyed by customer id, to a CSV file.
save_predictions(api, test_df["Customer"], X_test, "autopytorch.csv")

## Modelo no automatizado

In [10]:
from xgboost import XGBClassifier

# Fixed XGBoost configuration.
# NOTE(review): the notebook does not record how these values were chosen.
parameters = dict(
    alpha=0.6066724149261147,
    booster='gbtree',
    colsample_bytree=1.0,
    eval_metric='logloss',
    gamma=1.0,
    learning_rate=1.0,
    max_depth=4,
    n_estimators=500,
    objective='binary:logistic',
    subsample=1.0,
    use_label_encoder=False,
)

# Train on the preprocessed training split.
clf = XGBClassifier(**parameters)
clf.fit(X_train, y_train)

# Predict on the validation split (for the report) and on the test set.
y_pred_val = clf.predict(X_val)
y_pred_test = clf.predict(X_test)

# Per-class precision / recall / F1 on the validation split.
print(classification_report(y_val, y_pred_val))

              precision    recall  f1-score   support

           0       0.80      0.97      0.88       192
           1       0.92      0.57      0.70       106

    accuracy                           0.83       298
   macro avg       0.86      0.77      0.79       298
weighted avg       0.85      0.83      0.82       298



In [11]:
# Write the XGBoost test predictions, keyed by customer id, to a CSV file.
save_predictions(clf, test_df["Customer"], X_test, "xgb_no_automl.csv")

## Predicciones

![kaggle results](scores.png)