# Feature Engineering with Atom Package

Let's start by importing all the required packages.

In [None]:
#Import all libraries
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from raiwidgets import ErrorAnalysisDashboard
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
from src.model import select_features
from src.model import tune_parameters,show_model_results,get_monotone_constraints
from atom import ATOMClassifier

%matplotlib inline

### Read & Clean data

In [None]:
# Load the raw HELOC dataset from disk.
data = pd.read_csv('../data/heloc_dataset_v1.csv')

# Data cleaning based on error analysis: keep only the rows where both
# 'NumSatisfactoryTrades' and 'ExternalRiskEstimate' are non-negative.
data = data[
    (data['NumSatisfactoryTrades'] >= 0)
    & (data['ExternalRiskEstimate'] >= 0)
]

In [None]:
# Encode the target as binary: 1 when the label contains 'Bad', otherwise 0.
y = data['RiskPerformance'].apply(lambda label: int('Bad' in label))
print(f"Class balance :\n{y.value_counts(normalize=True)}")

# Every column except the target is used as a feature.
X = data.drop(columns='RiskPerformance')

# Stratified 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=345,
    stratify=y,
)

# Bundle the four pieces under the keys used by the rest of the notebook.
data_dict = {
    'xtrain': X_train,
    'ytrain': y_train,
    'xtest': X_test,
    'ytest': y_test,
}

# Report the absolute class counts of each split.
print(f"Class balance for Train data :\n{data_dict['ytrain'].value_counts(normalize=False)}")
print(f"Class balance for Test data :\n{data_dict['ytest'].value_counts(normalize=False)}")

### Feature creation with ATOM

In [None]:
# Load the training split into ATOM; test_size=0.2 is the fraction ATOM is
# asked to reserve as its own internal test set.
# random_state is pinned (to the same seed used for train_test_split in this
# notebook) so that ATOM's internal shuffling/splitting and any randomised
# steps run later on this instance give the same result after a
# Restart & Run All.
atom = ATOMClassifier(
    data_dict['xtrain'],
    data_dict['ytrain'],
    test_size=0.2,
    verbose=2,
    random_state=345,
)

Atom works with the concept of branches. Read [this story]() to learn more.
Let us add 20 more features using the DFS (Deep Feature Synthesis) strategy.

In [None]:
# Point ATOM at a branch named "dfs" so that the feature-generation step that
# follows is applied there. Presumably assigning an unused name creates the
# branch; confirm against ATOM's branch documentation.
atom.branch = "dfs"

In [None]:
# Ask ATOM for 20 new features using the "dfs" strategy, built from the
# add / mul / sub operators.
atom.feature_generation(
    operators=["add", "mul", "sub"],
    n_features=20,
    strategy="dfs",
)

Check if the features survive the feature selection methods.

In [None]:
# Transform the original splits with ATOM so that the result is in a format
# that can be used by the existing code.
X_train_new, y_train_new = atom.transform(
    data_dict['xtrain'],
    data_dict['ytrain'],
)
X_test_new, y_test_new = atom.transform(
    data_dict['xtest'],
    data_dict['ytest'],
)

# Collect the transformed splits under the same keys as the original dictionary.
dfs_data_dict = dict(
    xtrain=X_train_new,
    ytrain=y_train_new,
    xtest=X_test_new,
    ytest=y_test_new,
)


In [None]:
# Ask the project's feature-selection helper for 16 features from the
# DFS-augmented data; it also returns a plot object.
selected_features, fs_plot = select_features(
    data=dfs_data_dict,
    n_features=16,
)

In [None]:
# Restrict both feature matrices to the selected features.
dfs_data_dict.update(
    xtrain=dfs_data_dict['xtrain'][selected_features],
    xtest=dfs_data_dict['xtest'][selected_features],
)

In [None]:
# Tune parameters for an XGBoost classifier on the DFS feature set using the
# project's tuning helper.
model = XGBClassifier()
model_param = tune_parameters(data=dfs_data_dict, model=model)

# Add monotonic constraints to the tuned parameters.
model_param['monotone_constraints'] = get_monotone_constraints(
    data_dict=dfs_data_dict,
    target='RiskPerformance',
)

# model_param holds the classifier's parameters (not a feature list), so the
# log message is labelled accordingly.
print(f"Creating model with parameters : {model_param}")

# Build the final classifier from those parameters and report its results.
clf = XGBClassifier(**model_param)
model = show_model_results(data=dfs_data_dict, model=clf)

In [None]:
# Move to a branch for the GFG experiment. The name looks like ATOM's
# "<new>_from_<parent>" form, i.e. a branch "gfg" split off "master" rather
# than off the DFS branch; confirm against ATOM's branch documentation.
atom.branch = "gfg_from_master"

# Ask ATOM for 5 new features using the "GFG" strategy, built from the
# add / mul / sub operators.
atom.feature_generation(
    operators=["add", "mul", "sub"],
    n_features=5,
    strategy="GFG",
)

In [None]:
# Transform the original splits with ATOM so that the result is in a format
# that can be used by the existing code.
X_train_new, y_train_new = atom.transform(
    data_dict['xtrain'],
    data_dict['ytrain'],
)
X_test_new, y_test_new = atom.transform(
    data_dict['xtest'],
    data_dict['ytest'],
)

# Collect the transformed splits under the same keys as the original dictionary.
gfg_data_dict = dict(
    xtrain=X_train_new,
    ytrain=y_train_new,
    xtest=X_test_new,
    ytest=y_test_new,
)


In [None]:
# Ask the project's feature-selection helper for 13 features from the
# GFG-augmented data; it also returns a plot object.
selected_features, fs_plot = select_features(
    data=gfg_data_dict,
    n_features=13,
)

# Restrict both feature matrices to the selected features.
gfg_data_dict.update(
    xtrain=gfg_data_dict['xtrain'][selected_features],
    xtest=gfg_data_dict['xtest'][selected_features],
)

In [None]:
# Tune parameters for an XGBoost classifier on the GFG feature set using the
# project's tuning helper.
model = XGBClassifier()
model_param = tune_parameters(data=gfg_data_dict, model=model)

# Add monotonic constraints to the tuned parameters.
model_param['monotone_constraints'] = get_monotone_constraints(
    data_dict=gfg_data_dict,
    target='RiskPerformance',
)

# model_param holds the classifier's parameters (not a feature list), so the
# log message is labelled accordingly.
print(f"Creating model with parameters : {model_param}")

# Build the final classifier from those parameters and report its results.
clf = XGBClassifier(**model_param)
model = show_model_results(data=gfg_data_dict, model=clf)