In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [13]:
# Read the imputed dataset and keep only the rows that carry a label
df = pd.read_parquet('../data/processed/num_imputed_df.parquet').dropna(subset=['target'])
df.head()

Unnamed: 0,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,...,OCCUPATION_TYPE_Low-skill Laborers,OCCUPATION_TYPE_Managers,OCCUPATION_TYPE_Medicine staff,OCCUPATION_TYPE_Private service staff,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff,target
0,0.0,1.0,0.0,202500.0,406597.5,24700.5,351000.0,18.0,20.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,270000.0,1293502.5,35698.5,1129500.0,6.0,10.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,0.0,67500.0,135000.0,6750.0,135000.0,16.0,17.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,135000.0,312682.5,29686.5,297000.0,20.0,16.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,121500.0,513000.0,21865.5,513000.0,16.0,17.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# PCA

Creation of the training and test sets

In [None]:
# Drop the label before splitting: the scaler and PCA below are fit on x_train,
# so leaving 'target' in the frame would leak it into the principal components.
x_train, x_test = train_test_split(
    df.drop(columns='target'), test_size=0.2, random_state=42
)

Standardization

In [None]:
# Learn the scaling statistics from the training split only, then apply them to both splits
scaler = StandardScaler().fit(x_train)
x_train_scaled, x_test_scaled = (scaler.transform(split) for split in (x_train, x_test))

PCA

In [None]:
# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share of the variance (here 95%).
pca = PCA(n_components=0.95).fit(x_train_scaled)
pca.n_components_

In [None]:
# Explained variance
explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

# Use component counts (1..n) as x positions. Plotting against the default
# 0-based index would shift the curve one step left of what the
# "Number of Components" axis label says.
component_counts = np.arange(1, len(cumulative_variance) + 1)

# Plotting
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(component_counts, cumulative_variance, marker='o', linestyle='--')
ax.set_title('Cumulative Explained Variance by PCA Components')
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.grid(True)
plt.show()

In [None]:
# Name the components PC1..PCn, one per retained principal component
pca_columns = [f'PC{i+1}' for i in range(pca.n_components_)]

# Project both scaled splits onto the fitted components and wrap them in DataFrames
x_train_pca = pca.transform(x_train_scaled)
x_train_pca_df = pd.DataFrame(data=x_train_pca, columns=pca_columns)

x_test_pca = pca.transform(x_test_scaled)
x_test_pca_df = pd.DataFrame(data=x_test_pca, columns=pca_columns)

# Preview the first rows of each projected split
print(x_train_pca_df.head(), x_test_pca_df.head(), sep='\n')

# Feature selection

In [14]:
# Separate the label from the predictors, then hold out 20% of the rows for testing
y = df['target']
X = df.drop(columns='target')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)


In [15]:
# Sanity check: count missing labels in the target column
df['target'].isna().sum()

np.int64(0)

In [17]:
# Recursive feature elimination with an XGBoost estimator, down to 10 features
xgb = XGBClassifier(random_state=42, device="cuda", tree_method='hist')  # configured to use the GPU
RFE_selector = RFE(estimator=xgb, step=1, n_features_to_select=10).fit(X_train, y_train)

# Boolean mask over X_train's columns marking the features RFE kept
selected_features_mask = RFE_selector.get_support()
selected_features = X_train.columns[selected_features_mask]
selected_features

Index(['NAME_EDUCATION_TYPE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'ORGANIZATION_TYPE',
       'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_LAST_PHONE_CHANGE',
       'OCCUPATION_TYPE_Private service staff'],
      dtype='object')

In [39]:
# Sweep the number of RFE-selected features and record the weighted F1 on the
# held-out split for each count.
feature_counts = range(10, 70, 10)
min_features = min(feature_counts)
xgb_params = dict(tree_method='hist', device="cuda", random_state=42)  # configured to use the GPU

# With step=1 RFE removes one feature per round, so the elimination path down to
# `min_features` already passes through every larger count. Fit RFE once and
# read each subset off `ranking_` instead of re-running the elimination per k.
RFE_selector = RFE(XGBClassifier(**xgb_params), n_features_to_select=min_features, step=1)
RFE_selector.fit(X_train, y_train)

# Plain arrays, matching what RFE.transform would hand to the classifier.
X_train_values = np.asarray(X_train)
X_test_values = np.asarray(X_test)

rfe_score_list = []

for iterations, k in enumerate(feature_counts, start=1):
    # Kept features have rank 1, the last one eliminated has rank 2, and so on,
    # so the k surviving features are those with rank <= k - min_features + 1.
    top_k_mask = RFE_selector.ranking_ <= k - min_features + 1

    sel_x_train = X_train_values[:, top_k_mask]
    sel_x_test = X_test_values[:, top_k_mask]

    # Fresh classifier per feature count, trained only on the selected columns.
    xgb = XGBClassifier(**xgb_params)
    xgb.fit(sel_x_train, y_train)
    RFE_preds = xgb.predict(sel_x_test)

    # NOTE(review): the recorded run printed the same score for every k. Weighted F1
    # may be dominated by the majority class here -- TODO confirm the class balance
    # and consider also tracking a minority-class metric.
    # NOTE(review): choosing k from scores on X_test makes the test split part of
    # model selection; a separate validation split would avoid that.
    score = round(f1_score(y_test, RFE_preds, average="weighted"), 4)
    rfe_score_list.append(score)
    print(f'Iteration {iterations} with {k} features: {score}')

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Iteration 1 with 10 features: 0.8785
Iteration 2 with 20 features: 0.8785
Iteration 3 with 30 features: 0.8785
Iteration 4 with 40 features: 0.8785
Iteration 5 with 50 features: 0.8785
Iteration 6 with 60 features: 0.8785
