In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report,confusion_matrix
from catboost import CatBoostClassifier, Pool




In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
df = pd.read_csv('../data_preprocss/data_onevisa_postprocess_v2.csv')#.set_index('sample_id')
df.shape

  df = pd.read_csv('../data_preprocss/data_onevisa_postprocess_v2.csv')#.set_index('sample_id')


(5340, 538)

In [122]:
target_col='type_case'

X = df.drop(columns=[target_col]+['sample_id'])
y = df[target_col].values


In [123]:
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

In [None]:
missing_cat_placeholder = 'nan'

for c in cat_cols:
    X[c] = X[c].astype(str).fillna(missing_cat_placeholder)

X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )

train_pool = Pool(data=X_train, label=y_train, cat_features=cat_cols)
test_pool = Pool(data=X_test, label=y_test, cat_features=cat_cols)

model = CatBoostClassifier(
        iterations=300,
        learning_rate=0.01,
        depth=6,
        eval_metric='Recall',
        random_seed=42,
        verbose=True
    )
    
# Fit the model
model.fit(train_pool, eval_set=test_pool)
    
# Predictions
y_pred = model.predict(test_pool)
y_pred_prob = model.predict_proba(test_pool)

    
# Compute metrics
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, pos_label='granted', zero_division=0)
rec = recall_score(y_test, y_pred, pos_label='granted', zero_division=0)
    
print(f"Accuracy:  {acc:.3f}")
print(f"Precision: {prec:.3f}")
print(f"Recall:    {rec:.3f}")
print("\nFull classification report:\n")
print(classification_report(y_test, y_pred))

0:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 109ms	remaining: 32.7s
1:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 1.87s	remaining: 4m 38s
2:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 2.07s	remaining: 3m 24s
3:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 3s	remaining: 3m 42s
4:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 5.29s	remaining: 5m 11s
5:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 8.32s	remaining: 6m 47s
6:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 9.57s	remaining: 6m 40s
7:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 13.5s	remaining: 8m 10s
8:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 13.6s	remaining: 7m 18s
9:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 17.4s	remaining: 8m 23s
10:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 18.7s	remaining: 8m 11s
11:	learn: 0.0000000	test: 0.0000000	best: 0.0

In [126]:
len(y_test)

1068

In [127]:
len(y_pred_prob[:,1])

1068

In [128]:
df_pred = pd.DataFrame()
df_pred['preds'] = y_pred
df_pred['preds_prob1'] = y_pred_prob[:,1]
df_pred['preds_prob0'] = y_pred_prob[:,0]


df_pred['gt_values'] = y_test

In [129]:
df_pred.gt_values.value_counts()

gt_values
granted    1004
refused      64
Name: count, dtype: int64

In [130]:
print('Refused',df_pred[df_pred.gt_values=='refused'].preds_prob1.median())
print('Granted',df_pred[df_pred.gt_values=='refused'].preds_prob0.median())

Refused 0.33615405174251867
Granted 0.6638459482574813


In [131]:
print('Refused',df_pred[df_pred.gt_values=='granted'].preds_prob1.median())
print('Granted',df_pred[df_pred.gt_values=='granted'].preds_prob0.median())

Refused 0.03755523798849743
Granted 0.9624447620115025


In [140]:
pred_truc = np.where(y_pred_prob[:,1] > 0.3, 'refused', 'granted')
#pred_truc = np.where(y_pred_prob[:,0] > 0.9, 'granted', 'refused')



In [141]:
acc = accuracy_score(y_test, pred_truc)
prec = precision_score(y_test, pred_truc, pos_label='granted', zero_division=0)
rec = recall_score(y_test, pred_truc, pos_label='granted', zero_division=0)
    
print(f"Accuracy:  {acc:.3f}")
print(f"Precision: {prec:.3f}")
print(f"Recall:    {rec:.3f}")
print("\nFull classification report:\n")
print(classification_report(y_test, pred_truc))


Accuracy:  0.958
Precision: 0.971
Recall:    0.984

Full classification report:

              precision    recall  f1-score   support

     granted       0.97      0.98      0.98      1004
     refused       0.69      0.55      0.61        64

    accuracy                           0.96      1068
   macro avg       0.83      0.77      0.79      1068
weighted avg       0.95      0.96      0.96      1068



In [142]:
labels = ['granted', 'refused']  # adjust order if needed

# 1) compute raw matrix
cm = confusion_matrix(y_test, pred_truc, labels=labels)

# 2) wrap in a DataFrame for clarity
cm_df = pd.DataFrame(
    cm,
    index=[f"actual_{lab}"   for lab in labels],
    columns=[f"predicted_{lab}" for lab in labels]
)


In [143]:
cm_df

Unnamed: 0,predicted_granted,predicted_refused
actual_granted,988,16
actual_refused,29,35


In [4]:
%pwd

'/Users/albafranco/Documents/Python/visa-scoring/notebooks'

In [4]:
%cd ..

/Users/albafranco/Documents/Python/visa-scoring


In [5]:
from src.model_training import train_model

In [6]:
def print_columns_with_mixed_dtypes(df: pd.DataFrame):
    """
    Prints columns in the DataFrame that contain mixed data types (e.g., strings and numbers).
    """
    mixed_cols = []

    for col in df.columns:
        types_in_col = set(type(val) for val in df[col].dropna())
        if len(types_in_col) > 1:
            mixed_cols.append((col, types_in_col))

    if mixed_cols:
        print("Columns with mixed data types:")
        for col, types_found in mixed_cols:
            print(f"  - {col}: {types_found}")
    else:
        print("No columns with mixed data types found.")


In [7]:
print_columns_with_mixed_dtypes(df)

Columns with mixed data types:
  - postal_code_cd_raiucor: {<class 'str'>, <class 'float'>}
  - business_phone_cd_ctniucor: {<class 'str'>, <class 'float'>}
  - home_phone_cd_ctniucor: {<class 'str'>, <class 'float'>}


In [8]:
postal_code = [i for i in df.columns if 'postal' in i]
mobile_phone = [i for i in df.columns if 'phone' in i]
df = df.drop(columns = postal_code + mobile_phone)

In [9]:
print_columns_with_mixed_dtypes(df)

No columns with mixed data types found.


In [10]:
df.shape

(5340, 522)

In [11]:
model_xgb, df_pred_xgb = train_model(df, model_type ='xgboost',
                                     target_col='type_case', threshold=0.3)


INFO:root:Preparing data...
INFO:root:Encoding categorical columns with OrdinalEncoder for XGBoost...
INFO:root:Splitting data into train/test sets...
INFO:root:Training XGBoostClassifier...


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1], got ['granted' 'refused']