In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report,confusion_matrix
from catboost import CatBoostClassifier, Pool




In [None]:
# Notebook display settings: lift pandas' truncation limits so that frames
# are rendered with all of their columns and rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [None]:
# Load the post-processed dataset. `sample_id` is kept as a regular column
# (it is not promoted to the index) and the frame's shape is shown as a
# quick sanity check.
df = pd.read_csv(
    filepath_or_buffer='../data_preprocss/data_onevisa_postprocess_v2.csv',
)
df.shape

In [None]:
# Name of the column holding the class label to predict.
target_col = 'type_case'

# Feature matrix: everything except the label and the `sample_id` column.
X = df.drop([target_col, 'sample_id'], axis=1)
# Labels as the column's underlying array.
y = df[target_col].values


In [None]:
# Names of the feature columns stored as object or category dtype; these are
# the columns passed to CatBoost as categorical features.
cat_cols = list(X.select_dtypes(include=['object', 'category']).columns)

In [None]:
# Placeholder written into categorical columns wherever a value is missing.
missing_cat_placeholder = 'nan'

# Categorical features are handed to CatBoost as strings without missing values.
# Fill BEFORE casting: `astype(str)` already turns NaN into the literal 'nan',
# so a `fillna` placed after it never fires and the placeholder is ignored.
# The intermediate cast to object lets the placeholder be inserted into
# category-dtype columns whose categories do not contain it.
for c in cat_cols:
    X[c] = X[c].astype(object).fillna(missing_cat_placeholder).astype(str)

# Stratified 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

train_pool = Pool(data=X_train, label=y_train, cat_features=cat_cols)
test_pool = Pool(data=X_test, label=y_test, cat_features=cat_cols)

model = CatBoostClassifier(
    iterations=300,
    learning_rate=0.01,
    depth=6,
    eval_metric='Recall',
    random_seed=42,
    verbose=True
)

# Fit the model.
# The test pool is passed as eval_set only to monitor the metric per iteration.
# `use_best_model=False` keeps CatBoost from shrinking the model to the
# iteration that scores best on that pool (its default when an eval_set is
# given), which would leak the test set into model selection and make the
# metrics reported below optimistic.
model.fit(train_pool, eval_set=test_pool, use_best_model=False)

# Predictions: hard labels and per-class probabilities for the test rows.
y_pred = model.predict(test_pool)
y_pred_prob = model.predict_proba(test_pool)

# Compute metrics; precision and recall treat 'granted' as the positive class.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, pos_label='granted', zero_division=0)
rec = recall_score(y_test, y_pred, pos_label='granted', zero_division=0)

print(f"Accuracy:  {acc:.3f}")
print(f"Precision: {prec:.3f}")
print(f"Recall:    {rec:.3f}")
print("\nFull classification report:\n")
print(classification_report(y_test, y_pred))

In [None]:
# Number of rows in the hold-out set.
y_test.shape[0]

In [None]:
# Number of predicted probabilities in column 1; should match the size of y_test.
y_pred_prob[:, 1].shape[0]

In [None]:
# Collect the predictions next to the ground truth, one row per test sample:
# hard label, probability columns 1 and 0 of predict_proba, and the true label.
df_pred = pd.DataFrame({
    'preds': y_pred,
    'preds_prob1': y_pred_prob[:, 1],
    'preds_prob0': y_pred_prob[:, 0],
    'gt_values': y_test,
})

In [None]:
# Class balance of the ground-truth labels in the test set.
df_pred['gt_values'].value_counts()

In [None]:
# Median predicted probabilities over the test rows whose true label is 'refused'.
# NOTE(review): the printed names assume probability column 1 is 'refused' and
# column 0 is 'granted' — confirm against model.classes_.
refused_rows = df_pred.loc[df_pred['gt_values'] == 'refused']
print('Refused', refused_rows['preds_prob1'].median())
print('Granted', refused_rows['preds_prob0'].median())

In [None]:
# Median predicted probabilities over the test rows whose true label is 'granted'.
# NOTE(review): the printed names assume probability column 1 is 'refused' and
# column 0 is 'granted' — confirm against model.classes_.
granted_rows = df_pred.loc[df_pred['gt_values'] == 'granted']
print('Refused', granted_rows['preds_prob1'].median())
print('Granted', granted_rows['preds_prob0'].median())

In [None]:
# Re-label the test rows with a custom cut-off instead of the model's own
# decision: a row is called 'refused' as soon as probability column 1 exceeds
# the threshold, otherwise 'granted'.
# NOTE(review): assumes column 1 of predict_proba is the 'refused' class —
# confirm against model.classes_.
refused_threshold = 0.3
pred_truc = np.where(y_pred_prob[:, 1] > refused_threshold, 'refused', 'granted')



In [None]:
# Metrics for the thresholded predictions; precision and recall treat
# 'granted' as the positive class.
acc = accuracy_score(y_true=y_test, y_pred=pred_truc)
prec = precision_score(y_true=y_test, y_pred=pred_truc, pos_label='granted', zero_division=0)
rec = recall_score(y_true=y_test, y_pred=pred_truc, pos_label='granted', zero_division=0)

print(
    f"Accuracy:  {acc:.3f}",
    f"Precision: {prec:.3f}",
    f"Recall:    {rec:.3f}",
    "\nFull classification report:\n",
    classification_report(y_test, pred_truc),
    sep="\n",
)


In [None]:
# Fixed class order used for both axes of the matrix (adjust if needed).
labels = ['granted', 'refused']

# Raw confusion counts for the thresholded predictions.
cm = confusion_matrix(y_test, pred_truc, labels=labels)

# Wrap the counts in a labelled frame: rows are actual classes, columns are
# predicted classes.
cm_df = pd.DataFrame(
    data=cm,
    index=['actual_' + name for name in labels],
    columns=['predicted_' + name for name in labels],
)


In [None]:
# Display the labelled confusion matrix (rows: actual, columns: predicted).
cm_df

In [None]:
# Debug aid: show the kernel's current working directory.
%pwd

In [None]:
# Move the kernel's working directory up one level.
# NOTE(review): not idempotent — every re-run of this cell climbs another
# level, and relative paths used elsewhere in the notebook resolve differently
# once it has run.
%cd ..

In [None]:
# Project-local training helper.
# NOTE(review): presumably importable only when the working directory contains
# the `src` package — confirm whether this relies on the directory change made
# earlier in the notebook.
from src.model_training import train_model

In [None]:
def print_columns_with_mixed_dtypes(df: pd.DataFrame):
    """
    Print every column of ``df`` whose non-null values have more than one Python type.

    A column counts as "mixed" when, after dropping missing values,
    ``type(value)`` is not the same for all remaining values (for example
    ``str`` and ``int`` in the same column).

    Columns are visited one by one through ``DataFrame.items()``, so a frame
    with duplicated column labels is still inspected column by column
    (label-based ``df[col]`` would return a whole sub-frame for such labels
    and the values would never be looked at).

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    None
        Findings are written to stdout only.
    """
    mixed_cols = []

    for col, values in df.items():
        # Missing values are ignored so that NaN/None alone never makes a
        # column look mixed.
        types_in_col = {type(val) for val in values.dropna()}
        if len(types_in_col) > 1:
            mixed_cols.append((col, types_in_col))

    if not mixed_cols:
        print("No columns with mixed data types found.")
        return

    print("Columns with mixed data types:")
    for col, types_found in mixed_cols:
        print(f"  - {col}: {types_found}")


In [None]:
# Report which columns of the loaded frame hold values of more than one Python type.
print_columns_with_mixed_dtypes(df)

In [None]:
# Columns whose name mentions a postal code or a phone number.
postal_code = [col for col in df.columns if 'postal' in col]
mobile_phone = [col for col in df.columns if 'phone' in col]

# Remove both groups from the frame in a single drop.
df = df.drop(columns=[*postal_code, *mobile_phone])

In [None]:
# Run the mixed-type report again, now on the frame without the postal/phone columns.
print_columns_with_mixed_dtypes(df)

In [None]:
# Shape of the frame at this point, as a quick sanity check.
df.shape

In [None]:
# Train through the project helper with the XGBoost backend; it returns the
# fitted model together with a frame of predictions.
# NOTE(review): `threshold` presumably plays the same role as the 0.3
# probability cut-off applied manually above — confirm in src.model_training.
model_xgb, df_pred_xgb = train_model(
    df,
    model_type='xgboost',
    target_col='type_case',
    threshold=0.3,
)
