In [None]:
import pandas as pd
from bokeh.layouts import column
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.plotting import figure, show, output_notebook
from bokeh.transform import dodge
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

# Send Bokeh output to the notebook instead of a standalone HTML file.
output_notebook()

# Load data
df = pd.read_csv("data/merged_dataset.csv")

# The monetary columns are read as text containing commas (presumably
# thousands separators); drop the commas and cast each column to float.
for money_col in ('Deposits', 'Withdrawls', 'Balance'):
    df[money_col] = df[money_col].str.replace(',', '').astype(float)

# A customer is VIP when the sum of all their deposits exceeds 100000.
# The flag is computed once per customer and then joined back onto every
# row belonging to that customer.
vip_df = df.groupby('Customer Id', as_index=False)['Deposits'].sum()
vip_df['VIP'] = vip_df['Deposits'].gt(100000)
df = df.merge(vip_df[['Customer Id', 'VIP']], on='Customer Id')

# Turn the country names into integer codes so they can be used as a feature.
df['Country_Code'] = LabelEncoder().fit_transform(df['Country'])

# Predictors (one row per record in df) and the binary target (1 = VIP).
# NOTE(review): 'VIP' is derived from each customer's total 'Deposits', which
# is also a predictor, and the split below is row-wise, so rows of the same
# customer can end up in both train and test. Confirm this is intended.
features = ['Deposits', 'Withdrawls', 'Balance', 'Country_Code']
X = df.loc[:, features]
y = df['VIP'].astype(int)

# Model
# Hold out 30% of the rows for evaluation; fixed seeds keep runs reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
# A shallow tree (depth <= 4); fit() returns the estimator itself.
model = DecisionTreeClassifier(max_depth=4, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# ======================
# Chart 1: Confusion matrix
# ======================
# Pin labels=[0, 1] so the matrix is always 2x2, even if the test split or
# the predictions happen to contain only one class.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

# scikit-learn returns cm[i, j] = number of samples whose real class is i and
# whose predicted class is j. Each "Real ..." series is therefore a ROW of the
# matrix, and its two values are indexed along the x-axis by predicted class.
cm_data = {
    'labels': ['No VIP', 'VIP'],
    'Real No VIP': cm[0, :],
    'Real VIP': cm[1, :],
}
source_cm = ColumnDataSource(data=cm_data)

p1 = figure(x_range=cm_data['labels'], title="Matriz de Confusión", height=350)

# Both series share the same categorical x positions; dodge them to either
# side of the tick so the second set of bars is not painted over the first.
p1.vbar(
    x=dodge('labels', -0.2, range=p1.x_range),
    top='Real No VIP',
    width=0.35,
    source=source_cm,
    color="#718dbf",
    legend_label="No VIP",
)
p1.vbar(
    x=dodge('labels', 0.2, range=p1.x_range),
    top='Real VIP',
    width=0.35,
    source=source_cm,
    color="#e84d60",
    legend_label="VIP",
)

p1.y_range.start = 0
# The legend identifies the real class, so the x-axis is the predicted class.
p1.xaxis.axis_label = "Clase Predicha"
p1.legend.location = "top_left"
p1.legend.title = "Clase Real"
p1.xgrid.grid_line_color = None

# ======================
# Chart 2: Feature importance (excluding 'Deposits')
# ======================
importances = model.feature_importances_

# Pair each feature with its importance, leave out 'Deposits' (presumably
# because the VIP label is computed from it), and order the pairs by
# ascending importance.
feature_importance_pairs = sorted(
    (
        (name, weight)
        for name, weight in zip(features, importances)
        if name != 'Deposits'
    ),
    key=lambda pair: pair[1],
)

features_sorted = [name for name, _ in feature_importance_pairs]
importances_sorted = [weight for _, weight in feature_importance_pairs]

source_imp = ColumnDataSource(
    data={
        'features': features_sorted,
        'importance': importances_sorted,
    }
)

# Horizontal bars, one per remaining feature, in the sorted order.
p2 = figure(
    y_range=features_sorted,
    height=350,
    title="Importancia de Variables (sin Deposits)",
)
p2.hbar(
    y='features',
    right='importance',
    height=0.4,
    source=source_imp,
    color="green",
)

p2.x_range.start = 0

# Show the importance with three decimals when hovering over a bar.
importance_hover = HoverTool(tooltips=[("Importancia", "@importance{0.000}")])
p2.add_tools(importance_hover)

# Render both charts stacked vertically.
show(column(p1, p2))

