In [88]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats
from sklearn.metrics import classification_report, confusion_matrix
import plotly.io as pio
from scipy import stats
from scipy.stats import chi2_contingency, normaltest, mannwhitneyu

# pandas: show every column and do not wrap wide frames when printing.
pd.set_option('display.width', None)
pd.set_option('display.max_columns', None)

# matplotlib / seaborn: apply the base style first, then the colour palette.
plt.style.use('seaborn-v0_8-dark')
sns.set_palette('husl')

# plotly: open figures in the system web browser by default.
pio.renderers.default = "browser"


----- CARGA DE DATOS

In [16]:
# Load the loan data from the 'Hoja2' sheet of the Excel workbook.
df_bank_loan = pd.read_excel('Bankloan.xlsx', sheet_name='Hoja2')

# Overview, part 1: summary statistics and column names.
print(f'''
      Dataframe de Préstamo Bancario:\n\n
      Estadística Básica:\n {df_bank_loan.describe()}\n
      Nombre de las columnas:\n {df_bank_loan.columns}\n
      Información general del dataframe:\n''')

# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called as its own statement. Interpolating the bare `.info` attribute
# into the f-string only printed the bound-method repr, not the report.
df_bank_loan.info()

# Overview, part 2: sample rows and overall shape.
print(f'''
      Primera filas:\n {df_bank_loan.head(10)}\n
      Últimas filas:\n {df_bank_loan.tail(10)}\n
      Tamaño del Dataframe: {df_bank_loan.shape}

      ''')


      Dataframe de Préstamo Bancario:


      Estadística Básica:
               age          ed      employ     address        income  \
count  681.000000  680.000000  700.000000  700.000000  6.630000e+02   
mean    34.898678    1.717647    8.388571    8.268571  4.574359e+07   
std      8.861849    0.925652    6.658039    6.821609  3.744108e+07   
min     20.000000    1.000000    0.000000    0.000000  1.400000e+07   
25%     28.000000    1.000000    3.000000    3.000000  2.400000e+07   
50%     34.000000    1.000000    7.000000    7.000000  3.400000e+07   
75%     40.000000    2.000000   12.000000   12.000000  5.450000e+07   
max    136.000000    5.000000   31.000000   34.000000  4.460000e+08   

            debtinc      creddebt       othdebt  
count  7.000000e+02  7.000000e+02  7.000000e+02  
mean   1.026057e+07  1.553553e+06  3.058209e+06  
std    6.827234e+06  2.117197e+06  3.287555e+06  
min    4.000000e+05  1.169600e+04  4.558400e+04  
25%    5.000000e+06  3.690592e+05  1.04417

In [27]:
# Missing-value check: count nulls per column and report only the columns
# that actually contain some.
df_nulls = df_bank_loan.isna().sum()
print('Total de valores nulos:\n {}'.format(df_nulls.loc[df_nulls.gt(0)]))

Total de valores nulos:
 age       19
ed        20
income    37
dtype: int64


----- TRATAMIENTO DE VALORES NULOS

In [44]:
# valores nulos y limpieza
def clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing values in the numeric and object-dtype columns of ``df``.

    Numeric columns are filled with their median; object-dtype columns are
    filled with their most frequent value (the first one returned by
    ``mode()`` when several values tie).

    Args:
        df: DataFrame to clean. Its columns are reassigned in place, so pass
            a copy when the original data must be preserved.

    Returns:
        The same DataFrame object that was passed in, with the gaps filled.
        Object columns whose values are all missing are left unchanged.
    """
    # Select columns from the frame being cleaned rather than from a notebook
    # global, so the function works for any DataFrame and on a fresh kernel.
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    for col in numeric_columns:
        df[col] = df[col].fillna(df[col].median())

    categorical_columns = df.select_dtypes(include=['object']).columns
    for col in categorical_columns:
        modes = df[col].mode()
        # mode() is empty when every value is missing: nothing to fill with.
        if modes.empty:
            continue
        df[col] = df[col].fillna(modes.iloc[0])

    return df

# Clean a copy so the frame loaded from Excel keeps its original values.
df_clean = clean_dataset(df=df_bank_loan.copy())
print('Dataframe sin valores nulos:\n {}'.format(df_clean.head(10)))


Dataframe sin valores nulos:
     age   ed  employ  address       income   debtinc  creddebt   othdebt  \
0  41.0  3.0      17       12  176000000.0   9300000  11359392   5008608   
1  27.0  1.0      10        6   31000000.0  17300000   1362202   4000798   
2  40.0  1.0      15        7   34000000.0   5500000    856075   2168925   
3  41.0  1.0      15       14  120000000.0   2900000   2658720    821280   
4  24.0  2.0       2        0   28000000.0  17300000   1787436   3056564   
5  41.0  2.0       5        5   25000000.0  10200000    392700   2157300   
6  39.0  1.0      20        9   34000000.0  30600000   3833874  16668126   
7  34.0  1.0      12       11   38000000.0   3600000    128592   1239408   
8  24.0  1.0       3        4   19000000.0  24400000   1358348   3277652   
9  36.0  1.0       0       13   25000000.0  19700000   2777700   2147300   

  default  
0       1  
1       0  
2       0  
3       0  
4       1  
5       0  
6       0  
7       0  
8       1  
9       0  


----- ANÁLISIS UNIVARIANTE

In [60]:
def univariante_analysis(df: pd.DataFrame, renderer: str = 'browser') -> list:
    """Plot a histogram and a box plot for every numeric column of ``df``.

    One two-panel Plotly figure is built and shown per numeric column:
    the histogram on the left and the box plot on the right.

    Args:
        df: DataFrame whose numeric columns are plotted.
        renderer: Plotly renderer name forwarded to ``fig.show``. Defaults to
            'browser', matching the previous hard-coded behaviour.

    Returns:
        The list of figures created, in column order (empty when ``df`` has
        no numeric columns), so callers can re-display or export them.
    """
    figures = []
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        fig = make_subplots(rows=1, cols=2, subplot_titles=['Histograma', 'Box Plot'])

        # Left panel: distribution of the column.
        fig.add_trace(
            go.Histogram(x=df[col], name='Histograma'),
            row=1, col=1
        )

        # Right panel: spread and outliers of the column.
        fig.add_trace(
            go.Box(y=df[col], name='Box Plot'),
            row=1, col=2
        )

        # Axis labels for each panel.
        fig.update_xaxes(title_text=col, row=1, col=1)
        fig.update_yaxes(title_text="Frecuencia", row=1, col=1)
        fig.update_yaxes(title_text=col, row=1, col=2)

        # Figure title.
        fig.update_layout(title=f'Distribución de {col}')
        fig.show(renderer=renderer)
        figures.append(fig)

    return figures
        
# Run the univariate analysis on the cleaned dataset.
graphics_univariate = univariante_analysis(df=df_clean)

----- ANÁLISIS BIVARIANTE