In [64]:
import re

import numpy as np
import pandas as pd

class DataFrameAnalyzer:
    """
    Exploratory profiling helper for a pandas DataFrame.

    Wraps a DataFrame and exposes three reports: a per-column overview
    (`resumen`), descriptive statistics for numeric columns
    (`describe_numeric`) and frequency statistics for categorical columns
    (`describe_categorical`). None of the reports modifies the wrapped frame.
    """

    # Column order of the frame returned by `resumen`.
    _SUMMARY_COLUMNS = (
        "Columna", "Tipo de Dato", "Cardinalidad", "% Cardinalidad",
        "Valores Faltantes", "% Valores Faltantes", "Categoría",
    )
    # Column order of the frame returned by `describe_numeric`.
    _NUMERIC_STATS = (
        'count', 'mean', 'median', 'mode', 'std_dev', 'min',
        '25%', '50%', '75%', 'max', 'skewness', 'kurtosis',
    )
    # Column order of the frame returned by `describe_categorical`.
    _CATEGORICAL_STATS = ("unique_values", "most_frequent", "frequency", "proportion")

    # Words that mark a column as an identifier when they appear as a whole
    # word of its label ("Order_ID", "customerId", "rowid"), as opposed to a
    # substring of another word ("Paid", "Width", "Valid").
    _ID_TOKENS = frozenset({"id", "rowid", "uuid", "guid"})
    # Splits a label into words on separators and camelCase boundaries:
    # acronym runs ("ID"), capitalised/lowercase words ("Order"), digit runs.
    _LABEL_WORD_RE = re.compile(r"[A-Z]+(?![a-z])|[A-Z]?[a-z]+|[0-9]+")

    def __init__(self, dataframe: pd.DataFrame):
        """
        Store the DataFrame to analyse.

        Raises:
            TypeError: if `dataframe` is not a pandas DataFrame.
        """
        if not isinstance(dataframe, pd.DataFrame):
            raise TypeError("El argumento debe ser un DataFrame de pandas.")
        self.df = dataframe

    @classmethod
    def _looks_like_identifier(cls, label) -> bool:
        """Return True when one of the words of `label` is an identifier marker."""
        # str() keeps non-string labels (e.g. integer column names) from crashing.
        words = {word.lower() for word in cls._LABEL_WORD_RE.findall(str(label))}
        return not words.isdisjoint(cls._ID_TOKENS)

    @staticmethod
    def _is_categorical_like(dtype) -> bool:
        """Return True for object, string and categorical dtypes."""
        return (
            isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )

    @staticmethod
    def _percentage(part, total) -> float:
        """Return `part` as a percentage of `total` (2 decimals); 0.0 when `total` is 0."""
        if total == 0:
            return 0.0
        return round((part / total) * 100, 2)

    @staticmethod
    def _first_mode(series: pd.Series):
        """Return the first mode of `series` (as ordered by `Series.mode`), or NaN if it has no non-null value."""
        modes = series.mode()
        return modes.iloc[0] if len(modes) else np.nan

    def _categorize(self, label, dtype, cardinality: int) -> str:
        """Map a column to one of the category labels used by `resumen`."""
        # Identifier-like names win over any dtype-based classification.
        if self._looks_like_identifier(label):
            return "Índice Numérico"
        # Booleans are two-valued by type, even if only one value is present.
        if pd.api.types.is_bool_dtype(dtype):
            return "Binaria"
        if pd.api.types.is_numeric_dtype(dtype):
            if cardinality == 2:
                return "Binaria"
            # is_integer_dtype also understands nullable dtypes such as Int64,
            # on which np.issubdtype raises TypeError.
            if pd.api.types.is_integer_dtype(dtype):
                return "Numérica Discreta"
            return "Numérica Continua"
        if self._is_categorical_like(dtype):
            return "Binaria" if cardinality == 2 else "Categórica Nominal"
        return "Otro"

    def resumen(self) -> pd.DataFrame:
        """
        Return a per-column overview of the dataset, one row per column:
        - Tipo de Dato: dtype of the column
        - Cardinalidad / % Cardinalidad: distinct non-null values, absolute
          and as a percentage of the row count
        - Valores Faltantes / % Valores Faltantes: null values, absolute and
          as a percentage of the row count
        - Categoría: inferred kind of variable

        Percentages are reported as 0.0 when the frame has no rows.
        """
        total_rows = len(self.df)
        rows = []

        for label in self.df.columns:
            column = self.df[label]
            cardinality = int(column.nunique())
            missing = int(column.isnull().sum())

            rows.append({
                "Columna": label,
                "Tipo de Dato": column.dtype,
                "Cardinalidad": cardinality,
                "% Cardinalidad": self._percentage(cardinality, total_rows),
                "Valores Faltantes": missing,
                "% Valores Faltantes": self._percentage(missing, total_rows),
                "Categoría": self._categorize(label, column.dtype, cardinality),
            })

        # Passing `columns` keeps the schema stable even when there are no rows.
        return pd.DataFrame(rows, columns=list(self._SUMMARY_COLUMNS))

    def describe_numeric(self) -> pd.DataFrame:
        """
        Return detailed statistics for the numeric columns, one row per column:
        - Mean, median, mode
        - Standard deviation
        - Quartiles
        - Skewness and kurtosis

        Returns an empty frame with the same columns when there are no
        numeric columns; a column without non-null values gets NaN as mode.
        """
        numeric_df = self.df.select_dtypes(include=['number'])

        # describe() raises on a frame without columns, so short-circuit.
        if numeric_df.shape[1] == 0:
            return pd.DataFrame(columns=list(self._NUMERIC_STATS), dtype=float)

        # describe() already provides count, mean, min, quartiles and max.
        stats = numeric_df.describe().T
        stats['median'] = numeric_df.median()
        # Per-column lookup instead of DataFrame.mode().iloc[0], which raises
        # IndexError when no column has a non-null value.
        stats['mode'] = pd.Series(
            {label: self._first_mode(numeric_df[label]) for label in numeric_df.columns}
        )
        stats['std_dev'] = numeric_df.std()
        stats['skewness'] = numeric_df.skew()
        stats['kurtosis'] = numeric_df.kurt()

        return stats[list(self._NUMERIC_STATS)]

    def describe_categorical(self) -> pd.DataFrame:
        """
        Return frequency statistics for the categorical columns (object,
        string and category dtypes), one row per column:
        - unique_values: number of distinct non-null values
        - most_frequent: the most frequent value
        - frequency: how many times it occurs
        - proportion: its share of the non-null values, in percent

        Returns an empty frame with the same columns when there are no
        categorical columns; a column without non-null values is reported
        with frequency 0 and NaN for `most_frequent` and `proportion`.
        """
        labels = [
            label for label in self.df.columns
            if self._is_categorical_like(self.df[label].dtype)
        ]

        rows = []
        for label in labels:
            column = self.df[label]
            counts = column.value_counts()  # nulls excluded, most frequent first
            modes = column.mode()

            if len(modes) == 0:
                # Nothing but nulls: there is no "most frequent" value.
                top_value, frequency, proportion = np.nan, 0, np.nan
            else:
                top_value = modes.iloc[0]
                frequency = int(counts.iloc[0])
                proportion = float(np.round((counts.iloc[0] / counts.sum()) * 100, 2))

            rows.append({
                "unique_values": int(column.nunique()),
                "most_frequent": top_value,
                "frequency": frequency,
                "proportion": proportion,
            })

        return pd.DataFrame(rows, index=labels, columns=list(self._CATEGORICAL_STATS))


In [118]:

# No import needed: DataFrameAnalyzer is a class defined earlier in this notebook,
# not a module on the import path, so `import DataFrameAnalyzer` can only fail.

In [65]:
# Load the sales data; the path is relative to the notebook's working directory.
tienda = pd.read_csv("merch_sales.csv")

In [66]:
# Number of rows and columns in the loaded frame.
tienda.shape

(7394, 15)

In [67]:
# Column names, non-null counts and dtypes of the loaded frame.
tienda.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7394 entries, 0 to 7393
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Order ID                7394 non-null   int64 
 1   Order Date              7394 non-null   object
 2   Product ID              7394 non-null   object
 3   Product Category        7394 non-null   object
 4   Buyer Gender            7394 non-null   object
 5   Buyer Age               7394 non-null   int64 
 6   Order Location          7394 non-null   object
 7   International Shipping  7394 non-null   object
 8   Sales Price             7394 non-null   int64 
 9   Shipping Charges        7394 non-null   int64 
 10  Sales per Unit          7394 non-null   int64 
 11  Quantity                7394 non-null   int64 
 12  Total Sales             7394 non-null   int64 
 13  Rating                  7394 non-null   int64 
 14  Review                  7394 non-null   object
dtypes: i

In [104]:
# Replace the spaces in the column names with "_" so the names are easier to
# reference. rename() with a callable returns a new frame, which keeps this cell
# idempotent and avoids building the old-name -> new-name mapping by hand.
tienda = tienda.rename(columns=lambda name: name.replace(" ", "_"))

In [105]:
# Convert the Order_Date column (loaded as plain objects) to datetime values.
tienda["Order_Date"] = tienda["Order_Date"].pipe(pd.to_datetime)

In [106]:
# Wrap the cleaned sales frame in the analyzer class defined earlier in this notebook.
analizar = DataFrameAnalyzer(dataframe=tienda)

In [107]:
# Per-column overview: dtype, cardinality, missing values and inferred category.
analizar.resumen()

  elif pd.api.types.is_object_dtype(self.df[col]) or pd.api.types.is_categorical_dtype(self.df[col]):


Unnamed: 0,Columna,Tipo de Dato,Cardinalidad,% Cardinalidad,Valores Faltantes,% Valores Faltantes,Categoría
0,Order_ID,int64,7394,100.0,0,0.0,Índice Numérico
1,Order_Date,datetime64[ns],366,4.95,0,0.0,Otro
2,Product_ID,object,13,0.18,0,0.0,Índice Numérico
3,Product_Category,object,3,0.04,0,0.0,Categórica Nominal
4,Buyer_Gender,object,2,0.03,0,0.0,Binaria
5,Buyer_Age,int64,18,0.24,0,0.0,Numérica Discreta
6,Order_Location,object,25,0.34,0,0.0,Categórica Nominal
7,International_Shipping,object,2,0.03,0,0.0,Binaria
8,Sales_Price,int64,13,0.18,0,0.0,Numérica Discreta
9,Shipping_Charges,int64,6,0.08,0,0.0,Numérica Discreta


In [108]:
# Most frequent value, its count and its share for each categorical column.
analizar.describe_categorical()

Unnamed: 0,unique_values,most_frequent,frequency,proportion
Product_ID,13,BF1548,1497,20.25
Product_Category,3,Clothing,3704,50.09
Buyer_Gender,2,Male,5188,70.16
Order_Location,25,New Jersey,434,5.87
International_Shipping,2,No,5139,69.5
Review,29,Lack of delivery delays is greatly appreciated.,466,6.3


In [109]:
# Central tendency, spread, quartiles, skewness and kurtosis for each numeric column.
analizar.describe_numeric()

Unnamed: 0,count,mean,median,mode,std_dev,min,25%,50%,75%,max,skewness,kurtosis
Order_ID,7394.0,159735.134028,159577.5,120005.0,23040.247194,120005.0,139895.75,159577.5,179869.75,199994.0,0.011485,-1.185821
Buyer_Age,7394.0,26.455504,26.0,34.0,5.208202,18.0,22.0,26.0,31.0,35.0,0.010084,-1.214206
Sales_Price,7394.0,55.165404,65.0,65.0,39.575614,9.0,15.0,65.0,97.0,130.0,0.338032,-1.250453
Shipping_Charges,7394.0,14.633487,0.0,0.0,24.815719,0.0,0.0,0.0,40.0,100.0,1.652604,2.051472
Sales_per_Unit,7394.0,69.798891,65.0,65.0,46.702326,9.0,20.0,65.0,100.0,230.0,0.44914,-0.516342
Quantity,7394.0,1.668109,1.0,1.0,1.081398,1.0,1.0,1.0,2.0,5.0,1.577708,1.487874
Total_Sales,7394.0,115.832026,90.0,65.0,118.06796,9.0,40.0,90.0,137.0,1000.0,2.258077,6.602692
Rating,7394.0,3.499053,4.0,4.0,1.389731,1.0,3.0,4.0,5.0,5.0,-0.603098,-0.911546
