In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

def classification_pipeline(csv_file, target_column, test_size=0.2, random_state=42):

    ### Carga del dataset
    df = pd.read_csv(csv_file)

    ### Quitamos columnas del tipo ID o similares
    id_cols = [col for col in df.columns if 'id' in col.lower()]
    df = df.drop(columns=id_cols, errors='ignore')

    ### Tratamiento de datos nulos o faltantes
    ### Para numericas usmos la mediana
    num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    for col in num_cols:
        df[col].fillna(df[col].median(), inplace=True)

    ### Para categoricas utilizamos la moda
    cat_cols = df.select_dtypes(include=['object']).columns.tolist()
    for col in cat_cols:
        df[col].fillna(df[col].mode()[0], inplace=True)

    ### Convertir categoricas a numericas con Label Encoding
    label_encoders = {}
    for col in cat_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le  ### Gardamos os encoders por si hace falta decodificar

    ### Separar caracteristicass (X) y objetivo (y)
    X = df.drop(columns=[target_column])  
    y = df[target_column] 

    ### Dividir el dataset en train y test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    ### Entrenar un modelo de Random Forest
    rf_model = RandomForestClassifier(n_estimators=100, random_state=random_state)
    rf_model.fit(X_train, y_train)

   ### Predicciones y evaluacion de modelo
    y_pred = rf_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)

    print(f' Precisión del modelo: {accuracy:.4f}')
    print('Reporte de Clasificación:')
    print(classification_rep)

    return rf_model, accuracy, classification_rep



In [9]:
model, acc, report = classification_pipeline('Train.csv', target_column='Segmentation')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

 Precisión del modelo: 0.4851
Reporte de Clasificación:
              precision    recall  f1-score   support

           0       0.38      0.37      0.38       391
           1       0.37      0.34      0.36       369
           2       0.51      0.52      0.51       380
           3       0.64      0.66      0.65       474

    accuracy                           0.49      1614
   macro avg       0.47      0.47      0.47      1614
weighted avg       0.48      0.49      0.48      1614

