Caso Titanic

# Instalación de Librerías necesarias

In [None]:
# --- Configuración inicial ---
!pip install pandas scikit-learn matplotlib



# Creación y definición de Carpetas

In [34]:
import os

# Folder layout used by every later cell, one folder per pipeline stage.
data_raw_folder = '/content/data/raw'
data_processed_folder = '/content/data/processed'
data_model_folder = '/content/models'
data_scores_folder = '/content/data/scores'

# Create all folders up front; exist_ok=True makes re-running this cell harmless.
for _stage_folder in (
    data_raw_folder,
    data_processed_folder,
    data_model_folder,
    data_scores_folder,
):
    os.makedirs(_stage_folder, exist_ok=True)

# Show where the raw training file is expected to live.
train_url = os.path.join(data_raw_folder, "train.csv")
print(train_url)

/content/data/raw/train.csv


# Preparación de Datos

In [35]:
import pandas as pd

def read_file_csv(data_folder, filename, index_col=None):
    """Load a CSV file into a DataFrame, report its shape and preview it.

    Args:
        data_folder: Directory that contains the file.
        filename: Name of the CSV file inside ``data_folder``.
        index_col: Column to use as the row index. It is forwarded to
            ``pandas.read_csv``, so it may be a column label or a column
            position (including ``0``). ``None`` keeps the default
            integer index.

    Returns:
        The loaded DataFrame.

    Note:
        Calls ``display``, which IPython/Jupyter injects as a builtin; the
        function is meant to run inside a notebook kernel.
    """
    path = os.path.join(data_folder, filename)
    # Let read_csv build the index: a truthiness check on index_col would
    # silently skip valid-but-falsy selectors such as position 0.
    df = pd.read_csv(path, index_col=index_col)
    print(f"{filename} cargado. Dimensiones: {df.shape}")
    display(df.head())
    return df

def data_preparation(df):
    """Clean a raw Titanic passenger table and one-hot encode its categoricals.

    Steps, each followed by a preview of the first rows:
      1. Drop the ``Name``, ``Ticket`` and ``Cabin`` columns.
      2. Fill missing ``Age`` with the median of this frame.
      3. Fill missing ``Embarked`` with the most frequent value of this frame.
      4. One-hot encode ``Sex`` and ``Embarked``, dropping the first level.

    The encoding is pinned to the known levels (``female``/``male`` and
    ``C``/``Q``/``S``) so the output always contains ``Sex_male``,
    ``Embarked_Q`` and ``Embarked_S``, even when a file happens not to contain
    every level. Values outside the known levels still get their own column.

    Args:
        df: Raw passenger DataFrame. It is not modified; ``drop`` returns a
            new frame that the remaining steps work on.

    Returns:
        A new DataFrame with the cleaned and encoded columns.

    Note:
        Fill values are computed from the frame passed in, so the train and
        score files are each imputed with their own statistics.
        Calls ``display``, which IPython/Jupyter injects as a builtin.
    """
    known_levels = {
        'Sex': ['female', 'male'],
        'Embarked': ['C', 'Q', 'S'],
    }

    # Drop columns that are not used as features
    df = df.drop(['Name', 'Ticket', 'Cabin'], axis=1)
    display(df.head())

    # Fill missing values
    df['Age'] = df['Age'].fillna(df['Age'].median())
    display(df.head())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    display(df.head())

    # Fix the category levels before encoding so the resulting columns do not
    # depend on which values are present in this particular file.
    for column, levels in known_levels.items():
        observed = df[column].dropna().unique()
        unexpected = sorted(set(observed) - set(levels), key=str)
        df[column] = pd.Categorical(df[column], categories=levels + unexpected)

    # Convert categorical variables into indicator columns
    df = pd.get_dummies(df, columns=['Sex', 'Embarked'], drop_first=True)
    display(df.head())
    return df

def data_exporting(df, filename):
    """Write a DataFrame as CSV into the processed-data folder.

    The file is written with ``DataFrame.to_csv`` defaults, so the index is
    included as the first column.

    Args:
        df: DataFrame to export.
        filename: Name of the CSV file to create inside the module-level
            ``data_processed_folder``.
    """
    # Write the frame as-is. Re-selecting every column by label adds nothing
    # for unique labels and duplicates columns when labels repeat.
    df.to_csv(os.path.join(data_processed_folder, filename))
    print(filename, 'exportado correctamente en la carpeta processed')


# Training file: load the raw CSV, clean/encode it, persist the processed table.
df1 = read_file_csv(data_folder=data_raw_folder, filename="train.csv", index_col='PassengerId')
tdf1 = data_preparation(df=df1)
data_exporting(df=tdf1, filename='titanic_train.csv')

# Scoring file: same steps, exported under a different name.
df2 = read_file_csv(data_folder=data_raw_folder, filename="test.csv", index_col='PassengerId')
tdf2 = data_preparation(df=df2)
data_exporting(df=tdf2, filename='titanic_score.csv')



train.csv cargado. Dimensiones: (891, 11)


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,1,0,7.25,S
2,1,1,female,38.0,1,0,71.2833,C
3,1,3,female,26.0,0,0,7.925,S
4,1,1,female,35.0,1,0,53.1,S
5,0,3,male,35.0,0,0,8.05,S


Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,1,0,7.25,S
2,1,1,female,38.0,1,0,71.2833,C
3,1,3,female,26.0,0,0,7.925,S
4,1,1,female,35.0,1,0,53.1,S
5,0,3,male,35.0,0,0,8.05,S


Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,1,0,7.25,S
2,1,1,female,38.0,1,0,71.2833,C
3,1,3,female,26.0,0,0,7.925,S
4,1,1,female,35.0,1,0,53.1,S
5,0,3,male,35.0,0,0,8.05,S


Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,3,22.0,1,0,7.25,True,False,True
2,1,1,38.0,1,0,71.2833,False,False,False
3,1,3,26.0,0,0,7.925,False,False,True
4,1,1,35.0,1,0,53.1,False,False,True
5,0,3,35.0,0,0,8.05,True,False,True


titanic_train.csv exportado correctamente en la carpeta processed
test.csv cargado. Dimensiones: (418, 10)


Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
892,3,male,34.5,0,0,7.8292,Q
893,3,female,47.0,1,0,7.0,S
894,2,male,62.0,0,0,9.6875,Q
895,3,male,27.0,0,0,8.6625,S
896,3,female,22.0,1,1,12.2875,S


Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
892,3,male,34.5,0,0,7.8292,Q
893,3,female,47.0,1,0,7.0,S
894,2,male,62.0,0,0,9.6875,Q
895,3,male,27.0,0,0,8.6625,S
896,3,female,22.0,1,1,12.2875,S


Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
892,3,male,34.5,0,0,7.8292,Q
893,3,female,47.0,1,0,7.0,S
894,2,male,62.0,0,0,9.6875,Q
895,3,male,27.0,0,0,8.6625,S
896,3,female,22.0,1,1,12.2875,S


Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
892,3,34.5,0,0,7.8292,True,True,False
893,3,47.0,1,0,7.0,False,False,True
894,2,62.0,0,0,9.6875,True,True,False
895,3,27.0,0,0,8.6625,True,False,True
896,3,22.0,1,1,12.2875,False,False,True


titanic_score.csv exportado correctamente en la carpeta processed


# Entrenar modelo

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
#import pickle
from joblib import dump, load

# Load the processed training table; PassengerId becomes the index so it is
# not part of the feature matrix.
data = read_file_csv(data_processed_folder, "titanic_train.csv", index_col='PassengerId')

# Split the target from the predictors
y = data['Survived']
X = data.drop(columns='Survived')

# Hold out 20% of the rows; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# fit() returns the estimator itself, so the fitted model can be bound directly
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
print('Modelo entrenado')

# Persist the trained model with joblib
package = os.path.join(data_model_folder, 'best_model')
print(package)
dump(model, package)
print('Modelo exportado correctamente en la carpeta models')

titanic_train.csv cargado. Dimensiones: (891, 9)


Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,3,22.0,1,0,7.25,True,False,True
2,1,1,38.0,1,0,71.2833,False,False,False
3,1,3,26.0,0,0,7.925,False,False,True
4,1,1,35.0,1,0,53.1,False,False,True
5,0,3,35.0,0,0,8.05,True,False,True


Modelo entrenado
/content/models/best_model
Modelo exportado correctamente en la carpeta models


# Evaluar modelo

In [37]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# Reload the processed training table and rebuild the split with the same
# test_size and random_state that the training cell used.
data = read_file_csv(data_processed_folder, "titanic_train.csv", index_col='PassengerId')
y = data['Survived']
X = data.drop(columns='Survived')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load the persisted model from the models folder
package = os.path.join(data_model_folder, 'best_model')
model = load(package)
print('Modelo importado correctamente')

# Predict on the held-out rows
y_pred_test = model.predict(X_test)

# Diagnostic metrics: each one is computed and then reported
cm_test = confusion_matrix(y_test, y_pred_test)
print("Matriz de confusion: ")
print(cm_test)

accuracy_test = accuracy_score(y_test, y_pred_test)
print("Accuracy: ", accuracy_test)

precision_test = precision_score(y_test, y_pred_test)
print("Precision: ", precision_test)

recall_test = recall_score(y_test, y_pred_test)
print("Recall: ", recall_test)

print('Finalizó la validación del Modelo')


titanic_train.csv cargado. Dimensiones: (891, 9)


Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,3,22.0,1,0,7.25,True,False,True
2,1,1,38.0,1,0,71.2833,False,False,False
3,1,3,26.0,0,0,7.925,False,False,True
4,1,1,35.0,1,0,53.1,False,False,True
5,0,3,35.0,0,0,8.05,True,False,True


Modelo importado correctamente
Matriz de confusion: 
[[91 14]
 [18 56]]
Accuracy:  0.8212290502793296
Precision:  0.8
Recall:  0.7567567567567568
Finalizó la validación del Modelo


# Scoring del Modelo

In [38]:
# Load the processed scoring table; PassengerId is kept as the index so it is
# not passed to the model as a feature.
data = read_file_csv(data_processed_folder, "titanic_score.csv", index_col='PassengerId')

# Load the persisted model from the models folder
package = os.path.join(data_model_folder, 'best_model')
model = load(package)
print('Modelo importado correctamente')

# Predict on the scoring set. The predictions reuse the index of `data`, so
# every row of the exported file carries the PassengerId it belongs to
# instead of an anonymous 0..n-1 counter.
scores = 'final_score.csv'
res = model.predict(data).reshape(-1,1)
pred = pd.DataFrame(res, columns=['PREDICT'], index=data.index)
ruta_file_scoring = os.path.join(data_scores_folder, scores)
print(ruta_file_scoring)
pred.to_csv(ruta_file_scoring)
print(scores, 'exportado correctamente en la carpeta scores')



titanic_score.csv cargado. Dimensiones: (418, 8)


Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
892,3,34.5,0,0,7.8292,True,True,False
893,3,47.0,1,0,7.0,False,False,True
894,2,62.0,0,0,9.6875,True,True,False
895,3,27.0,0,0,8.6625,True,False,True
896,3,22.0,1,1,12.2875,False,False,True


Modelo importado correctamente
/content/data/scores/final_score.csv
final_score.csv exportado correctamente en la carpeta scores
