In [1]:
import numpy as np
import pandas as pd

In [2]:
#leer el csv de la web y guardarlo en local:
#una vez guardado, esta casilla se puede comentar:
import requests
import os

# URL del archivo
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"

# Nombre de la carpeta y del archivo
carpeta = "data"
nombre_archivo = "titanic.csv"

# Crear la carpeta si no existe
os.makedirs(carpeta, exist_ok=True)

# Ruta completa del archivo
ruta_archivo = os.path.join(carpeta, nombre_archivo)

# Descargar el contenido
respuesta = requests.get(url)

# Verificar que la solicitud fue exitosa y guardar el archivo
if respuesta.status_code == 200:
    with open(ruta_archivo, "wb") as archivo:
        archivo.write(respuesta.content)
    print(f"Archivo guardado en {ruta_archivo}")
else:
    print("Error al descargar el archivo")

Archivo guardado en data/titanic.csv


In [3]:
data = pd.read_csv(ruta_archivo) #aquí dejo el original de lectura de datos

data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [4]:
class RegresionLogisticaBootcamp:
    def __init__(self, learning_rate = 0.01, max_iter =1000):
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.pesos = None           # en inglés weights
        self.sesgo = None           # en inglés, bias
    
    def _sigmoide (self, z):
        return 1/ (1 + np.exp(-z))
    
    def _binary_cross_entropy (self, y_true, y_pred):
        
        epsilon = 1e-15   #número fijo, que viene de log(0)
        y_pred = np.clip(y_pred, epsilon, 1-epsilon)    # es una fórmula fija, clip nos viene muy bien para llevar
                                # los números del umbral para arriba al 1, y los del umbral para abajo al 0
        return -np.mean (y_true * np.log (y_pred) + (1 - y_true) * np.log (1 - y_pred))
    
    def fit (self, df, target):
        X = df.drop (columns=target). values
        y = df[target].values
        
        # en este caso estoy buscando iteraciones hasta encontrar donde la perdida es la menor
        
        n_filas, n_columnas = X.shape
        self.pesos = np.zeros (n_columnas)
        self.sesgo = 0
        
        for i in range (self.max_iter):
            
            modelo_lineal = np.dot (X, self.pesos) + self.sesgo     #construyo la función para tener z
            
            y_pred = self._sigmoide (modelo_lineal)         #le paso al metodo _sigmoide el valor z que he construido
            
            dw = (1 / n_filas)* np.dot (X.T, (y_pred - y))       #aquí defino los pesos
            db = (1 / n_filas) + np.sum (y_pred -y)     #aquí defino el sesgo, bias

            self.pesos -= self.learning_rate * dw
            self.sesgo -= self.learning_rate * db
            
            if i % 100 == 0:
                loss = self._binary_cross_entropy (y, y_pred)
                print (f'Iteración (i) : Loss (loss)')
                
    def predict_proba (self, df):

        if self.pesos is None or self.sesgo is None:
            raise Exception ("El modelo no ha sido entrenado.")
    
        modelo_lineal = np.dot (X, self.pesos) + self.sesgo  
        
        return self._sigmoide (modelo_lineal)
                
    def predict (self, df, umbral = 0.5):
    
        if self.pesos is None or self.sesgo is None:
            raise Exception ("El modelo no ha sido entrenado.")
        
        probabilidad = self.predict_proba (df)
       
        return (probabilidad >= umbral).astype(int)
        
    
    

podría demostrar cual es la diferencia de mi clase contra LinearModel

In [5]:
from sklearn.linear_model import LogisticRegression

__________
EDA......


In [6]:
df = data.copy()
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [7]:
df = df[['Pclass','Name', 'Sex', 'Age','SibSp', 'Parch', 'Fare', 'Embarked', 'Survived']]

In [8]:
df

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500,S,0
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,S,1
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,S,1
4,3,"Allen, Mr. William Henry",male,35.0,0,0,8.0500,S,0
...,...,...,...,...,...,...,...,...,...
886,2,"Montvila, Rev. Juozas",male,27.0,0,0,13.0000,S,0
887,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,30.0000,S,1
888,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,23.4500,S,0
889,1,"Behr, Mr. Karl Howell",male,26.0,0,0,30.0000,C,1


Imputaciones

In [9]:
df['Age'].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

dada que la edad minima es 0.42, voy a imputar a los Nan el valor de la mediana.

In [10]:
df ['Embarked'] = df ['Embarked'].fillna(df['Embarked'].mode())
df ['Age'] = df ['Age'].fillna(df['Age'].median())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df ['Embarked'] = df ['Embarked'].fillna(df['Embarked'].mode())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df ['Age'] = df ['Age'].fillna(df['Age'].median())


Feature Engineering

In [11]:
# df['Title']=  df ['Name'].apply(lambda x: x.split (', ')[1])
# df['Title']=  df ['Title'].apply(lambda x: x.split ('.')[0])

#esto funciona perfectamente pero voy a intentar hacerlo en una sola linea de código


In [12]:
df.isna().sum()

Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    2
Survived    0
dtype: int64

In [13]:
#para hacer lo anterior de una sola vez y con RegEx
# import re
# df['Title'] = df ['Name'].apply(lambda x: re.search (', (\w+)\.', x).group(0))

#no nos funciona, así que nos rendimos con el RegEx

In [14]:
df['Title'] =  df['Name'].str.extract(', (\w+)\.').fillna ('Sin titulo')
df.drop(columns = 'Name', inplace =True)

  df['Title'] =  df['Name'].str.extract(', (\w+)\.').fillna ('Sin titulo')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Title'] =  df['Name'].str.extract(', (\w+)\.').fillna ('Sin titulo')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns = 'Name', inplace =True)


In [15]:
pd.__version__

'2.2.3'

In [16]:
df.to_parquet('data/df_limpio.parquet', index=False)

________________
desde aquí en adelante cada vez que rompa el df

In [17]:
df = pd.read_parquet ('df_limpio.parquet')

procesado para Machine Learning (ML)

In [18]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

In [19]:
lr

No vale para ML porque hay valores no numeríco 

In [20]:
df.select_dtypes (include='object').head()

Unnamed: 0,Sex,Embarked,Title
0,male,S,Mr
1,female,C,Mrs
2,female,S,Miss
3,female,S,Mrs
4,male,S,Mr


voy a cambiarlos a numéricos, de columna en columna

In [21]:
# empiezo con Sexo, una forma es con un if male= 0, else =1
# otra más interesante es replace, pasándole un diccionario
#df['Sex'].replace (({'male':0, 'female':1}))
#puedo hacerlo con map
#df['Sex'].map (({'male':0, 'female':1}))
#puedo hacerlo con Label encoder o categorical encoder, pero sería matar moscas a cañonazos
pd.get_dummies(df['Sex'],dtype = 'int')

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
...,...,...
886,0,1
887,1,0
888,1,0
889,0,1


In [22]:
# a = pd.get_dummies(df['Sex'],dtype = 'int')

# df2 = df.copy ()
# pd.concat ([df2,a], axis=1)

In [23]:
df['Sex'] = df['Sex'].map ({'male':0, 'female':1})

# df = pd.concat ([df,df_sex], axis=1)
# df.drop (columns ='Sex', inplace =True)

df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived,Title
0,3,0,22.0,1,0,7.2500,S,0,Mr
1,1,1,38.0,1,0,71.2833,C,1,Mrs
2,3,1,26.0,0,0,7.9250,S,1,Miss
3,1,1,35.0,1,0,53.1000,S,1,Mrs
4,3,0,35.0,0,0,8.0500,S,0,Mr
...,...,...,...,...,...,...,...,...,...
886,2,0,27.0,0,0,13.0000,S,0,Rev
887,1,1,19.0,0,0,30.0000,S,1,Miss
888,3,1,28.0,1,2,23.4500,S,0,Miss
889,1,0,26.0,0,0,30.0000,C,1,Mr


In [24]:
df_sex

NameError: name 'df_sex' is not defined

In [72]:
# from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder

# pre = LabelEncoder()

# a = pre.fit_transform(df['Sex'])

a no nos gusta pues no sabemos que es qué, y a futuro podría mezclar por el orden alfabético. Es mejor separar dos casillas

In [73]:
# from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder

# pre = LabelEncoder()

# pre.fit(df['Sex']) #primeramente lo entreno, le digo qué es 0 y 1.

In [74]:
# a = pre.transform(df['Sex'])    #y ahora transformo, de forma que si llegan nuevos datos, solo ejecuto esta parte de código
# a

sigo transformando las demás.

In [27]:
df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived,Title
0,3,0,22.0,1,0,7.2500,S,0,Mr
1,1,1,38.0,1,0,71.2833,C,1,Mrs
2,3,1,26.0,0,0,7.9250,S,1,Miss
3,1,1,35.0,1,0,53.1000,S,1,Mrs
4,3,0,35.0,0,0,8.0500,S,0,Mr
...,...,...,...,...,...,...,...,...,...
886,2,0,27.0,0,0,13.0000,S,0,Rev
887,1,1,19.0,0,0,30.0000,S,1,Miss
888,3,1,28.0,1,2,23.4500,S,0,Miss
889,1,0,26.0,0,0,30.0000,C,1,Mr


In [28]:
df.select_dtypes (include='object').head(2)

Unnamed: 0,Embarked,Title
0,S,Mr
1,C,Mrs


In [29]:
df_emb = pd.get_dummies(df['Embarked'],dtype = 'int')


df = pd.concat ([df,df_emb], axis=1)
df.drop (columns ='Embarked', inplace =True)
df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Survived,Title,C,Q,S
0,3,0,22.0,1,0,7.2500,0,Mr,0,0,1
1,1,1,38.0,1,0,71.2833,1,Mrs,1,0,0
2,3,1,26.0,0,0,7.9250,1,Miss,0,0,1
3,1,1,35.0,1,0,53.1000,1,Mrs,0,0,1
4,3,0,35.0,0,0,8.0500,0,Mr,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
886,2,0,27.0,0,0,13.0000,0,Rev,0,0,1
887,1,1,19.0,0,0,30.0000,1,Miss,0,0,1
888,3,1,28.0,1,2,23.4500,0,Miss,0,0,1
889,1,0,26.0,0,0,30.0000,1,Mr,1,0,0


In [30]:
df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Survived,Title,C,Q,S
0,3,0,22.0,1,0,7.2500,0,Mr,0,0,1
1,1,1,38.0,1,0,71.2833,1,Mrs,1,0,0
2,3,1,26.0,0,0,7.9250,1,Miss,0,0,1
3,1,1,35.0,1,0,53.1000,1,Mrs,0,0,1
4,3,0,35.0,0,0,8.0500,0,Mr,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
886,2,0,27.0,0,0,13.0000,0,Rev,0,0,1
887,1,1,19.0,0,0,30.0000,1,Miss,0,0,1
888,3,1,28.0,1,2,23.4500,0,Miss,0,0,1
889,1,0,26.0,0,0,30.0000,1,Mr,1,0,0


In [31]:
from sklearn.preprocessing import OneHotEncoder

pre= OneHotEncoder (handle_unknown='ignore', sparse_output = False)

pre.fit (df[['Title']])

df_Title =pre.transform (df [['Title']])

# df_Title =pd.DataFrame(df_Title, columns =list (pre.categories_))
# df_Title =pd.DataFrame(df_Title, columns = pre.categories_[0].ravel())  #esta manera con el ravel que no entiendo bien
df_Title =pd.DataFrame(df_Title, columns = pre.get_feature_names_out(['Title']))

df = pd.concat ([df, df_Title], axis =1)

df.drop(columns ='Title', inplace = True)


In [32]:
df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Survived,C,Q,S,...,Title_Master,Title_Miss,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sin titulo,Title_Sir
0,3,0,22.0,1,0,7.2500,0,0,0,1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1,1,38.0,1,0,71.2833,1,1,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,3,1,26.0,0,0,7.9250,1,0,0,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,1,35.0,1,0,53.1000,1,0,0,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,3,0,35.0,0,0,8.0500,0,0,0,1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,0,27.0,0,0,13.0000,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
887,1,1,19.0,0,0,30.0000,1,0,0,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
888,3,1,28.0,1,2,23.4500,0,0,0,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
889,1,0,26.0,0,0,30.0000,1,1,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [33]:
df.shape

(891, 27)

In [34]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter = 2000)

lr.fit (X=df.drop(columns = 'Survived'), y =df['Survived'])

In [38]:
from sklearn.metrics import classification_report, confusion_matrix
pred = lr.predict (df.drop (columns= 'Survived'))

print (classification_report(y_true =df['Survived'], y_pred=pred, target_names= ['murió', 'sobrevivió']))

              precision    recall  f1-score   support

       murió       0.86      0.88      0.87       549
  sobrevivió       0.79      0.76      0.78       342

    accuracy                           0.83       891
   macro avg       0.82      0.82      0.82       891
weighted avg       0.83      0.83      0.83       891



In [39]:
print (confusion_matrix (y_true = y_test, y_pred=pred))

NameError: name 'y_test' is not defined

8vuelvo a hacerlo entrenando bien

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split (df.drop (columns ='Survive'), df ['Survived'], df['Survived'], test_size=0.2, random_state =42)

In [None]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter = 2000)

lr.fit (X=X_train, y = y_train)