In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [2]:
df_test = pd.read_csv('test.csv')
df_train = pd.read_csv('train.csv')

In [3]:
df_test.sample(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
169,1061,3,"Hellstrom, Miss. Hilda Maria",female,22.0,0,0,7548,8.9625,,S
146,1038,1,"Hilliard, Mr. Herbert Henry",male,,0,0,17463,51.8625,E46,S
130,1022,3,"Spinner, Mr. Henry John",male,32.0,0,0,STON/OQ. 369943,8.05,,S
134,1026,3,"Dintcheff, Mr. Valtcho",male,43.0,0,0,349226,7.8958,,S
283,1175,3,"Touma, Miss. Maria Youssef",female,9.0,1,1,2650,15.2458,,C


In [4]:
df_train.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
547,548,1,2,"Padro y Manent, Mr. Julian",male,,0,0,SC/PARIS 2146,13.8625,,C
724,725,1,1,"Chambers, Mr. Norman Campbell",male,27.0,1,0,113806,53.1,E8,S
556,557,1,1,"Duff Gordon, Lady. (Lucille Christiana Sutherl...",female,48.0,1,0,11755,39.6,A16,C
873,874,0,3,"Vander Cruyssen, Mr. Victor",male,47.0,0,0,345765,9.0,,S
792,793,0,3,"Sage, Miss. Stella Anna",female,,8,2,CA. 2343,69.55,,S


**1) Interpretando el DataFrame**

In [5]:
print (f'Tamaño de la muestra de entrenamiento {df_train.shape}.')
print (f'Tamaño de la muestra de prueba {df_test.shape}.')

Tamaño de la muestra de entrenamiento (891, 12).
Tamaño de la muestra de prueba (418, 11).


In [6]:
print (df_train.info())
print (df_test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pcl

In [7]:
print("Datos faltantes")
print(pd.isnull(df_train).sum())
print(pd.isnull(df_test).sum())

Datos faltantes
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [8]:
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [9]:
df_test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


**2) Preprocesamiento de datos**

In [10]:
df_train["Sex"].unique()

array(['male', 'female'], dtype=object)

In [11]:
df_train["Embarked"].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [12]:
df_train["Sex"].replace(["female","male"],[0,1],inplace=True)
df_test["Sex"].replace(["female","male"],[0,1],inplace=True)
df_train["Embarked"].replace(["S","C","Q"],[0,1,2],inplace=True)
df_test["Embarked"].replace(["S","C","Q"],[0,1,2],inplace=True)

*Llenando las celdas N/A de la columna de Edad*

In [13]:
mean_age_train=round(df_train["Age"].mean())
mean_age_test=round(df_test["Age"].mean())

In [14]:
df_train["Age"]=df_train["Age"].replace(np.nan,mean_age_train)
df_test["Age"]=df_train["Age"].replace(np.nan,mean_age_test)

*Asignando bandas en la columna de edad
0-8, 9-15, 16-18, 19-25, 26-40, 41-60, 61-100*

In [15]:
bins=[0,8,15,18,25,40,60,100]
names=[1,2,3,4,5,6,7]
df_train["Age"]=pd.cut(df_train["Age"], bins, labels=names)
df_test["Age"]=pd.cut(df_test["Age"], bins, labels=names)

*Eliminamos las columnas no necesarias, y las filas con valores nulos*

In [16]:
df_train.drop(["Cabin"],axis=1, inplace=True)
df_test.drop(["Cabin"],axis=1, inplace=True)

In [17]:
df_train=df_train.drop(["PassengerId","Name","Ticket"], axis=1)
df_test=df_test.drop(["Name","Ticket"], axis=1)

In [18]:
df_train.dropna(axis=0,how="any",inplace=True)
df_test.dropna(axis=0,how="any",inplace=True)

*Revisión de los valores nulos y dataframes*

In [19]:
print(pd.isnull(df_train).sum())
print(pd.isnull(df_test).sum())
print(df_train.shape)
print(df_test.shape)

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64
PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64
(889, 8)
(417, 8)


In [20]:
df_train.sample(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
779,1,1,0,6,0,1,211.3375,0.0
386,0,3,1,1,5,2,46.9,0.0
769,0,3,1,5,0,0,8.3625,0.0
138,0,3,1,3,0,0,9.2167,0.0
735,0,3,1,5,0,0,16.1,0.0


In [21]:
df_test.sample(5)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
242,1134,1,1,5,1,1,134.5,1
170,1062,3,1,7,0,0,7.55,0
180,1072,2,1,5,0,0,13.0,0
269,1161,3,1,5,0,0,8.6625,0
142,1034,1,1,4,1,3,262.375,1


**3) Algoritmos de Machine Learning**

In [22]:
X=np.array(df_train.drop(["Survived"],1))
y=np.array(df_train["Survived"])

  X=np.array(df_train.drop(["Survived"],1))


In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

*Regresión Logistica*

In [24]:
logred=LogisticRegression()
logred.fit(X_train, y_train)
Y_pred=logred.predict(X_test)
print("Precisión de la Regresión Logistica")
print(logred.score(X_train, y_train))

Precisión de la Regresión Logistica
0.8073136427566807


*Maquinas de Soporte*

In [25]:
svc=SVC()
svc.fit(X_train, y_train)
Y_pred=svc.predict(X_test)
print("Precisión Vectores de Soporte")
print(svc.score(X_train, y_train))

Precisión Vectores de Soporte
0.6863572433192686


*K Neighbor*

In [26]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
Y_pred = knn.predict(X_test)
print("Precisión Vecino más cercano")
print(knn.score(X_train, y_train))

Precisión Vecino más cercano
0.8649789029535865


**3) Predicción de los modelos**

In [27]:
ids=df_test["PassengerId"]

In [28]:
prediccion_logred = logred.predict(df_test.drop('PassengerId', axis=1))
out_logred = pd.DataFrame({ 'PassengerId' : ids, 'Survived': prediccion_logred })
print('Predicción Regresión Logística:')
out_logred.head()

Predicción Regresión Logística:


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [29]:
prediccion_svc = svc.predict(df_test.drop('PassengerId', axis=1))
out_svc = pd.DataFrame({ 'PassengerId' : ids, 'Survived': prediccion_svc })
print('Predicción Soporte de Vectores:')
out_svc.head()

Predicción Soporte de Vectores:


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [30]:
prediccion_knn = knn.predict(df_test.drop('PassengerId', axis=1))
out_knn = pd.DataFrame({ 'PassengerId' : ids, 'Survived': prediccion_knn })
print('Predicción Vecinos más cercanos:')
out_knn.head()

Predicción Vecinos más cercanos:


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
