### Importación de Bibliotecas

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

### Lectura del Archivo

In [2]:
df = pd.read_csv('./titanic.csv')

### Visualización de los Datos

In [3]:
print(df)

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ... 

In [4]:
# Columnas del DataSet
print(df.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [5]:
# Dimensiones del DataSet
print(df.shape)

(891, 12)


### Extracción de información con iloc y loc

In [6]:
# El primer parámetro hace referencia a la fila y el segundo a la columna, pueden ingresarse números, rangos o listas
# Sexta fila
df.iloc[5]

PassengerId                   6
Survived                      0
Pclass                        3
Name           Moran, Mr. James
Sex                        male
Age                         NaN
SibSp                         0
Parch                         0
Ticket                   330877
Fare                     8.4583
Cabin                       NaN
Embarked                      Q
Name: 5, dtype: object

In [7]:
# Fila del 0-2, sin incluir al 2, es decir fila 0 y 1
df.iloc[0:2]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [8]:
# Fila 0, columna 0, es decir PassengerId del primer registro
df.iloc[0,0]

1

In [9]:
# Lista de filas, 0, 10, 12 y un rango de columas 3-6, es decir 3 a 5
df.iloc[[0, 10, 12], 3:6]

Unnamed: 0,Name,Sex,Age
0,"Braund, Mr. Owen Harris",male,22.0
10,"Sandstrom, Miss. Marguerite Rut",female,4.0
12,"Saundercock, Mr. William Henry",male,20.0


In [10]:
# Loc funciona de forma similar a iloc pero en loc se hace uso de los nombres de columnas
# Fila 0, primera fila
df.loc[0]

PassengerId                          1
Survived                             0
Pclass                               3
Name           Braund, Mr. Owen Harris
Sex                               male
Age                               22.0
SibSp                                1
Parch                                0
Ticket                       A/5 21171
Fare                              7.25
Cabin                              NaN
Embarked                             S
Name: 0, dtype: object

In [11]:
# Fila 0, columna del precio
df.loc[0, 'Fare']

7.25

In [12]:
# Fila de 0-3, aquí se incluyen ambos extremos
df.loc[:3]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


In [13]:
# Fila de 0-3, Sex-Fare
df.loc[:3, 'Sex':'Fare']

Unnamed: 0,Sex,Age,SibSp,Parch,Ticket,Fare
0,male,22.0,1,0,A/5 21171,7.25
1,female,38.0,1,0,PC 17599,71.2833
2,female,26.0,0,0,STON/O2. 3101282,7.925
3,female,35.0,1,0,113803,53.1


In [14]:
# Fila de 0-3, Columnas Sex, Fare, Embarked
df.loc[:3, ['Sex', 'Fare', 'Embarked']]

Unnamed: 0,Sex,Fare,Embarked
0,male,7.25,S
1,female,71.2833,C
2,female,7.925,S
3,female,53.1,S


In [15]:
# Tipos de datos
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [16]:
# Obtención de descripción del DataSet
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [17]:
# Podemos hacer lo mismo para los atributo catégoricos
df.describe(include = 'all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Braund, Mr. Owen Harris",male,,,,347082.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


### Transformación de Datos

In [18]:
# Eliminamos aquellas columnas no necesarias, Id, nombre, ticket, cabina
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

In [19]:
# Eliminamos aquellas fillas con valor vacío
df = df.dropna()

In [20]:
# Transformamos aquellos datos que pueden ser catégoricos
df['Sex'] = df['Sex'].astype('category').cat.codes
df['Embarked'] = df['Embarked'].astype('category').cat.codes

In [21]:
df.dtypes

Survived      int64
Pclass        int64
Sex            int8
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked       int8
dtype: object

In [22]:
df.loc[:,['Sex', 'Embarked']]

Unnamed: 0,Sex,Embarked
0,1,2
1,0,0
2,0,2
3,0,2
4,1,2
...,...,...
885,0,1
886,1,2
887,0,2
889,1,0


In [23]:
# Salvamos el archivo a CSV
path = './titanic_ml.csv'
df.to_csv(path, index = False)

# Preprocesado de Datos 

In [24]:
def split_label(df, test_size, label):
    train, test = train_test_split(df, test_size = test_size)
    features = df.columns.drop(label)
    train_X = train[features]
    train_y = train[label]
    test_X = test[features]
    test_y = test[label]
    return train_X, train_y, test_X, test_y

In [25]:
train_X, train_y, test_X, test_y = split_label(df, 0.2, 'Survived')

In [26]:
ohe = OneHotEncoder(sparse = False)
transformed = ohe.fit_transform(train_X[['Embarked']])

In [27]:
train_X_1 = train_X.drop(columns = ['Embarked'])
train_X_1[ohe.categories_[0]] = transformed

In [28]:
min_max_scaler = MinMaxScaler()
train_X_2 = min_max_scaler.fit_transform(train_X_1)

In [29]:
# Realizamos el mismo procedimiento para test_X
ohe = OneHotEncoder(sparse = False)
transformed2 = ohe.fit_transform(test_X[['Embarked']])

In [30]:
test_X_1 = test_X.drop(columns = ['Embarked'])
test_X_1[ohe.categories_[0]] = transformed2

In [31]:
min_max_scaler = MinMaxScaler()
test_X_2 = min_max_scaler.fit_transform(test_X_1)

# Clasificación 

In [32]:
from sklearn.svm import SVC
clf = SVC()
clf.fit(train_X_2, train_y)

SVC()

In [33]:
clf.predict(test_X_2)

array([0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1], dtype=int64)

In [34]:
clf.score(test_X_2, test_y)

0.7832167832167832

In [35]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()
clf.fit(train_X_2, train_y)
clf.score(test_X_2, test_y)

0.7622377622377622

# Regresión
En este caso se usara el atributo Fare como atributo clasificador

In [36]:
train_X, train_y, test_X, test_y = split_label(df, 0.2, 'Fare')

In [37]:
ohe = OneHotEncoder(sparse = False)
Embarked1 = ohe.fit_transform(train_X[['Embarked']])
train_X_1 = train_X.drop(columns = ['Embarked'])
train_X_1[ohe.categories_[0]] = Embarked1
min_max_scaler = MinMaxScaler()
train_X_2 = min_max_scaler.fit_transform(train_X_1)

In [38]:
ohe = OneHotEncoder(sparse = False)
Embarked2 = ohe.fit_transform(test_X[['Embarked']])
test_X_1 = test_X.drop(columns = ['Embarked'])
test_X_1[ohe.categories_[0]] = Embarked2
min_max_scaler = MinMaxScaler()
test_X_2 = min_max_scaler.fit_transform(test_X_1)

In [51]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(train_X_2, train_y)
reg.score(test_X_2, test_y)

0.4926507049865456

In [52]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
pred = reg.predict(test_X_2)
mean_squared_error(test_y, pred)

964.4965305759489

In [54]:
mean_squared_error(test_y, pred) ** 0.5

31.056344449660344

In [53]:
mean_absolute_error(test_y, pred)

19.640354906413336

In [60]:
from sklearn.neighbors import KNeighborsRegressor
new_reg = KNeighborsRegressor()
new_reg.fit(train_X_2, train_y)
new_reg.score(test_X_2, test_y)

0.5516987094908583

# Análisis de grupos 

In [62]:
ohe = OneHotEncoder(sparse = False)
embarked = ohe.fit_transform(df[['Embarked']])
df_1 = df.drop(columns = ['Embarked'])
df_1[ohe.categories_[0]] = embarked
min_max_scaler = MinMaxScaler()
df_2 = min_max_scaler.fit_transform(df_1)

In [63]:
from sklearn.cluster import KMeans
clu = KMeans(n_clusters = 3)
clu.fit(df_2)
clu.cluster_centers_

array([[ 6.07692308e-01,  3.73076923e-01,  5.30769231e-01,
         3.81939799e-01,  8.46153846e-02,  6.92307692e-02,
         1.33306411e-01,  1.00000000e+00,  6.93889390e-18,
         6.66133815e-16],
       [ 7.71144279e-02,  7.94776119e-01,  8.60696517e-01,
         3.68528681e-01,  1.07462687e-01,  5.97014925e-02,
         3.77720587e-02, -2.22044605e-16,  5.22388060e-02,
         9.47761194e-01],
       [ 9.88888889e-01,  4.08333333e-01,  2.11111111e-01,
         3.53602943e-01,  1.05555556e-01,  1.01851852e-01,
         8.62499736e-02, -1.66533454e-16,  3.88888889e-02,
         9.61111111e-01]])

In [64]:
clu.labels_

array([1, 0, 2, 2, 1, 1, 1, 2, 0, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 1,
       2, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 2, 1, 1, 1, 0, 2, 0, 2, 0, 2,
       1, 0, 1, 1, 2, 1, 2, 1, 1, 1, 1, 0, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1,
       2, 1, 1, 1, 1, 1, 1, 0, 0, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 0, 2, 1, 0, 1, 1, 0, 1, 1, 2, 1, 0, 2, 1, 1,
       0, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 0, 2, 1, 1, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 2, 1, 0, 1, 0, 1, 1, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 2, 0, 1, 2, 1, 1, 0, 2, 1, 0, 1,
       1, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1,
       2, 2, 1, 1, 1, 1, 1, 0, 2, 0, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 0,
       2, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 2, 2, 2, 0, 0, 1, 1, 0, 2, 0, 1,
       2, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 1, 2, 0, 1, 1, 2, 2, 0, 1, 2, 2,
       0, 1, 1, 1, 1, 0, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 0, 1, 1, 2,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 2, 0,

In [66]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score

In [67]:
silhouette_score(df_2, clu.labels_)

0.39752927745639455

In [68]:
calinski_harabasz_score(df_2, clu.labels_)

360.0204240573852