In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [5]:
# Load the passenger table from the CSV file that sits next to this notebook.
titanic = pd.read_csv('./titanic.csv')

In [6]:
# Preview the first rows of the freshly loaded frame.
titanic.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [8]:
# (rows, columns) of the raw frame.
titanic.shape

(418, 11)

## 1. 결측치 대체 및 제거

In [7]:
# Count missing values per column to decide which columns to impute or drop.
titanic.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [10]:
# Cabin is missing for 327 of the 418 rows, so drop the column rather than impute it.
titanic_1 = titanic.drop(columns=['Cabin'])
# Fill the remaining gaps (Age, Fare) with each column's mean.
# numeric_only=True is required: the frame still contains text columns
# (Name, Sex, Ticket, Embarked) and DataFrame.mean() raises TypeError on them
# in pandas >= 2.0. fillna() only touches the columns present in the means.
titanic_1 = titanic_1.fillna(titanic_1.mean(numeric_only=True))

In [11]:
# Re-count missing values per column to check the result of the imputation step.
titanic_1.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

## 2. 지표변수

In [20]:
# Inspect the Age column that the age-group indicator below is derived from.
titanic_1['Age']

0      34.50000
1      47.00000
2      62.00000
3      27.00000
4      22.00000
         ...   
413    30.27259
414    39.00000
415    38.50000
416    30.27259
417    30.27259
Name: Age, Length: 418, dtype: float64

In [22]:
# Derive a three-level age-group indicator from Age.
# The thresholds are contiguous so every age lands in the intended group:
#   age <= 30       -> 'young'
#   30 < age <= 50  -> 'middle'
#   otherwise       -> 'old'
# Integer-only ranges (10-30, 31-50) would leave gaps: fractional ages such as
# the mean-imputed 30.27 and all ages below 10 would fall through to 'old'.
age = titanic_1['Age']
# np.select takes the first matching condition, so the second test only sees
# ages above 30. A missing age matches neither test and gets the default.
oldness = np.select(
    [age <= 30, age <= 50],
    ['young', 'middle'],
    default='old',
)

titanic_1['oldness'] = oldness

titanic_1.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,oldness
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q,middle
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S,middle
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q,old
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S,young
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S,young


## 3. Feature Split

In [24]:
# Inspect the raw Name strings before splitting them into separate columns.
titanic['Name']

0                                  Kelly, Mr. James
1                  Wilkes, Mrs. James (Ellen Needs)
2                         Myles, Mr. Thomas Francis
3                                  Wirz, Mr. Albert
4      Hirvonen, Mrs. Alexander (Helga E Lindqvist)
                           ...                     
413                              Spector, Mr. Woolf
414                    Oliva y Ocana, Dona. Fermina
415                    Saether, Mr. Simon Sivertsen
416                             Ware, Mr. Frederick
417                        Peter, Master. Michael J
Name: Name, Length: 418, dtype: object

In [25]:
# Split each Name of the form "<before comma>, <title>. <rest>" into two columns.
# The pattern is anchored on the FIRST comma and the FIRST period after it, so
# any later '.' or ',' inside the remainder (initials, nicknames in parentheses)
# is kept intact instead of truncating the value. A name that does not match
# the pattern yields NaN in both columns rather than raising IndexError.
name_parts = titanic['Name'].str.extract(r'^\s*([^,]+),\s*[^.]+\.\s*(.*)$')

# NOTE(review): 'First name' receives the text before the comma and 'Last name'
# the text after the title; in this dataset the text before the comma looks like
# the family name, so the labels may be swapped. They are kept as-is to stay
# compatible with the existing column names - confirm before renaming.
titanic['First name'] = name_parts[0]
titanic['Last name'] = name_parts[1]
titanic.head()


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,First name,Last name
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,Kelly,James
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,Wilkes,James (Ellen Needs)
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,Myles,Thomas Francis
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,Wirz,Albert
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,Hirvonen,Alexander (Helga E Lindqvist)


## 4. Scaling

In [30]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# One unfitted scaler instance per scaling technique used in this section.
mm_scaler, sd_scaler = MinMaxScaler(), StandardScaler()

In [31]:
# Min-max scaling of Age (the scaler is built with default settings).
# The result is stored under its own name: rebinding `mm_scaler` to the output
# array would throw away the fitted scaler and make this cell fail on re-run,
# because an ndarray has no fit_transform().
# The scaler expects a 2-D (n_samples, n_features) input, hence reshape(-1, 1).
age_min_max = mm_scaler.fit_transform(titanic_1['Age'].to_numpy().reshape(-1, 1))
# Flatten the (n, 1) result back to one value per row for the new column.
titanic_1['Min max scaled age'] = age_min_max.ravel()

In [33]:
# Standard scaling of Age (the scaler is built with default settings).
# The result is stored under its own name: rebinding `sd_scaler` to the output
# array would throw away the fitted scaler and make this cell fail on re-run,
# because an ndarray has no fit_transform().
# The scaler expects a 2-D (n_samples, n_features) input, hence reshape(-1, 1).
age_standard = sd_scaler.fit_transform(titanic_1['Age'].to_numpy().reshape(-1, 1))
# Flatten the (n, 1) result back to one value per row for the new column.
titanic_1['Standard scaled age'] = age_standard.ravel()


In [38]:
# Preview the frame to compare Age with its two scaled versions.
titanic_1.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,oldness,Min max scaled age,Standard scaled age
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q,middle,0.452723,0.334993
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S,middle,0.617566,1.32553
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q,old,0.815377,2.514175
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S,young,0.353818,-0.25933
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S,young,0.287881,-0.655545


## 5. One-hot encoding

In [41]:
# Encode Sex as a single 0/1 indicator column: 'male' -> 1, anything else -> 0.
# The column is overwritten in place, so guard on its dtype: once it is numeric
# the encoding has already been applied, and running the comparison again would
# turn every row into 0 (no value equals 'male' any more).
if not pd.api.types.is_numeric_dtype(titanic['Sex']):
    titanic['Sex'] = (titanic['Sex'] == 'male').astype(int)
titanic.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,First name,Last name
0,892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,,Q,Kelly,James
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0,,S,Wilkes,James (Ellen Needs)
2,894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,,Q,Myles,Thomas Francis
3,895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,,S,Wirz,Albert
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,,S,Hirvonen,Alexander (Helga E Lindqvist)
