# 鐵達尼號生存預測-1(資料預處理)

In [1]:
import pandas as pd
import numpy as np

# Load the raw Titanic passenger records; every later cell works on this frame.
df = pd.read_csv("titanic_data.csv")
# Report (rows, columns) as a quick check that the file loaded as expected.
print((len(df.index), len(df.columns)))

(1309, 11)


In [2]:
# Show the first five rows to get a feel for the columns and their values.
print(df.iloc[:5])

   pclass  survived                                             name     sex  \
0       1         1                    Allen  Miss. Elisabeth Walton  female   
1       1         1                   Allison  Master. Hudson Trevor    male   
2       1         0                     Allison  Miss. Helen Loraine  female   
3       1         0             Allison  Mr. Hudson Joshua Creighton    male   
4       1         0  Allison  Mrs. Hudson J C (Bessie Waldo Daniels)  female   

       age  sibsp  parch  ticket      fare    cabin embarked  
0  29.0000      0      0   24160  211.3375       B5        S  
1   0.9167      1      2  113781  151.5500  C22 C26        S  
2   2.0000      1      2  113781  151.5500  C22 C26        S  
3  30.0000      1      2  113781  151.5500  C22 C26        S  
4  25.0000      1      2  113781  151.5500  C22 C26        S  


## 找出遺漏值

In [3]:
# DataFrame.info() writes its summary (dtypes and non-null counts per column)
# straight to stdout and returns None, so it is called bare: wrapping it in
# print() only appended a stray "None" line after the summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1309 non-null   int64  
 1   survived  1309 non-null   int64  
 2   name      1309 non-null   object 
 3   sex       1309 non-null   object 
 4   age       1046 non-null   float64
 5   sibsp     1309 non-null   int64  
 6   parch     1309 non-null   int64  
 7   ticket    1309 non-null   object 
 8   fare      1308 non-null   float64
 9   cabin     295 non-null    object 
 10  embarked  1307 non-null   object 
dtypes: float64(2), int64(4), object(5)
memory usage: 112.6+ KB
None


In [4]:
# Count the missing entries in each column (isna is the same check as isnull).
missing_per_column = df.isna().sum()
print(missing_per_column)

pclass         0
survived       0
name           0
sex            0
age          263
sibsp          0
parch          0
ticket         0
fare           1
cabin       1014
embarked       2
dtype: int64


## 資料預處理

### 刪除不需要的欄位

In [5]:
# Remove the name, ticket and cabin columns; none of the later steps use them.
df = df.drop(columns=["name", "ticket", "cabin"])

### 處理遺失資料

In [6]:
# Fill the gaps in each numeric column with that column's own mean
# (the mean is computed over the non-missing values only).
for numeric_col in ("age", "fare"):
    df[numeric_col] = df[numeric_col].fillna(df[numeric_col].mean())

In [5]:
# Replace missing embarkation ports with the most frequent port.
most_common_port = df["embarked"].value_counts().idxmax()
df["embarked"] = df["embarked"].fillna(most_common_port)

# Show the per-port counts after filling, followed by the most frequent port.
port_counts = df["embarked"].value_counts()
print(port_counts)
print(port_counts.idxmax())

S    916
C    270
Q    123
Name: embarked, dtype: int64
S


### 轉換分類資料

In [6]:
# Encode sex as an integer flag: female -> 1, male -> 0.
sex_codes = {"female": 1, "male": 0}
df["sex"] = df["sex"].map(sex_codes).astype(int)

### 對 embarked 欄位進行 one-hot encoding

In [7]:
# One-hot encode the embarkation port into embarked_<value> indicator columns.
# dtype=int pins the indicators to 0/1 integers: the default dtype of
# get_dummies differs between pandas versions (bool on pandas >= 2.0), which
# would otherwise put True/False into the frame and the exported CSV files.
embarked_one_hot = pd.get_dummies(df["embarked"], prefix="embarked", dtype=int)

# Swap the original text column for its indicator columns (aligned on the index).
df = df.drop(columns=["embarked"]).join(embarked_one_hot)

### 將標籤資料的survived欄位移至最後

In [8]:
# Detach the label column and re-attach it as the last column, so the
# features come first and the target sits at the end of the frame.
df_survived = df.pop("survived")
df.insert(len(df.columns), "survived", df_survived)
print(df.iloc[:5])

   pclass                                             name  sex      age  \
0       1                    Allen  Miss. Elisabeth Walton    1  29.0000   
1       1                   Allison  Master. Hudson Trevor    0   0.9167   
2       1                     Allison  Miss. Helen Loraine    1   2.0000   
3       1             Allison  Mr. Hudson Joshua Creighton    0  30.0000   
4       1  Allison  Mrs. Hudson J C (Bessie Waldo Daniels)    1  25.0000   

   sibsp  parch  ticket      fare    cabin  embarked_C  embarked_Q  \
0      0      0   24160  211.3375       B5           0           0   
1      1      2  113781  151.5500  C22 C26           0           0   
2      1      2  113781  151.5500  C22 C26           0           0   
3      1      2  113781  151.5500  C22 C26           0           0   
4      1      2  113781  151.5500  C22 C26           0           0   

   embarked_S  survived  
0           1         1  
1           1         1  
2           1         0  
3           1     

### 分割成訓練資料集和測試資料集

In [9]:
# Randomly assign each row to the training set with probability 0.8, so the
# split is roughly (not exactly) 80/20. A fixed seed makes the assignment,
# and therefore the exported train/test files, identical on every run;
# the previous unseeded draw produced a different split each time.
SPLIT_SEED = 42
rng = np.random.default_rng(SPLIT_SEED)
mask = rng.random(len(df)) < 0.8

# Rows where the mask is True go to training; the remaining rows go to test.
df_train = df[mask]
df_test = df[~mask]
print("Train:",df_train.shape)
print("Test:",df_test.shape)

Train: (1030, 13)
Test: (279, 13)


In [10]:
# Write both splits to CSV, leaving out the row index so it does not
# become an extra column in the files.
for split_frame, csv_path in (
    (df_train, "titanic_train.csv"),
    (df_test, "titanic_test.csv"),
):
    split_frame.to_csv(csv_path, index=False)