# 예측모델 V.2 (High Score 0.76076)

기존에 주어진 컬럼들 중

- **`Pclass`** 와 **`Sex`** 만 이용하여 생존 여부를 예측  + `One-hot Encoding`

- **`Fare`** 를 `Categorize` 하여 추가하기 + `One-hot Encoding`


# csv 파일 읽기

In [1]:
import pandas as pd

# Load the training and test sets from the local input directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./input/train.csv', './input/test.csv')
)


In [2]:
# Peek at the first rows of the training set (5 rows is head()'s default).
df_train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Print column dtypes and non-null counts to spot columns with missing values.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [4]:
# Split the training fares into quartiles and keep the resulting bin edges.
df_train['CategoricalFare'], fare_bin_edges = pd.qcut(df_train['Fare'], 4, retbins=True)

# Bin the test fares with the SAME edges learned from the training set.
# Building this column from df_train['Fare'] would align on the row index and
# give each test row the bin of an unrelated training passenger.
# Test fares outside the training range (or missing) become NaN here.
df_test['CategoricalFare'] = pd.cut(df_test['Fare'], bins=fare_bin_edges, include_lowest=True)

print( df_train[['Fare','CategoricalFare']].head())

      Fare  CategoricalFare
0   7.2500   (-0.001, 7.91]
1  71.2833  (31.0, 512.329]
2   7.9250   (7.91, 14.454]
3  53.1000  (31.0, 512.329]
4   8.0500   (7.91, 14.454]


In [5]:
# Mean survival rate within each fare quartile.
survival_by_fare = df_train.groupby('CategoricalFare')[['Survived']].mean()
print( survival_by_fare )

                 Survived
CategoricalFare          
(-0.001, 7.91]   0.197309
(7.91, 14.454]   0.303571
(14.454, 31.0]   0.454955
(31.0, 512.329]  0.581081


In [6]:
# Keep only the Pclass, Sex and Fare features (plus the Survived label for
# the training set).
# .copy() makes each result an independent frame, so the column assignments
# performed later in the notebook do not raise pandas' SettingWithCopyWarning.

df_train = df_train[['Survived','Pclass','Sex','Fare']].copy()
df_test = df_test[['Pclass','Sex','Fare']].copy()

df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Fare
0,0,3,male,7.25
1,1,1,female,71.2833
2,1,3,female,7.925
3,1,1,female,53.1
4,0,3,male,8.05


# Null Check

In [7]:
# Report, for every training column, the percentage of missing values.
row_template = 'column: {:>10}\t Percent of NaN value: {:.2f}%'
for col in df_train.columns:
    nan_count = df_train[col].isnull().sum()
    nan_percent = 100 * (nan_count / df_train[col].shape[0])
    print(row_template.format(col, nan_percent))

column:   Survived	 Percent of NaN value: 0.00%
column:     Pclass	 Percent of NaN value: 0.00%
column:        Sex	 Percent of NaN value: 0.00%
column:       Fare	 Percent of NaN value: 0.00%


In [8]:
# Re-check dtypes and non-null counts after narrowing to the selected columns.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Fare        891 non-null float64
dtypes: float64(1), int64(2), object(1)
memory usage: 27.9+ KB


# Data Cleaning

## 2) Fare

In [9]:
def category_fare(x):
    """Map a fare to an ordinal bin index in 0..3.

    The bins are closed on the right: fares up to 7.91 give 0, up to 14.454
    give 1, up to 31.0 give 2, and anything larger gives 3.

    Note: a NaN fare fails every ``<=`` comparison and therefore also
    returns 3.
    """
    upper_bounds = (7.91, 14.454, 31.0)
    for bin_index, bound in enumerate(upper_bounds):
        if x <= bound:
            return bin_index
    # Above every bound (or not comparable, e.g. NaN): last bin.
    return len(upper_bounds)

# category_fare() returns 3 for NaN (every `<=` comparison with NaN is False),
# so a missing fare would silently be treated as the most expensive bin.
# Only df_train is null-checked above; df_test is not.
# NOTE(review): the Kaggle Titanic test.csv is reported to contain a missing
# Fare -- confirm.
# Impute with the training median (computed before Fare is overwritten) so
# missing fares get a representative bin instead.
fare_median = df_train['Fare'].median()

df_train['Fare'] = df_train['Fare'].fillna(fare_median).apply(category_fare)
df_test['Fare'] = df_test['Fare'].fillna(fare_median).apply(category_fare)

In [10]:
# Confirm that Fare now holds the bin index instead of the raw amount.
df_train.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Fare
0,0,3,male,0
1,1,1,female,3
2,1,3,female,1
3,1,1,female,3
4,0,3,male,1


# one-hot Encoding

## 1) Pclass, Sex, Fare

In [11]:
# One-hot encode Pclass, Sex and the binned Fare in one pass.  With `columns`
# given and no explicit prefix, get_dummies prefixes every indicator with its
# source column name, so the outputs are still Pclass_*, Sex_*, Fare_* in
# that order.
encoded_columns = ['Pclass', 'Sex', 'Fare']
df_train = pd.get_dummies(df_train, columns=encoded_columns)
df_test = pd.get_dummies(df_test, columns=encoded_columns)

# Train and test are encoded independently, so a category that occurs in only
# one of them would leave the two frames with different columns and the model
# would receive misaligned features.  Force the test frame onto the training
# feature layout; indicators absent from the test set are filled with 0.
df_test = df_test.reindex(columns=df_train.columns.drop('Survived'), fill_value=0)

In [12]:
# Inspect the first 20 rows of the one-hot encoded training frame.
df_train.head(n=20)

Unnamed: 0,Survived,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Fare_0,Fare_1,Fare_2,Fare_3
0,0,0,0,1,0,1,1,0,0,0
1,1,1,0,0,1,0,0,0,0,1
2,1,0,0,1,1,0,0,1,0,0
3,1,1,0,0,1,0,0,0,0,1
4,0,0,0,1,0,1,0,1,0,0
5,0,0,0,1,0,1,0,1,0,0
6,0,1,0,0,0,1,0,0,0,1
7,0,0,0,1,0,1,0,0,1,0
8,1,0,0,1,1,0,0,1,0,0
9,1,0,1,0,1,0,0,0,1,0


# 모델 만들기

In [13]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn import metrics 
from sklearn.model_selection import train_test_split

In [14]:
# Separate the label from the features and convert everything to NumPy arrays.
target_label = df_train['Survived'].values
feature_columns = [name for name in df_train.columns if name != 'Survived']
X_train = df_train[feature_columns].values
X_test = df_test.values


In [15]:
# Hold out 30% of the labelled rows for validation; the seed fixes the split.
holdout_split = train_test_split(
    X_train,
    target_label,
    test_size=0.3,
    random_state=2018,
)
X_tr, X_vld, y_tr, y_vld = holdout_split

In [16]:
# Seed the forest so the validation accuracy and the submission file can be
# reproduced between runs; the same seed is used for the train/validation split.
model = RandomForestClassifier(random_state=2018)

In [17]:
# Display the training split of the feature matrix.
X_tr

array([[1, 0, 0, ..., 0, 0, 1],
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 1, 0, ..., 1, 0, 0],
       [0, 0, 1, ..., 1, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]], dtype=uint8)

In [18]:
# Fit on the training split, then predict the held-out validation split.
model.fit(X_tr, y_tr)
prediction = model.predict(X_vld)

# The message below is Korean for
# "Predicted survival for N people with X% accuracy".
n_validation = y_vld.shape[0]
accuracy_pct = 100 * metrics.accuracy_score(prediction, y_vld)
print('총 {}명 중 {:.2f}% 정확도로 생존을 맞춤'.format(n_validation, accuracy_pct))


총 268명 중 83.21% 정확도로 생존을 맞춤


In [19]:
from pandas import Series
import matplotlib.pyplot as plt

# Label each importance with its feature name; df_test holds exactly the
# feature columns (it has no Survived column).
importances = Series(model.feature_importances_, index=df_test.columns)
ranked = importances.sort_values(ascending=True)

# Horizontal bar chart, least important feature at the bottom.
plt.figure(figsize=(8, 8))
ranked.plot.barh()
plt.xlabel('Feature importance')
plt.ylabel('Feature')
plt.show()

<Figure size 800x800 with 1 Axes>

In [20]:
# Use the sample submission as a template for the output file.
submission = pd.read_csv('./input/gender_submission.csv')

# Preview its layout.
submission.head(n=5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [21]:
# Overwrite the template's Survived column with the model's test predictions.
test_predictions = model.predict(X_test)
submission = submission.assign(Survived=test_predictions)

In [22]:
# Share of test passengers predicted to survive (mean of the 0/1 predictions).
submission['Survived'].mean()

0.30861244019138756

# 제출용 csv 파일 만들기

In [23]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing the submission file.
import os

os.makedirs('./output', exist_ok=True)
submission.to_csv('./output/v2 - one-hot Encoding.csv', index=False)