In [5]:
import numpy as np
import pandas as pd

In [6]:
import os

for dirname, _, filenames in os.walk('/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [7]:
train_path = "./input/train.csv"
test_path = "./input/test.csv"

In [8]:
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [9]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


- survival -> 0 = No, 1 = Yes
- pclass -> 티켓 클래스 : 1 = 1st, 2 = 2nd, 3 = 3rd
- sex -> 성별 : 남성, 여성
- age -> 나이
- sibsp -> # of siblings / spouses aboard the Titanic
- parch -> # of parents / children aboard the Titanic
- ticket -> 티켓번호
- fare -> 탑승요금
- cabin -> 선실의 수
- embarked -> 정착지 : C = Cherbourg, Q = Queenstown, S = Southampton

In [10]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [17]:
(train.isna().sum() /len(train) * 100).sort_values(ascending = False) # 결측치 비율

Cabin          77.104377
Age            19.865320
Embarked        0.224467
PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
dtype: float64

In [20]:
train.nunique().sort_values() # 각 Feature 별 Unique 수

Survived         2
Sex              2
Pclass           3
Embarked         3
SibSp            7
Parch            7
Age             88
Cabin          147
Fare           248
Ticket         681
PassengerId    891
Name           891
dtype: int64

### 데이터 처리 -결측치 (Missing Value)

- Cabin : 77%의 결측치가 존재한다. 3/4가 넘는 Missing Value가 존재하기 때문에 사용하지 않기로 결정.
- Age : 20%의 결측치가 존재한다. 20%의 결측치는 특정 전략을 사용하여 근사치로 채우기로 결정.
- Embarked : 0.2% 의 결측치가 존재한다. 매우 낮은 비율의 결측치는 특정 전략으로 채우더라도 모델에 미치는 영향이 거의 없으므로 결측치가 존재하는 데이터는 drop 하기로 결정.

### 데이터 처리 - 범주형 자료 (Categorical Data)

- Sex : 2가지 밖에 없으므로 label encoder를 활용 (문자를 수치화)
- Name : 영향을 주지 않는 value 이므로 drop.
- Ticket : 마찬가지로 영향을 주지 않는 value 이므로 drop.
- Cabin : Drop, Too much missing
- Embarked : 3가지의 possible values 이므로 0,1,2로 encoding

In [21]:
train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [22]:
def cleanData(data):
    # Drop 할 Value
    data.drop(['Cabin','Name','Ticket'], axis=1, inplace=True)
    
    # Data missing Case2
    data['Age'] = data.groupby(['Pclass','Sex'])['Age'].transform(lambda x : x.fillna(x.median())) # 중앙값으로 대체
    
    # Data (Fare)
    data['Fare'] = data.groupby(['Pclass','Sex'])['Fare'].transform(lambda x: x.fillna(x.median())) # 중앙값으로 대체
    
    # Embarked의 결측치 제거
    data.dropna(axis=0, subset=['Embarked'], inplace=True)
    
    # Sex
    data['Sex'].replace({'male':0, 'female':1}, inplace=True)
    
    # Embarked
    data['Embarked'].replace({'S':0, 'C':1, 'Q':2}, inplace=True)
    
    return data

In [23]:
clean_train = cleanData(train)
clean_test = cleanData(test)

In [25]:
clean_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Sex          889 non-null    int64  
 4   Age          889 non-null    float64
 5   SibSp        889 non-null    int64  
 6   Parch        889 non-null    int64  
 7   Fare         889 non-null    float64
 8   Embarked     889 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 69.5 KB


### Modeling

In [32]:
# imports 
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

y = train['Survived']
X = pd.get_dummies(train.drop('Survived', axis=1))

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

In [33]:
def fit(model):
    model.fit(X_train, y_train)
    prediction = model.predict(X_val)
    return accuracy_score(y_val, prediction)

In [36]:
# Let some models

model1 = LogisticRegression(solver='liblinear')
model2 = GradientBoostingClassifier()
model3 = RandomForestClassifier()
model4 = SGDClassifier()
model5 = SVC()

models = [model1, model2, model3, model4, model5]
i = 0

for model in models:
    i+=1
    print("Model ", i, ":", model)
    print("ACC: ", fit(model))

Model  1 : LogisticRegression(solver='liblinear')
ACC:  0.7752808988764045
Model  2 : GradientBoostingClassifier()
ACC:  0.8146067415730337
Model  3 : RandomForestClassifier()
ACC:  0.8033707865168539
Model  4 : SGDClassifier()
ACC:  0.5955056179775281
Model  5 : SVC()
ACC:  0.6123595505617978


In [37]:
model = GradientBoostingClassifier(min_samples_split=20, min_samples_leaf=60, max_depth=3, max_features=7)
fit(model)

0.8089887640449438

In [39]:
predict = model2.predict(pd.get_dummies(clean_test))

output = pd.DataFrame({'PassengerId': clean_test.PassengerId, 'Survived': predict})
output.to_csv('1_submission.csv', index=False)
print("Submission saved")

Submission saved


### GridSearch

In [40]:
from sklearn.model_selection import GridSearchCV

In [43]:
params = {
    'min_samples_split' : [15,20,25,30] ,
    'min_samples_leaf' : [5,7,8,10,60], 
    'max_depth' : [1,2,3,4,5], 
    'max_features' : [3,4,5,6,7]
}

gbc = GradientBoostingClassifier()

grid = GridSearchCV(gbc, param_grid = params, cv=5, refit=True)
grid.fit(X_train, y_train)

print('best parameters : ', grid.best_params_)
print('best score : ', grid.best_score_)

em = grid.best_estimator_
pred = em.predict(X_val)
accuracy_score(y_val, pred)

best parameters :  {'max_depth': 3, 'max_features': 4, 'min_samples_leaf': 8, 'min_samples_split': 20}
best score :  0.8270067960208806


0.8146067415730337

In [44]:
model = GradientBoostingClassifier(min_samples_split=20, min_samples_leaf=8, max_depth=3, max_features=4)
fit(model)

0.8033707865168539

In [45]:
predict = model.predict(pd.get_dummies(clean_test))

output = pd.DataFrame({'PassengerId': clean_test.PassengerId, 'Survived': predict})
output.to_csv('1_2_submission.csv', index=False)
print("Submission saved")

Submission saved
