In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

In [10]:
test1 = pd.read_csv("titanic/test.csv")
test1

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [11]:
test1['Embarked'].fillna('S', inplace = True)
test1['Fare'].fillna(0, inplace=True)
test1['Fare'] = test1['Fare'].map(lambda x : np.log(x) if x > 0 else 0)
test1['Initial'] = test1['Name'].str.extract('([A-Za-z]+)\.')
test1['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don', 'Dona'],['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr','Other'],inplace=True)
mapping = {
    "Mr":0,
    "Miss":1,
    "Mrs" : 1,
    "Master":2,
    "Other":3
}

test1['Initial'] = test1['Initial'].map(mapping)

In [12]:
mapping_sex = {
    'male' : 0,
    'female': 1
}

mapping_em = {
    'S' :0,
    'C' :1,
    'Q' :2
}


test1['Sex'] = test1['Sex'].map(mapping_sex)
test1['Embarked'] = test1['Embarked'].map(mapping_em)


test1.drop(['PassengerId', "Ticket", "Cabin", "Name"], axis = 1, inplace = True)

In [13]:
test1.groupby('Initial')['Age'].mean()

Initial
0    32.114130
1    30.203095
2     7.406471
3    42.000000
Name: Age, dtype: float64

In [14]:

test1.loc[ (test1['Age'].isnull()) & (test1['Initial'] == 0), 'Age' ] = 32
test1.loc[ (test1['Age'].isnull()) & (test1['Initial'] == 1), 'Age' ] = 28
test1.loc[ (test1['Age'].isnull()) & (test1['Initial'] == 2), 'Age' ] = 5
test1.loc[ (test1['Age'].isnull()) & (test1['Initial'] == 3), 'Age' ] = 45
y = test1['Survived']
X = test1.drop('Survived', axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)
pred = rf.predict(X_test)
print("정확도 :{0:.3f}".format(accuracy_score(y_test, pred)))

gb_param_grid = {
    'n_estimators' : [100, 200],
    'max_depth' : [6, 8, 10, 12],
    'min_samples_leaf' : [3, 5, 7, 10],
    'min_samples_split' : [2, 3, 5, 10]
}
gb = GradientBoostingClassifier(random_state=0)
gb.fit(X_train, y_train)


gb_grid = GridSearchCV(gb, param_grid = gb_param_grid, scoring="accuracy", n_jobs= -1, verbose = 1)
gb_grid.fit(X_train, y_train)




KeyError: 'Survived'

In [None]:


rfAnswer = rf.predict(test1)
gbAnswer = gb_grid.predict(test1)
answer1 = pd.read_csv("titanic/gender_submission.csv")
answer2 = answer1.copy()
answer1['Survived'] = rfAnswer
answer2['Survived'] = gbAnswer
answer1.to_csv("rfsubmission.csv",index = False)
answer2.to_csv("gbsubmission.csv",index = False)


In [15]:
# 가장 좋은 파라미터들로 모델 생성
gb2 = GradientBoostingClassifier(n_estimators=100,
                                 max_depth=6,
                                 min_samples_leaf=10,
                                 min_samples_split=2)
gb2.fit(X_train, y_train)


# 타이타닉의 테스트 데이터를 입력
test = pd.read_csv('titanic/test.csv')

# 전처리....
test['Embarked'].fillna('S', inplace = True)
test['Fare'].fillna(0, inplace=True)
test['Fare'] = test['Fare'].map(lambda x : np.log(x) if x > 0 else 0)

test['Initial'] = test['Name'].str.extract('([A-Za-z]+)\.')
test['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don', 'Dona'],
                        ['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr','Other'],
                        inplace=True)
mapping = {
    "Mr":0,
    "Miss":1,
    "Mrs" : 1,
    "Master":2,
    "Other":3
}

test['Initial'] = test['Initial'].map(mapping)

mapping_sex = {
    'male' : 0,
    'female': 1
}

mapping_em = {
    'S' :0,
    'C' :1,
    'Q' :2
}

test['Sex'] = test['Sex'].map(mapping_sex)
test['Embarked'] = test['Embarked'].map(mapping_em)


test.drop(['PassengerId', "Ticket", "Cabin", "Name"], axis = 1, inplace = True)

test.groupby('Initial')['Age'].mean()

test.loc[ (test['Age'].isnull()) & (test['Initial'] == 0), 'Age' ] = 32
test.loc[ (test['Age'].isnull()) & (test['Initial'] == 1), 'Age' ] = 28
test.loc[ (test['Age'].isnull()) & (test['Initial'] == 2), 'Age' ] = 5
test.loc[ (test['Age'].isnull()) & (test['Initial'] == 3), 'Age' ] = 45

# test 데이터 결과 예측
pred = gb2.predict(test)

# 제출 파일 만들기
sub = pd.read_csv('titanic/test.csv')[['PassengerId']]
pred = pd.Series(pred)
sub['Survived'] = pred
sub.to_csv('sub_titanic_03.csv', index=False)

NameError: name 'X_train' is not defined

# 창호씨 방식

In [79]:
data1 = pd.read_csv('titanic/test.csv')
data = pd.read_csv('titanic/train.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [80]:
data['Embarked'].fillna('S', inplace = True)
data['Fare'].fillna(0, inplace=True)
data['Fare'] = data['Fare'].map(lambda x : np.log(x) if x > 0 else 0)

In [81]:
data['Initial'] = data['Name'].str.extract('([A-Za-z]+)\.')
data['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don', 'Dona'],['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr','Other'],inplace=True)
mapping = {
    "Mr":0,
    "Miss":1,
    "Mrs" : 1,
    "Master":2,
    "Other":3
}

data['Initial'] = data['Initial'].map(mapping)

In [82]:
mapping_sex = {
    'male' : 0,
    'female': 1
}

mapping_em = {
    'S' :0,
    'C' :1,
    'Q' :2
}


data['Sex'] = data['Sex'].map(mapping_sex)
data['Embarked'] = data['Embarked'].map(mapping_em)


data.drop(['PassengerId', "Ticket", "Cabin", "Name"], axis = 1, inplace = True)

In [83]:
data.groupby('Initial')['Age'].mean()

Initial
0    32.739609
1    27.834615
2     4.574167
3    45.888889
Name: Age, dtype: float64

In [84]:
data.loc[ (data['Age'].isnull()) & (data['Initial'] == 0), 'Age' ] = 32
data.loc[ (data['Age'].isnull()) & (data['Initial'] == 1), 'Age' ] = 28
data.loc[ (data['Age'].isnull()) & (data['Initial'] == 2), 'Age' ] = 5
data.loc[ (data['Age'].isnull()) & (data['Initial'] == 3), 'Age' ] = 45

In [85]:
y = data['Survived']
X = data.drop('Survived', axis = 1)

In [86]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

In [87]:
gb_param_grid = {
    'n_estimators' : [100],
    'max_depth' : [6],
    'min_samples_leaf' : [10],
    'min_samples_split' : [2]
}

In [88]:
gb = GradientBoostingClassifier(random_state=0)
gb.fit(X_train, y_train)

In [89]:
gb_grid = GridSearchCV(gb, param_grid = gb_param_grid, scoring="accuracy", n_jobs= -1, verbose = 1)
gb_grid.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [90]:
gb_grid.best_score_

0.8272234807446074

In [91]:
gb_grid.best_params_

{'max_depth': 6,
 'min_samples_leaf': 10,
 'min_samples_split': 2,
 'n_estimators': 100}

gb_grid.predict(data1)

In [92]:
data1['Embarked'].fillna('S', inplace = True)
data1['Fare'].fillna(0, inplace=True)
data1['Fare'] = data1['Fare'].map(lambda x : np.log(x) if x > 0 else 0)
data1['Initial'] = data1['Name'].str.extract('([A-Za-z]+)\.')
data1['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don', 'Dona'],['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr','Other'],inplace=True)
mapping = {
    "Mr":0,
    "Miss":1,
    "Mrs" : 1,
    "Master":2,
    "Other":3
}
data1['Initial'] = data1['Initial'].map(mapping)

In [93]:
apping_sex = {
    'male' : 0,
    'female': 1
}

mapping_em = {
    'S' :0,
    'C' :1,
    'Q' :2
}


data1['Sex'] = data1['Sex'].map(mapping_sex)
data1['Embarked'] = data1['Embarked'].map(mapping_em)


data1.drop(['PassengerId', "Ticket", "Cabin", "Name"], axis = 1, inplace = True)

In [94]:
data1.groupby('Initial')['Age'].mean()

Initial
0    32.114130
1    30.203095
2     7.406471
3    42.000000
Name: Age, dtype: float64

In [95]:
data1.loc[ (data1['Age'].isnull()) & (data1['Initial'] == 0), 'Age' ] = 32
data1.loc[ (data1['Age'].isnull()) & (data1['Initial'] == 1), 'Age' ] = 28
data1.loc[ (data1['Age'].isnull()) & (data1['Initial'] == 2), 'Age' ] = 5
data1.loc[ (data1['Age'].isnull()) & (data1['Initial'] == 3), 'Age' ] = 45

In [96]:
data1[data1.Age.isnull()] = data1.Age.mean()

In [97]:
pred = gb_grid.predict(data1)

In [98]:
pred = pred.tolist()

In [99]:
predfile = pd.DataFrame(data= pred, columns = ['Survived'])

In [100]:
data1 = pd.read_csv('titanic/test.csv')

In [101]:
predfile = pd.concat([data1['PassengerId'], predfile], axis = 1)

In [102]:
predfile.to_csv("johnson.csv", index = False)