In [242]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer


In [243]:
cancer = load_breast_cancer()
x_train, x_test, y_train, y_test = train_test_split(cancer.data, cancer.target, stratify = cancer.target, random_state=42)

tree = DecisionTreeClassifier(random_state=0)
tree.fit(x_train, y_train)

print(tree.score(x_train, y_train), tree.score(x_test,y_test))

1.0 0.9370629370629371


In [244]:
tree = DecisionTreeClassifier(max_depth=4, random_state =0)
tree.fit(x_train,y_train)

print(tree.score(x_train, y_train), tree.score(x_test,y_test))

0.9882629107981221 0.951048951048951


In [245]:
import pandas as pd 
import numpy as np

In [246]:
heart1= pd.read_csv('heart/heart.csv')
heart2 = pd.read_csv('heart/heart2.csv')


heart2['id_3'] = heart2['id_2']
heart2['id_4'] = heart2['id_3'].apply(lambda x : '{0:0>3}'.format(x))
heart2['id_5'] =  heart2['id'].map(str) + heart2['id_4'].map(str)
heart2['id'] = heart2['id_5']
heart2.drop(axis=1, columns = ['id_2', 'id_3','id_4','id_5'], inplace = True)

heart2['id'] = heart2['id'].astype('int64')
merge_heart = pd.merge(heart1, heart2 , on = 'id')    


mean_thalach = round(merge_heart['thalach'].mean(),0)
merge_heart.loc[merge_heart['thalach'].isna(), 'thalach'] = mean_thalach
merge_heart.loc[merge_heart['cp'].apply(lambda x: not x.isdigit()) , 'cp'] = '3'

merge_heart.dropna(subset=['age'],inplace=True)
merge_heart.loc[merge_heart['exang'].isna() , 'exang'] = 0
merge_heart.head()


Unnamed: 0,id,age,sex,trestbps,restecg,exang,ca,thal,target,cp,chol,fbs,thalach,oldpeak,slope
0,10001,63.0,1,145,0,0.0,0,1,1,3,233,1,150.0,2.3,0
1,10002,37.0,1,130,1,0.0,0,2,1,2,250,0,187.0,3.5,0
2,10003,41.0,0,130,0,0.0,0,2,1,1,204,0,172.0,1.4,2
3,10004,56.0,1,120,1,0.0,0,2,1,1,236,0,178.0,0.8,2
4,10005,57.0,0,120,1,1.0,0,2,1,0,354,0,163.0,0.6,2


In [247]:
#Train 전처리

In [468]:
train_data = pd.read_csv('titanic_train.csv')
test_data = pd.read_csv('titanic_test.csv')

# #Null값 체크
# column_list = train_data.columns
# column_list
# for i,j in enumerate(column_list):
#     print(j , train_data.loc[train_data[j].isna(), j ])

In [448]:
#Age, Cabin, Embarked 

In [469]:
#전처리 (성별, Embarked)
test_data.info()

train_data.loc[train_data['Sex'] == 'male' , 'Sex'] =0
train_data.loc[train_data['Sex'] == 'female' , 'Sex'] =1
temp_train = pd.get_dummies(train_data['Embarked'], prefix='Embarked')  
train_data = pd.concat([train_data, temp_train] ,axis=1)
train_data = train_data.drop(['Embarked'], axis= 1)


test_data.loc[test_data['Sex'] == 'male' , 'Sex'] =0
test_data.loc[test_data['Sex'] == 'female' , 'Sex'] =1
temp_test = pd.get_dummies(test_data['Embarked'], prefix='Embarked')  
test_data = pd.concat([test_data, temp_test] ,axis=1)
test_data = test_data.drop(['Embarked'], axis= 1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [472]:
#전처리 (나이)
# train_data.loc[train_data['Age'].isna(),'Age'] = train_data['Age'].median()
# train_data

temp_man = train_data.loc[train_data['Sex']==0]
temp_man['Age'].median()
temp_woman = train_data.loc[train_data['Sex']==1]
temp_woman['Age'].median()

train_data.loc[(train_data['Sex']==0) & (train_data['Age'].isna())] = temp_man['Age'].median()
train_data.loc[(train_data['Sex']==1) & (train_data['Age'].isna())] = temp_woman['Age'].median()

#나이,Fare 중간값
test_data.loc[test_data['Age'].isna(),'Age'] = test_data['Age'].median()
test_data.loc[test_data['Fare'].isna(),'Fare'] = test_data['Fare'].median()

In [474]:
from sklearn.model_selection import GridSearchCV

param_grid = {'max_depth' : [3,4,5,6,7,8,9], 'max_features' : [5,6,7,8,9,10,11,12,13,14,15]
               , 'n_estimators' : [500,1000,1500,2000,3000,5000], 'n_jobs' : [-1,1,4]}

In [None]:
## 수행
from sklearn.ensemble import RandomForestClassifier
# clf = RandomForestClassifier(n_estimators=1000, max_depth=5,
#                               random_state=0)
# clf.fit(x_train,y_train)
# print(clf.score(x_train,y_train))

grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5,return_train_score =True )
grid_search.fit(x_train,y_train)
print("테스트 세트 점수: {:.2f}".format(grid_search.score(X_test, y_test)))
print("최적 매개변수:", grid_search.best_params_)
print("최고 교차 검증 점수: {:.2f}".format(grid_search.best_score_))
print("최고 성능 모델:\n", grid_search.best_estimator_)


x_test = test_data[['Pclass','Age', 'Sex', 'SibSp', 'Parch', 'Fare','Embarked_C', 'Embarked_Q', 'Embarked_S']]

pred = grid_search.predict(x_test)
# pred = clf.predict(x_test)


# sub = pd.DataFrame(columns=[test_data['PassengerId'],pred])
sub = pd.DataFrame()
# sub.to_csv('Submit_titanic.csv', sep=',', encoding ='utf-8')
sub['PassengerId']=test_data['PassengerId']
sub['Survived'] = pred

sub.index = sub['PassengerId']
sub.drop(columns='PassengerId', inplace=True)
sub.to_csv('Submit_titanic.csv', sep=',', encoding ='utf-8')

In [451]:
# train_data

In [452]:
# train_data['Age'].count()
# train_data.loc[train_data['Age'].isna(), 'Age']
# 714 : 177 

In [453]:
# train_data['Cabin'].count()
# train_data.loc[train_data['Cabin'].isna(), 'Cabin']
# 204 : 687 

In [454]:
#원핫 인코딩
temp_train = pd.get_dummies(train_data['Embarked'], prefix='Embarked')  
train_data = pd.concat([train_data, temp_train] ,axis=1)
train_data = train_data.drop(['Embarked'], axis= 1)
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.2500,,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.9250,,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1000,C123,0,0,1
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.0500,,0,0,1
5,6,0,3,"Moran, Mr. James",0,,0,0,330877,8.4583,,0,1,0
6,7,0,1,"McCarthy, Mr. Timothy J",0,54.0,0,0,17463,51.8625,E46,0,0,1
7,8,0,3,"Palsson, Master. Gosta Leonard",0,2.0,3,1,349909,21.0750,,0,0,1
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1,27.0,0,2,347742,11.1333,,0,0,1
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",1,14.0,1,0,237736,30.0708,,1,0,0


In [455]:
#나이 중간값
# train_data.loc[train_data['Age'].isna(),'Age'] = train_data['Age'].median()
# train_data

temp_man = train_data.loc[train_data['Sex']==0]
temp_man['Age'].median()
temp_woman = train_data.loc[train_data['Sex']==1]
temp_woman['Age'].median()

train_data.loc[(train_data['Sex']==0) & (train_data['Age'].isna())] = temp_man['Age'].median()
train_data.loc[(train_data['Sex']==1) & (train_data['Age'].isna())] = temp_woman['Age'].median()

In [456]:
x_train = train_data[['Pclass','Age', 'Sex', 'SibSp', 'Parch', 'Fare','Embarked_C', 'Embarked_Q', 'Embarked_S']]
y_train = train_data['Survived']

In [457]:
#Test 전처리

In [467]:
test_data = pd.read_csv('titanic_test.csv')
#Null값 체크
# column_list = train_data.columns
# column_list
# for i,j in enumerate(column_list):
#     print(j , train_data.loc[train_data[j].isna(), j ])
# test_data

PassengerId Series([], Name: PassengerId, dtype: float64)
Survived Series([], Name: Survived, dtype: float64)
Pclass Series([], Name: Pclass, dtype: float64)
Name Series([], Name: Name, dtype: object)
Sex Series([], Name: Sex, dtype: float64)
Age Series([], Name: Age, dtype: float64)
SibSp Series([], Name: SibSp, dtype: float64)
Parch Series([], Name: Parch, dtype: float64)
Ticket Series([], Name: Ticket, dtype: object)
Fare Series([], Name: Fare, dtype: float64)
Cabin 0      NaN
2      NaN
4      NaN
7      NaN
8      NaN
9      NaN
12     NaN
13     NaN
14     NaN
15     NaN
16     NaN
18     NaN
20     NaN
22     NaN
24     NaN
25     NaN
30     NaN
33     NaN
34     NaN
35     NaN
37     NaN
38     NaN
39     NaN
40     NaN
41     NaN
43     NaN
44     NaN
49     NaN
50     NaN
51     NaN
      ... 
845    NaN
847    NaN
848    NaN
850    NaN
851    NaN
852    NaN
854    NaN
855    NaN
856    NaN
858    NaN
860    NaN
861    NaN
864    NaN
865    NaN
866    NaN
869    NaN
870    Na

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.2250,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0000,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.1500,,S


In [459]:
temp_test = pd.get_dummies(test_data['Embarked'], prefix='Embarked')  
test_data = pd.concat([test_data, temp_test] ,axis=1)
test_data = test_data.drop(['Embarked'], axis= 1)

test_data.loc[test_data['Sex'] == 'male' , 'Sex'] =0
test_data.loc[test_data['Sex'] == 'female' , 'Sex'] =1


In [460]:
#전처리 
test_data.info()

#나이,Fare 중간값
test_data.loc[test_data['Age'].isna(),'Age'] = test_data['Age'].median()
test_data.loc[test_data['Fare'].isna(),'Fare'] = test_data['Fare'].median()
test_data

#전처리 
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 13 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null int64
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked_C     418 non-null uint8
Embarked_Q     418 non-null uint8
Embarked_S     418 non-null uint8
dtypes: float64(2), int64(5), object(3), uint8(3)
memory usage: 34.0+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 13 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null int64
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null 

In [466]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=1000, max_depth=5,
                              random_state=0)
clf.fit(x_train,y_train)
print(clf.score(x_train,y_train))

x_test = test_data[['Pclass','Age', 'Sex', 'SibSp', 'Parch', 'Fare','Embarked_C', 'Embarked_Q', 'Embarked_S']]


pred = clf.predict(x_test)
# sub = pd.DataFrame(columns=[test_data['PassengerId'],pred])
sub = pd.DataFrame()
# sub.to_csv('Submit_titanic.csv', sep=',', encoding ='utf-8')
sub['PassengerId']=test_data['PassengerId']
sub['Survived'] = pred

sub.index = sub['PassengerId']
sub.drop(columns='PassengerId', inplace=True)
sub.to_csv('Submit_titanic.csv', sep=',', encoding ='utf-8')

0.9506172839506173


In [462]:
depth = [1,2,3,4,5,6,7,8,9,10]
for i in depth:
    clf = RandomForestClassifier(n_estimators=100, max_depth=i,
                                  random_state=0)
    clf.fit(x_train,y_train)
    print(clf.score(x_train,y_train))

0.6150392817059483
0.8249158249158249
0.8226711560044894
0.8552188552188552
0.8799102132435466
0.8843995510662177
0.8967452300785634
0.9113355780022446
0.9315375982042648
0.9450056116722784


In [463]:
#나이의 알고리즘 (실패)

In [283]:
# # x_age_train = train_data['']
# from sklearn.model_selection import train_test_split
# temp = train_data.loc[train_data['Age'].notnull()]
# x_age_train = temp[['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare','Embarked_C', 'Embarked_Q', 'Embarked_S']]
# y_age_train = temp['Age']
# # x_temp_train,y_temp_train,x_temp_test,y_temp_test = train_test_split(x_age_train,y_age_train, random_state =1,stratify = y_age_train )


In [210]:
# from sklearn.linear_model import LinearRegression
# RF_age = LinearRegression()
# RF_age.fit(x_age_train ,y_age_train)
# RF_age.score(x_age_train, y_age_train)

0.24360600811638844