In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

# 학습을 위한 데이터(csv) 불러오기
- csv파일로 정리된 데이터를 가져옵니다.

In [2]:
# Read the feature table (one row per patient) from the CSV file.
data = pd.read_csv(filepath_or_buffer='results0410_m.csv')

- fillna(0)는 NaN인 데이터를 0으로 채우라는 것입니다. 데이터의 규모가 크지 않다면 csv파일을 미리 검토하여, 숫자가 아닌 데이터를 적절한 값으로 변경해주는 것이 좋습니다. 적절하지 못하거나 누락된 데이터를 다루는 방법은 다양하지만 여기서는 0으로 채울 것입니다.

In [3]:
# Replace every NaN with 0 (the simple imputation strategy chosen for this demo),
# then preview the first four rows.
data = data.fillna(value=0)
data.head(4)

Unnamed: 0,Patient ID,Label,original_firstorder_10Percentile,original_firstorder_90Percentile,original_firstorder_Energy,original_firstorder_Entropy,original_firstorder_InterquartileRange,original_firstorder_Kurtosis,original_firstorder_Maximum,original_firstorder_Mean,...,log-sigma-5-0-mm-3D_glszm_LargeAreaLowGrayLevelEmphasis,log-sigma-5-0-mm-3D_glszm_LowGrayLevelZoneEmphasis,log-sigma-5-0-mm-3D_glszm_SizeZoneNonUniformity,log-sigma-5-0-mm-3D_glszm_SizeZoneNonUniformityNormalized,log-sigma-5-0-mm-3D_glszm_SmallAreaEmphasis,log-sigma-5-0-mm-3D_glszm_SmallAreaHighGrayLevelEmphasis,log-sigma-5-0-mm-3D_glszm_SmallAreaLowGrayLevelEmphasis,log-sigma-5-0-mm-3D_glszm_ZoneEntropy,log-sigma-5-0-mm-3D_glszm_ZonePercentage,log-sigma-5-0-mm-3D_glszm_ZoneVariance
0,10507209,3,-46.477375,258.599425,10492580000.0,6.288854,205.941402,1.931149,469.641244,108.119006,...,6.437638,0.002342,1216.548156,0.189287,0.428052,356.782181,0.000875,8.269472,0.110195,1826.995332
1,11038454,3,-34.419225,193.97293,80924860000.0,5.926278,138.825455,2.122706,461.369427,64.961432,...,517.399532,0.002807,2625.905057,0.16095,0.393305,239.910789,0.001047,8.429233,0.028294,364993.8789
2,11117389,1,-34.940415,145.752348,45370910000.0,5.808166,100.146822,3.299762,428.31401,46.879163,...,468.503301,0.002494,1116.406621,0.157952,0.39083,288.51646,0.000996,8.492476,0.019557,327829.5517
3,11493583,2,-19.402685,522.313469,855033200.0,7.022576,305.881341,2.335018,821.665003,212.933828,...,0.530733,0.00523,195.537517,0.266763,0.52372,294.065533,0.002938,7.042645,0.260206,53.982207


## Data(X)와 Label(y)을 나누기
- 항목이름이 'Label'인 데이터를 y로 가져오고, Patient ID와 같이 학습에 활용되지 않는 데이터는 제외하고, X를 데이터로 만듭니다.
- .pop은 해당 열을 DataFrame에서 빼내어 다른 변수에 저장합니다. 'Patient ID' 외에도 fitting에 사용할 필요가 없는 변수들은 .pop이나 .drop으로 빼주도록 합니다.

In [4]:
# Detach the target column: pop() removes 'Label' from `data` and returns it,
# so the remaining frame no longer contains the answer.
y = data.pop('Label')
y.head(4)

0    3
1    3
2    1
3    2
Name: Label, dtype: int64

In [5]:
# Detach the patient identifier as well; it is kept aside in `pid`
# and is not used as a model feature.
pid = data.pop('Patient ID')
pid.head(4)

0    10507209
1    11038454
2    11117389
3    11493583
Name: Patient ID, dtype: int64

In [6]:
# Whatever is left in `data` after the pops above is the feature matrix.
# Note that X is the same object as `data`, not a copy.
X = data
X.head(4)

Unnamed: 0,original_firstorder_10Percentile,original_firstorder_90Percentile,original_firstorder_Energy,original_firstorder_Entropy,original_firstorder_InterquartileRange,original_firstorder_Kurtosis,original_firstorder_Maximum,original_firstorder_Mean,original_firstorder_MeanAbsoluteDeviation,original_firstorder_Median,...,log-sigma-5-0-mm-3D_glszm_LargeAreaLowGrayLevelEmphasis,log-sigma-5-0-mm-3D_glszm_LowGrayLevelZoneEmphasis,log-sigma-5-0-mm-3D_glszm_SizeZoneNonUniformity,log-sigma-5-0-mm-3D_glszm_SizeZoneNonUniformityNormalized,log-sigma-5-0-mm-3D_glszm_SmallAreaEmphasis,log-sigma-5-0-mm-3D_glszm_SmallAreaHighGrayLevelEmphasis,log-sigma-5-0-mm-3D_glszm_SmallAreaLowGrayLevelEmphasis,log-sigma-5-0-mm-3D_glszm_ZoneEntropy,log-sigma-5-0-mm-3D_glszm_ZonePercentage,log-sigma-5-0-mm-3D_glszm_ZoneVariance
0,-46.477375,258.599425,10492580000.0,6.288854,205.941402,1.931149,469.641244,108.119006,100.628664,108.113241,...,6.437638,0.002342,1216.548156,0.189287,0.428052,356.782181,0.000875,8.269472,0.110195,1826.995332
1,-34.419225,193.97293,80924860000.0,5.926278,138.825455,2.122706,461.369427,64.961432,72.110523,50.306098,...,517.399532,0.002807,2625.905057,0.16095,0.393305,239.910789,0.001047,8.429233,0.028294,364993.8789
2,-34.940415,145.752348,45370910000.0,5.808166,100.146822,3.299762,428.31401,46.879163,58.118038,35.856661,...,468.503301,0.002494,1116.406621,0.157952,0.39083,288.51646,0.000996,8.492476,0.019557,327829.5517
3,-19.402685,522.313469,855033200.0,7.022576,305.881341,2.335018,821.665003,212.933828,168.082255,171.534056,...,0.530733,0.00523,195.537517,0.266763,0.52372,294.065533,0.002938,7.042645,0.260206,53.982207


## Validation set을 나누고, model의 파라미터들을 설정하기
- ML model의 overfitting 여부를 확인하기 위해, data의 일부를 validation set으로 빼서 validation set의 accuracy를 관찰하는 것이 좋습니다.
- test_size = 0.25의 0.25를 적절한 비율로 변경하면 됩니다. 이 경우 25%의 데이터를 빼게 됩니다.
- class imbalance가 심한 경우에는 아래의 방법을 그대로 쓰는 것을 권장하지 않습니다. 관련해서는 추후 업데이트 예정 
- RandomForestClassifier는 n_estimators, max_depth등 다양한 파라미터들을 가지고 이에 따라 결과가 달라지는 것을 주의합니다.

In [7]:
# Use a fixed seed so that Restart Kernel -> Run All reproduces the same split
# and the same forest. The previous np.random.randint(42) drew a different value
# in 0..41 on every run, so results could not be reproduced. 40 is the
# random_state shown in the recorded model output of this notebook.
seed = 40

# Hold out 25% of the rows as a validation set; the same seed is reused for the
# classifier so the whole experiment is controlled by this one value.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, random_state=seed
)

In [8]:
# Small, shallow forest: 50 trees, each limited to depth 3.
# random_state reuses `seed` so the fitted model is tied to the same seed as the split.
clf = RandomForestClassifier(
    n_estimators=50,
    max_depth=3,
    random_state=seed,
)

## Model을 학습시키기

In [9]:
# Train on the training split only. fit() returns the estimator itself,
# which is why the cell output shows the model's parameters.
# To train on all rows without holding out a validation set, fit on (X, y) instead.
clf.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=3, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=40, verbose=0,
                       warm_start=False)

## Validation set에 대한 예측결과를 출력하기

In [10]:
# Predicted class label for every row of the held-out validation set.
p_valid = clf.predict(X=X_valid)
p_valid

array([1, 1, 1, 1, 2, 1, 3, 3, 3, 1, 3, 3, 1, 3, 1, 2, 1, 1])

In [11]:
# Pair each true label with its prediction, ready to be turned into a table.
d = dict(label=y_valid, prediction=p_valid)

In [12]:
# Side-by-side table of true label vs. prediction; the row index is inherited
# from y_valid, so it shows the original row numbers of the validation samples.
r_valid = pd.DataFrame(d)
r_valid

Unnamed: 0,label,prediction
45,1,1
0,3,1
63,3,1
52,3,1
26,2,2
21,3,1
4,3,3
24,2,3
68,2,3
38,1,1


In [13]:
# Rows are true labels, columns are predicted labels, both in the order 1, 2, 3.
confusion_matrix(y_true=y_valid, y_pred=p_valid, labels=[1, 2, 3])

array([[4, 1, 1],
       [1, 1, 2],
       [5, 0, 3]])

# 새로운 데이터(csv)를 평가하기
- 학습된 모델로 새로운 데이터를 평가하기 위해, 같은 포맷으로 만들어진 별도의 csv파일을 불러옵니다. (이 예제에서는 작동하는 것을 보기 위해 훈련에 사용한 csv파일을 다시 불러오지만, 이 경우 결과는 실제 성능보다 좋게 나옵니다.)
- predict_proba는 각각의 label에 대한 output의 probability를 나타냅니다. 이는 ROC curve같은 것을 그릴 때에 활용될 수 있습니다.
- 결과는 to_csv를 이용해 csv파일로 저장할 수 있습니다.

In [14]:
# Load the CSV to be scored. It must have the same columns as the training file.
# This demo simply re-reads the training file, so the scores below are optimistic.
data = pd.read_csv('results0410_m.csv')
data = data.fillna(0)  # same NaN -> 0 rule that was applied before training
y_test = data.pop('Label')
pid = data.pop('Patient ID')
X_test = data

# Hard class predictions and per-class probabilities for every row.
p_test = clf.predict(X_test)
prob_test = clf.predict_proba(X_test)

# The columns of predict_proba follow clf.classes_, so each probability column is
# named after the class it actually belongs to. Assuming positions 0/1/2 are
# labels 1/2/3 would mislabel columns, or raise IndexError, whenever a class is
# absent from the training split.
d = {'ID': pid, 'label': y_test, 'prediction': p_test}
for col_idx, class_label in enumerate(clf.classes_):
    d['prob' + str(class_label)] = prob_test[:, col_idx]

r_test = pd.DataFrame(data=d)
r_test[:4]

Unnamed: 0,ID,label,prediction,prob1,prob2,prob3
0,10507209,3,1,0.630152,0.065763,0.304085
1,11038454,3,3,0.392145,0.01253,0.595324
2,11117389,1,1,0.635824,0.01295,0.351226
3,11493583,2,2,0.073333,0.90092,0.025747


In [15]:
# Persist the per-patient predictions and probabilities (row index included).
r_test.to_csv(path_or_buf='test_results.csv')

In [16]:
# Rows are true labels, columns are predicted labels, both in the order 1, 2, 3.
confusion_matrix(y_true=y_test, y_pred=p_test, labels=[1, 2, 3])

array([[29,  1,  1],
       [ 1, 11,  2],
       [ 5,  0, 22]])