# 유방암 모델 만들기

In [1]:
import pandas as pd 
import joblib

In [2]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

트레인셋

In [4]:
df_train = pd.read_csv('../static/data/classification/cancer_train.csv')
df_train.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,25.73,17.46,174.2,2010.0,0.1149,0.2363,0.3368,0.1913,0.1956,0.06121,...,23.58,229.3,3234.0,0.153,0.5937,0.6451,0.2756,0.369,0.08815,0
1,13.5,12.71,85.69,566.2,0.07376,0.03614,0.002758,0.004419,0.1365,0.05335,...,16.94,95.48,698.7,0.09023,0.05836,0.01379,0.0221,0.2267,0.06192,1
2,17.35,23.06,111.0,933.1,0.08662,0.0629,0.02891,0.02837,0.1564,0.05307,...,31.47,128.2,1218.0,0.124,0.1486,0.1211,0.08235,0.2452,0.06515,0
3,11.61,16.02,75.46,408.2,0.1088,0.1168,0.07097,0.04497,0.1886,0.0632,...,19.67,81.93,475.7,0.1415,0.217,0.2302,0.1105,0.2787,0.07427,1
4,11.87,21.54,76.83,432.0,0.06613,0.1064,0.08777,0.02386,0.1349,0.06612,...,28.18,83.51,507.2,0.09457,0.3399,0.3218,0.0875,0.2305,0.09952,1


In [5]:
X_train = df_train.drop(columns='target', axis=1)
y_train = df_train.target.values

정규화

In [7]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_train.shape, y_train.shape

((426, 30), (426,))

테스트셋

In [8]:
# 정규화까지 한 번에
df_test = pd.read_csv('../static/data/classification/cancer_test.csv')
y_test = df_test.target.values
X_test = scaler.fit_transform(df_test.drop(columns='target', axis=1))
X_test.shape, y_test.shape

((143, 30), (143,))

1. Logistic Regression

In [9]:
lr_clf = LogisticRegression()
lr_clf.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [10]:
params = {
    #'C': [0.1, 1, 5]
    'C': [3, 4, 5, 6, 7]
}

In [11]:
grid_cv = GridSearchCV(lr_clf, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9741
최적 파라미터: {'C': 5}


In [12]:
best_lr = grid_cv.best_estimator_
pred = best_lr.predict(X_test)
accuracy_score(y_test, pred)

0.972027972027972

In [14]:
joblib.dump(best_lr, '../static/model/cancer_lr.pkl')

['../static/model/cancer_lr.pkl']

2. SVM

In [15]:
sv_clf = SVC()
sv_clf.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [16]:
params = {
    #'C': [0.1, 1, 5, 7, 10]
    'C': [5, 6, 7, 8, 9]
}
grid_cv = GridSearchCV(sv_clf, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9789
최적 파라미터: {'C': 7}


In [17]:
best_sv = grid_cv.best_estimator_
pred = best_sv.predict(X_test)
accuracy_score(y_test, pred)

0.9790209790209791

In [18]:
joblib.dump(best_sv, '../static/model/cancer_sv.pkl')

['../static/model/cancer_sv.pkl']

3. Random Forest

In [19]:
rf_clf = RandomForestClassifier()
rf_clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [20]:
params = {
    'max_depth': [4, 6, 8, 10],
    'min_samples_split': [2, 3, 4]
}
grid_cv = GridSearchCV(rf_clf, param_grid=params, scoring='accuracy', cv=5)
grid_cv.fit(X_train, y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

최고 평균 정확도: 0.9601
최적 파라미터: {'max_depth': 8, 'min_samples_split': 2}


In [21]:
best_rf = grid_cv.best_estimator_
pred = best_rf.predict(X_test)
accuracy_score(y_test, pred)

0.951048951048951

In [22]:
joblib.dump(best_rf, '../static/model/cancer_rf.pkl')

['../static/model/cancer_rf.pkl']

데이터 인덱스로 잡아서 테스트하기

In [28]:
index = 101
scaled_test = scaler.fit_transform(df_test.iloc[:, :-1])
scaled_test.shape

(143, 30)

In [29]:
# index행의 열들
print(scaled_test[index, :])
# 정규화
test_data = scaled_test[index, :].reshape(1,-1)
test_data

[0.25242572 0.48668846 0.24846558 0.14770598 0.27275401 0.37258725
 0.24586173 0.17761755 0.33436341 0.32958801 0.06039295 0.11666094
 0.11946859 0.03180557 0.10010807 0.61213361 0.19401515 0.25913999
 0.09269442 0.25006564 0.27563939 0.38360842 0.37082936 0.15457878
 0.25998123 0.72753346 0.79795521 0.46744747 0.30394487 0.74611676]


array([[0.25242572, 0.48668846, 0.24846558, 0.14770598, 0.27275401,
        0.37258725, 0.24586173, 0.17761755, 0.33436341, 0.32958801,
        0.06039295, 0.11666094, 0.11946859, 0.03180557, 0.10010807,
        0.61213361, 0.19401515, 0.25913999, 0.09269442, 0.25006564,
        0.27563939, 0.38360842, 0.37082936, 0.15457878, 0.25998123,
        0.72753346, 0.79795521, 0.46744747, 0.30394487, 0.74611676]])

In [30]:
# index행의 label(결과)
label = df_test.iloc[index, -1]
label

1

In [31]:
# index로 행을 하나만 뽑아 왔으니
# 예측 결과는 element가 1개인 리스트
pred_lr = best_lr.predict(test_data)
pred_sv = best_sv.predict(test_data)
pred_rf = best_rf.predict(test_data)

In [32]:
pred_lr

array([0], dtype=int64)

In [33]:
# 원래 target, logistic, svc, random forest
label, pred_lr[0], pred_sv[0], pred_rf[0]

(1, 0, 1, 0)

In [43]:
# index행의 값을 가지고 있는 Series를 Dictionary로 바꾸기
print(type(df_test.iloc[index, :-1]))
print(df_test.iloc[index, :-1].to_dict())

<class 'pandas.core.series.Series'>
{'mean radius': 13.24, 'mean texture': 20.13, 'mean perimeter': 86.87, 'mean area': 542.9, 'mean smoothness': 0.08284, 'mean compactness': 0.1223, 'mean concavity': 0.10099999999999999, 'mean concave points': 0.02833, 'mean symmetry': 0.1601, 'mean fractal dimension': 0.06432, 'radius error': 0.281, 'texture error': 0.8135, 'perimeter error': 3.3689999999999998, 'area error': 23.81, 'smoothness error': 0.004929, 'compactness error': 0.06657, 'concavity error': 0.07683, 'concave points error': 0.013680000000000001, 'symmetry error': 0.01526, 'fractal dimension error': 0.008133, 'worst radius': 15.44, 'worst texture': 25.5, 'worst perimeter': 115.0, 'worst area': 733.5, 'worst smoothness': 0.1201, 'worst compactness': 0.5646, 'worst concavity': 0.6556, 'worst concave points': 0.1357, 'worst symmetry': 0.2845, 'worst fractal dimension': 0.1249}
