# 피마/아이리스/와인 모델 만들기

In [31]:
import pandas as pd 
import joblib
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [112]:
# Training set: read the wine training split from CSV.
df_train = pd.read_csv('../static/data/classification/wine_train.csv')
# Last expression of the cell, so the first rows are rendered as its output.
df_train.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,13.4,3.91,2.48,23.0,102.0,1.8,0.75,0.43,1.41,7.3,0.7,1.56,750.0,2
1,14.38,1.87,2.38,12.0,102.0,3.3,3.64,0.29,2.96,7.5,1.2,3.0,1547.0,0
2,13.24,3.98,2.29,17.5,103.0,2.64,2.63,0.32,1.66,4.36,0.82,3.0,680.0,0
3,13.05,5.8,2.13,21.5,86.0,2.62,2.65,0.3,2.01,2.6,0.73,3.1,380.0,1
4,12.22,1.29,1.94,19.0,92.0,2.36,2.04,0.39,2.08,2.7,0.86,3.02,312.0,1


In [113]:
# Split the training frame into NumPy arrays: every column except the last
# is a feature, the last column is the label.
X_train, y_train = (
    df_train.iloc[:, :-1].values,
    df_train.iloc[:, -1].values,
)
# Shown as the cell output to confirm the array shapes.
(X_train.shape, y_train.shape)

((133, 13), (133,))

In [114]:
# Test set: read the wine test split from CSV.
df_test = pd.read_csv('../static/data/classification/wine_test.csv')
# Last expression of the cell, so the first rows are rendered as its output.
df_test.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,12.25,4.72,2.54,21.0,89.0,1.38,0.47,0.53,0.8,3.85,0.75,1.27,720.0,2
1,12.36,3.83,2.38,21.0,88.0,2.3,0.92,0.5,1.04,7.65,0.56,1.58,520.0,2
2,12.51,1.24,2.25,17.5,85.0,2.0,0.58,0.6,1.25,5.45,0.75,1.51,650.0,2
3,13.29,1.97,2.68,16.8,102.0,3.0,3.23,0.31,1.66,6.0,1.07,2.84,1270.0,0
4,13.4,4.6,2.86,25.0,112.0,1.98,0.96,0.27,1.11,8.5,0.67,1.92,630.0,2


In [115]:
# Split the test frame the same way as the training frame: all columns but
# the last are features, the last column is the label.
X_test, y_test = (
    df_test.iloc[:, :-1].values,
    df_test.iloc[:, -1].values,
)
# Shown as the cell output to confirm the array shapes.
(X_test.shape, y_test.shape)

((45, 13), (45,))

In [116]:
# Standardize the features using statistics learned from the training set only.
scaler = StandardScaler()
# fit() expects a 2-D input. Fit on the same NumPy array that is later passed
# to transform(): fitting on the DataFrame slice instead makes newer
# scikit-learn versions record the column names and emit a
# "X does not have valid feature names" warning on every array-based
# transform() call. X_train holds exactly the values of df_train.iloc[:, :-1].
scaler.fit(X_train)

StandardScaler()

In [117]:
# Apply the already-fitted scaler to both splits; the scaler is not re-fit
# on the test data.
X_train_scaled, X_test_scaled = [
    scaler.transform(split) for split in (X_train, X_test)
]
# Shown as the cell output to confirm scaling kept the shapes.
(X_train_scaled.shape, X_test_scaled.shape)

((133, 13), (45, 13))

In [118]:
# Persist the fitted scaler to disk; it is reloaded with joblib.load further
# down in this notebook before scaling a single sample.
joblib.dump(scaler, '../static/model/wine_scaler.pkl')

['../static/model/wine_scaler.pkl']

In [119]:
# 1. Logistic regression
lr_clf = LogisticRegression()
# Display the default hyperparameters of the unfitted estimator.
lr_clf.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [135]:
# Candidate values for the C hyperparameter. An earlier, coarser grid
# (C in [0.1, 1, 5, 10]) was replaced by this finer one.
params = dict(C=[0.1, 0.2, 0.3, 0.4, 0.5])

# 5-fold cross-validated grid search on the scaled training data, scored by
# accuracy. fit() returns the search object itself, so it can be chained.
grid_cv = GridSearchCV(
    estimator=lr_clf,
    param_grid=params,
    scoring='accuracy',
    cv=5,
).fit(X_train_scaled, y_train)

# Report the best mean cross-validation accuracy and the winning parameters.
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

# Keep the best estimator and score it on the scaled test split; the
# accuracy is the cell's displayed value.
best_lr = grid_cv.best_estimator_
pred = best_lr.predict(X_test_scaled)
accuracy_score(y_true=y_test, y_pred=pred)

최고 평균 정확도: 0.9701
최적 파라미터: {'C': 0.1}


1.0

In [121]:
# Persist the best logistic-regression model found by the grid search.
joblib.dump(best_lr, '../static/model/wine_lr.pkl')

['../static/model/wine_lr.pkl']

In [122]:
# 2. Support vector classifier (SVC)
sv_clf = SVC()
# Display the default hyperparameters of the unfitted estimator.
sv_clf.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [123]:
# Candidate values for the C hyperparameter. Grids tried earlier and then
# narrowed down: [0.1, 1, 5, 7, 10], [0.5, 0.8, 1, 2, 3], [2.5, 3, 3.5, 4].
params = dict(C=[0.4, 0.5, 0.6, 0.7, 0.8])

# 5-fold cross-validated grid search on the scaled training data, scored by
# accuracy. fit() returns the search object itself, so it can be chained.
grid_cv = GridSearchCV(
    estimator=sv_clf,
    param_grid=params,
    scoring='accuracy',
    cv=5,
).fit(X_train_scaled, y_train)

# Report the best mean cross-validation accuracy and the winning parameters.
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

# Keep the best estimator and score it on the scaled test split; the
# accuracy is the cell's displayed value.
best_sv = grid_cv.best_estimator_
pred = best_sv.predict(X_test_scaled)
accuracy_score(y_true=y_test, y_pred=pred)

최고 평균 정확도: 0.9852
최적 파라미터: {'C': 0.5}


1.0

In [124]:
# Persist the best SVC model found by the grid search.
joblib.dump(best_sv, '../static/model/wine_sv.pkl')

['../static/model/wine_sv.pkl']

In [125]:
# 3. Random Forest
# NOTE(review): no random_state is passed, so the forest (and therefore the
# grid-search result and the saved model) can differ between runs.
rf_clf = RandomForestClassifier()
# Display the default hyperparameters of the unfitted estimator.
rf_clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [126]:
# Hyperparameter grid for the forest. Earlier grids that were tried and then
# refined: max_depth in [4, 6, 8, 10], min_samples_split in [2, 3, 4].
params = dict(
    max_depth=[7, 8, 9],
    min_samples_split=[3, 4, 5, 6],
)

# 5-fold cross-validated grid search on the scaled training data, scored by
# accuracy. fit() returns the search object itself, so it can be chained.
grid_cv = GridSearchCV(
    estimator=rf_clf,
    param_grid=params,
    scoring='accuracy',
    cv=5,
).fit(X_train_scaled, y_train)

# Report the best mean cross-validation accuracy and the winning parameters.
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터:', grid_cv.best_params_)

# Keep the best estimator and score it on the scaled test split; the
# accuracy is the cell's displayed value.
best_rf = grid_cv.best_estimator_
pred = best_rf.predict(X_test_scaled)
accuracy_score(y_true=y_test, y_pred=pred)

최고 평균 정확도: 0.9775
최적 파라미터: {'max_depth': 7, 'min_samples_split': 6}


1.0

In [127]:
# Persist the best random-forest model found by the grid search.
joblib.dump(best_rf, '../static/model/wine_rf.pkl')

['../static/model/wine_rf.pkl']

In [128]:
# Pick one test row to run through the saved pipeline by hand.
index = 20
# Feature values of that row (all columns but the last), reshaped to a
# single-sample 2-D array because the scaler and models expect 2-D input.
test_data = df_test.iloc[index, :-1].values.reshape(1, -1)
# Print the flat 1-D feature vector of the chosen row.
print(test_data[0])
# True class of the chosen row, taken from the last column.
label = df_test.iloc[index, -1]
# Shown as the cell output.
(test_data, label)

[1.305e+01 1.730e+00 2.040e+00 1.240e+01 9.200e+01 2.720e+00 3.270e+00
 1.700e-01 2.910e+00 7.200e+00 1.120e+00 2.910e+00 1.150e+03]


(array([[1.305e+01, 1.730e+00, 2.040e+00, 1.240e+01, 9.200e+01, 2.720e+00,
         3.270e+00, 1.700e-01, 2.910e+00, 7.200e+00, 1.120e+00, 2.910e+00,
         1.150e+03]]),
 0)

In [129]:
# Reload the scaler from the pickle written earlier, replacing the in-memory
# object, so the single-sample prediction below uses the persisted artifact.
scaler = joblib.load('../static/model/wine_scaler.pkl')

In [130]:
# Scale the single sample with the reloaded scaler before predicting.
test_scaled = scaler.transform(test_data)

In [131]:
# Predict the class of the scaled sample with each of the three tuned models
# (logistic regression, SVC, random forest), in that order.
pred_lr, pred_sv, pred_rf = [
    model.predict(test_scaled) for model in (best_lr, best_sv, best_rf)
]

In [132]:
# Compare the true label with each model's prediction for the sample; each
# predict() result is a one-element array, hence the [0].
label, pred_lr[0], pred_sv[0], pred_rf[0]

(0, 0, 0, 0)

In [133]:
# Feature values of the chosen test row as a {column name: value} dict
# (label column excluded).
tmp = df_test.iloc[index, :-1].to_dict()
# Shown as the cell output.
tmp

{'alcohol': 13.05,
 'malic_acid': 1.73,
 'ash': 2.04,
 'alcalinity_of_ash': 12.4,
 'magnesium': 92.0,
 'total_phenols': 2.72,
 'flavanoids': 3.27,
 'nonflavanoid_phenols': 0.17,
 'proanthocyanins': 2.91,
 'color_intensity': 7.2,
 'hue': 1.12,
 'od280/od315_of_diluted_wines': 2.91,
 'proline': 1150.0}