In [1]:
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import lightgbm as lgb # lightgbm 부스팅 알고리즘 사용
from lightgbm import LGBMClassifier

from sklearn import metrics
from sklearn.metrics import roc_auc_score

In [2]:
redwine = pd.read_csv("wine2.csv")

redwine['quality'] = [1 if i == 'high rank' else 0 for i in redwine['quality'] ]
redwine.head()

Unnamed: 0,fixacid,volacid,citacid,rsugar,salt,freedioxid,totaldioxid,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0


In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(redwine[ redwine.columns[:-1] ], redwine['quality'], 
                                                    test_size = 0.3, random_state = 2)

# 단순 모델링 비교

LogisticRegression

In [4]:
clf = LogisticRegression(random_state=0).fit(X_train, y_train)

y_pred = clf.predict(X_test)
y_pred2 = y_pred.round(0)
y_pred2 = y_pred2.astype(int)
print('정확도 :', metrics.accuracy_score(y_test, y_pred2))
print('AUC    :', roc_auc_score(y_pred2, y_test))

정확도 : 0.7649572649572649
AUC    : 0.7646557474245906


RandomForest 일반 and 개선

In [5]:
model = RandomForestClassifier(random_state = 0)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_pred2 = y_pred.round(0)
y_pred2 = y_pred2.astype(int)
print('정확도 :', model.score(X_test, y_test) )
print('AUC    :', roc_auc_score(y_pred2, y_test))

정확도 : 0.7970085470085471
AUC    : 0.7970181575878728


In [6]:
model = RandomForestClassifier(criterion = 'entropy', n_estimators = 100, random_state = 0)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_pred2 = y_pred.round(0)
y_pred2 = y_pred2.astype(int)
print('정확도 :', model.score(X_test, y_test) )
print('AUC    :', roc_auc_score(y_pred2, y_test))

정확도 : 0.811965811965812
AUC    : 0.8115050117924528


lgbm 일반 and 개선

In [7]:
lgbm = LGBMClassifier(random_state = 0)

lgbm_clf = lgbm.fit(X_train,y_train) #train the model on 100 epocs

y_pred = lgbm_clf.predict(X_test)
y_pred2 = y_pred.round(0)
y_pred2 = y_pred2.astype(int)

print('정확도 :', metrics.accuracy_score(y_test, y_pred2))
print('AUC    :', roc_auc_score(y_pred2, y_test))

정확도 : 0.7948717948717948
AUC    : 0.7957269411151642


In [8]:
d_train=lgb.Dataset(X_train, label=y_train)

lgbm_param = {'objective':'binary',
              "metric" : "auc",
              'boosting_type': 'gbdt',
              'random_state':42,
              'learning_rate':0.075,
            }

clf = lgb.train(lgbm_param,d_train,100) #train the model on 100 epocs

y_pred = clf.predict(X_test)
y_pred2 = y_pred.round(0)
y_pred2 = y_pred2.astype(int)

print('정확도 :', metrics.accuracy_score(y_test, y_pred2))
print('AUC    :', roc_auc_score(y_pred2, y_test))

정확도 : 0.7970085470085471
AUC    : 0.7981161607691877


현재 데이터를 이를 고도화하고 일반적인 결과로 다른 사람들을 납득 시키기 위해서는, 

cross validation 과정, 더 많은 데이터 수집,

새롭게 만들 수 있는 파생변수, 스태킹 등의 고도화된 모델링 구축 및 하이퍼 파라미터 조정이 필요한 것은 당연하다.

하지만 이번 코드에서는 데이터를 간단히 train, test로 나누는 과정과

그 이후에, 정말로 아무런 과정 없이 모델링만 한 상황을 간단히 코드로 표현하였다.

데이터 수도 적고, 고도화를 하려는 목적이 아니므로 어느 정도의 정확도를 가지고 있다만 보는 것이 좋아보인다.