In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from collections import Counter

from xgboost import XGBClassifier, plot_importance
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from imblearn.over_sampling import ADASYN
from imblearn.pipeline import Pipeline

from matplotlib import font_manager, rc
font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
rc('font', family=font_name)

% matplotlib inline

# Base Model

In [69]:
# Male data: read the pickled object that holds the 'train_X' / 'train_y' entries.
man = pd.read_pickle('man_data.pkl')

# Baseline: min-max scaling followed by a default logistic regression,
# evaluated with 5-fold cross-validated accuracy.
scaler, clf = MinMaxScaler(), LogisticRegression()
model = Pipeline(steps=[('scaler', scaler), ('clf', clf)])

val = cross_val_score(
    model,
    X=man['train_X'],
    y=man['train_y'],
    scoring='accuracy',
    cv=5,
)
# Report the mean accuracy over the folds.
print('Accuracy: %.5f' % val.mean())

Accuracy: 0.69391


# Model fitting

In [73]:
# kNN

if __name__ == '__main__':
    # Neighbour counts to evaluate; each gets its own scaler + kNN pipeline
    # and is scored by mean 5-fold CV accuracy on the male training data.
    neighbor_counts = [28]

    for n_neighbors in neighbor_counts:
        scaler, clf = MinMaxScaler(), KNeighborsClassifier(n_neighbors=n_neighbors)
        model = Pipeline(steps=[('scaler', scaler), ('clf', clf)])
        val = cross_val_score(
            model,
            X=man['train_X'],
            y=man['train_y'],
            scoring='accuracy',
            cv=5,
            n_jobs=-1,  # run the folds on all available cores
        )
        setting = '[k: %s]' % (n_neighbors)
        print(setting, 'accuracy: %.5f' % val.mean())

[k: 28] accuracy: 0.68726


In [74]:
# NB

if __name__ == '__main__':
    # Multinomial naive Bayes on min-max scaled features,
    # scored by mean 5-fold CV accuracy on the male training data.
    scaler, clf = MinMaxScaler(), MultinomialNB()
    model = Pipeline(steps=[('scaler', scaler), ('clf', clf)])
    val = cross_val_score(
        model,
        X=man['train_X'],
        y=man['train_y'],
        scoring='accuracy',
        cv=5,
        n_jobs=-1,  # run the folds on all available cores
    )
    print('accuracy: %.5f' % val.mean())

accuracy: 0.70749


In [75]:
# SVC linear

if __name__ == '__main__':
    # Grid of (C, tol) settings, enumerated with C as the outer dimension
    # and tol as the inner one.
    penalties = [1]
    tolerances = [0.1]
    grid = [(c, t) for c in penalties for t in tolerances]

    for c, t in grid:
        scaler = MinMaxScaler()
        clf = SVC(kernel='linear', C=c, tol=t)
        model = Pipeline(steps=[('scaler', scaler), ('clf', clf)])
        val = cross_val_score(
            model,
            X=man['train_X'],
            y=man['train_y'],
            scoring='accuracy',
            cv=5,
            n_jobs=-1,  # run the folds on all available cores
        )
        setting = '[C: %s tol: %s]' % (c, t)
        print(setting, 'accuracy: %.5f' % val.mean())

[C: 1 tol: 0.1] accuracy: 0.69370


In [76]:
# SVC rbf

if __name__ == '__main__':
    # Grid of (C, gamma) settings, enumerated with C as the outer dimension
    # and gamma as the inner one.
    penalties = [1]
    gammas = ['auto']
    grid = [(c, g) for c in penalties for g in gammas]

    for c, g in grid:
        scaler = MinMaxScaler()
        clf = SVC(kernel='rbf', C=c, gamma=g)
        model = Pipeline(steps=[('scaler', scaler), ('clf', clf)])
        val = cross_val_score(
            model,
            X=man['train_X'],
            y=man['train_y'],
            scoring='accuracy',
            cv=5,
            n_jobs=-1,  # run the folds on all available cores
        )
        setting = '[C: %s gamma: %s]' % (c, g)
        print(setting, 'accuracy: %.5f' % val.mean())

[C: 1 gamma: auto] accuracy: 0.71439


In [79]:
# RF

if __name__ == '__main__':
    # Sweep the tree depth 1..8 at a fixed forest size; the forest is seeded
    # (random_state = 10) so each setting's score is repeatable.
    tree_counts = [1000]
    depths = range(1, 9)
    grid = [(n, d) for n in tree_counts for d in depths]

    for n, d in grid:
        scaler = MinMaxScaler()
        clf = RandomForestClassifier(n_estimators=n, max_depth=d, random_state=10)
        model = Pipeline(steps=[('scaler', scaler), ('clf', clf)])
        val = cross_val_score(
            model,
            X=man['train_X'],
            y=man['train_y'],
            scoring='accuracy',
            cv=5,
            n_jobs=-1,  # run the folds on all available cores
        )
        setting = '[n_est: %s max_depth: %s]' % (n, d)
        print(setting, 'accuracy: %.5f' % val.mean())

[n_est: 1000 max_depth: 1] accuracy: 0.68059
[n_est: 1000 max_depth: 2] accuracy: 0.68680
[n_est: 1000 max_depth: 3] accuracy: 0.70683
[n_est: 1000 max_depth: 4] accuracy: 0.68681
[n_est: 1000 max_depth: 5] accuracy: 0.68036
[n_est: 1000 max_depth: 6] accuracy: 0.67370
[n_est: 1000 max_depth: 7] accuracy: 0.67348
[n_est: 1000 max_depth: 8] accuracy: 0.65992
