In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
data = pd.read_csv('./datasets/data.csv', encoding='CP949')
target = pd.read_csv('./datasets/OECD신용등급.csv', encoding='CP949')
target.drop('신용등급', axis=1, inplace=True)

In [3]:
datasets = pd.merge(data, target, on=('국가별', '시점'), how='left')
datasets.columns = ['국가별', '시점', '인당_국민총소득', 'GDP_성장률', '디플레이터', '수출', '수입',
       '무역의존도_수출', '무역의존도_수입', '외환보유액', '부채비율', '국민부담률', '평균근로자세금', '경제활동참가율',
       '고용률', '실업률', '신용등급_1']
datasets['수입'] = datasets['수입'].astype(float)
datasets['무역의존도_수입'] = datasets['무역의존도_수입'].astype(float)
datasets.dropna(inplace=True)
datasets.reset_index(inplace=True)
datasets.drop(['국가별', 'index'], axis=1, inplace=True)
datasets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114 entries, 0 to 113
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   시점        114 non-null    int64  
 1   인당_국민총소득  114 non-null    float64
 2   GDP_성장률   114 non-null    float64
 3   디플레이터     114 non-null    float64
 4   수출        114 non-null    float64
 5   수입        114 non-null    float64
 6   무역의존도_수출  114 non-null    float64
 7   무역의존도_수입  114 non-null    float64
 8   외환보유액     114 non-null    int64  
 9   부채비율      114 non-null    float64
 10  국민부담률     114 non-null    float64
 11  평균근로자세금   114 non-null    float64
 12  경제활동참가율   114 non-null    float64
 13  고용률       114 non-null    float64
 14  실업률       114 non-null    float64
 15  신용등급_1    114 non-null    object 
dtypes: float64(13), int64(2), object(1)
memory usage: 14.4+ KB


In [4]:
lb = LabelEncoder()
datasets['시점'] = lb.fit_transform(datasets['시점'])
datasets.describe()

Unnamed: 0,시점,인당_국민총소득,GDP_성장률,디플레이터,수출,수입,무역의존도_수출,무역의존도_수입,외환보유액,부채비율,국민부담률,평균근로자세금,경제활동참가율,고용률,실업률
count,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0
mean,1.710526,32750.570175,3.348246,95.848246,223842.7,224774.5,36.308246,36.257544,82965.03,72.816588,34.342982,38.482456,60.285965,66.111404,8.215789
std,1.053638,19334.244737,3.164914,13.415816,282335.3,329305.7,19.655216,19.602815,183684.7,41.206568,7.007278,8.778587,5.906092,7.026458,4.194235
min,0.0,3911.0,-5.5,58.2,3092.0,3919.0,8.24,2.24,279.0,6.788,11.4,14.7,48.1,46.3,2.6
25%,1.0,16402.25,1.825,88.625,49648.0,45036.25,20.455,22.7875,9670.25,45.5905,30.5,32.675,57.325,61.025,5.1
50%,2.0,31591.5,2.95,100.0,120463.0,89101.5,31.785,29.41,32170.0,62.2375,33.1,38.8,59.75,66.2,7.35
75%,3.0,43878.5,4.2,103.9,325304.5,314787.2,51.0525,50.7125,73309.25,97.9775,39.35,43.925,63.175,71.725,9.825
max,3.0,88706.0,25.2,143.1,1503400.0,2248800.0,86.7,84.74,1233153.0,234.073,48.0,57.1,81.4,83.8,24.9


In [59]:
x_train, x_test, y_train, y_test = train_test_split(datasets.iloc[:,:15], datasets.iloc[:,15], test_size=0.2, random_state=42)

In [None]:
# from imblearn.over_sampling import SMOTE

# smote = SMOTE(random_state=42)
# X_train_over, y_train_over = smote.fit_resample(x_train, y_train)
# print("SMOTE 적용 전 학습용 피처/레이블 데이터 세트 : ", x_train.shape, y_train.shape)
# print('SMOTE 적용 후 학습용 피처/레이블 데이터 세트 :', X_train_over.shape, y_train_over.shape)
# print('SMOTE 적용 후 값의 분포 :\n',pd.Series(y_train_over).value_counts() )

In [60]:
ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.fit_transform(x_test)



lb = LabelEncoder()
#ohe = OneHotEncoder()
y_train = lb.fit_transform(y_train.values.reshape(-1, 1))
y_test = lb.fit_transform(y_test.values.reshape(-1, 1))


  y = column_or_1d(y, warn=True)


In [65]:
lr = LogisticRegression()
lr.fit(x_train, y_train)
pred = lr.predict(x_test)
acc = accuracy_score(pred, y_test, average='binary')
prec = precision_score(pred, y_test)
reca = recall_score(pred, y_test)
f1 = f1_score(pred, y_test)
#auc = roc_auc_score(pred, y_test)

print('정확도 : {:.2f}'.format(acc*100))
print('정밀도 : {:.2f}'.format(prec*100))
print('재현율 : {:.2f}'.format(reca*100))
print('f1 score : {:.2f}'.format(f1*100))
#print('AUC : {:.2f}'.format(auc*100))

ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].