In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [3]:
data = pd.read_csv('../datasets/input.csv')
target = pd.read_csv('../datasets/target.csv')

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1264 entries, 0 to 1263
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   국가별       1264 non-null   object 
 1   시점        1264 non-null   int64  
 2   인당_GDP    1264 non-null   float64
 3   부채비율      1264 non-null   float64
 4   실업률       1264 non-null   float64
 5   상품및서비스수입  1264 non-null   float64
 6   상품및서비스수출  1264 non-null   float64
 7   정치적안정성    1264 non-null   int64  
 8   규제의질      1264 non-null   int64  
 9   경상수지      1261 non-null   float64
dtypes: float64(6), int64(3), object(1)
memory usage: 98.9+ KB


In [5]:
target.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1264 entries, 0 to 1263
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   국가별     1264 non-null   object 
 1   시점      1264 non-null   int64  
 2   신용등급점수  1264 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 29.8+ KB


In [6]:
datasets = pd.merge(data, target, on=('국가별', '시점'), how='outer')
datasets.dropna(inplace=True)
datasets.reset_index(inplace=True)
datasets.drop(['국가별', 'index'], axis=1, inplace=True)
datasets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1261 entries, 0 to 1260
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   시점        1261 non-null   int64  
 1   인당_GDP    1261 non-null   float64
 2   부채비율      1261 non-null   float64
 3   실업률       1261 non-null   float64
 4   상품및서비스수입  1261 non-null   float64
 5   상품및서비스수출  1261 non-null   float64
 6   정치적안정성    1261 non-null   int64  
 7   규제의질      1261 non-null   int64  
 8   경상수지      1261 non-null   float64
 9   신용등급점수    1261 non-null   float64
dtypes: float64(7), int64(3)
memory usage: 98.6 KB


In [7]:
lb = LabelEncoder()
datasets['시점'] = lb.fit_transform(datasets['시점'])
datasets.describe()

Unnamed: 0,시점,인당_GDP,부채비율,실업률,상품및서비스수입,상품및서비스수출,정치적안정성,규제의질,경상수지,신용등급점수
count,1261.0,1261.0,1261.0,1261.0,1261.0,1261.0,1261.0,1261.0,1261.0,1261.0
mean,7.505155,22012.77933,55.695265,7.11314,4.170044,3.660556,54.612213,67.01586,-1.326674,12.526324
std,4.603761,22939.181148,37.78344,4.499929,10.071854,9.109875,27.897707,23.512842,7.070186,5.28881
min,0.0,333.731576,-3.4,0.21,-50.275895,-56.736571,0.0,6.0,-41.526871,0.0
25%,4.0,4379.658787,30.3,4.12,0.200224,0.259701,29.0,48.0,-4.754963,8.0
50%,8.0,12808.03834,47.1,6.12,4.560764,4.075288,58.0,70.0,-1.742977,12.5
75%,11.0,36323.44774,70.78,8.48,9.093538,7.668103,79.0,88.0,2.367691,17.0
max,15.0,123514.1967,266.2,31.11,66.893243,86.043298,100.0,100.0,27.39765,20.0


In [8]:
#datasets.iloc[:,9] = ((datasets.iloc[:,9].apply(lambda x:math.trun c(x))).astype(int)).astype(str)
datasets.iloc[:,9] = (round(datasets.iloc[:,9])).astype(int).astype(str)

In [9]:
datasets.groupby('신용등급점수').count()

Unnamed: 0_level_0,시점,인당_GDP,부채비율,실업률,상품및서비스수입,상품및서비스수출,정치적안정성,규제의질,경상수지
신용등급점수,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,4,4,4,4,4,4,4,4,4
10,78,78,78,78,78,78,78,78,78
11,77,77,77,77,77,77,77,77,77
12,96,96,96,96,96,96,96,96,96
13,52,52,52,52,52,52,52,52,52
14,85,85,85,85,85,85,85,85,85
15,65,65,65,65,65,65,65,65,65
16,77,77,77,77,77,77,77,77,77
17,37,37,37,37,37,37,37,37,37
18,51,51,51,51,51,51,51,51,51


In [10]:
x_train, x_test, y_train, y_test = train_test_split(datasets.iloc[:,:9], datasets.iloc[:,9], test_size=0.2, random_state=42)

In [12]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((1008, 9), (253, 9), (1008,), (253,))

In [13]:
len(np.unique(y_train)), len(np.unique(y_test))

(20, 20)

In [8]:
# from imblearn.over_sampling import SMOTE

# smote = SMOTE(random_state=42)
# X_train_over, y_train_over = smote.fit_resample(x_train, y_train)
# print("SMOTE 적용 전 학습용 피처/레이블 데이터 세트 : ", x_train.shape, y_train.shape)
# print('SMOTE 적용 후 학습용 피처/레이블 데이터 세트 :', X_train_over.shape, y_train_over.shape)
# print('SMOTE 적용 후 값의 분포 :\n',pd.Series(y_train_over).value_counts() )

In [14]:
ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.fit_transform(x_test)

In [15]:
lr = LogisticRegression()
lr.fit(x_train, y_train)
pred = lr.predict(x_test)
acc = accuracy_score(pred, y_test)
# prec = precision_score(pred, y_test)
# reca = recall_score(pred, y_test)
# f1 = f1_score(pred, y_test)
#auc = roc_auc_score(pred, y_test)

print('정확도 : {:.2f}'.format(acc*100))
# print('정밀도 : {:.2f}'.format(prec*100))
# print('재현율 : {:.2f}'.format(reca*100))
# print('f1 score : {:.2f}'.format(f1*100))
#print('AUC : {:.2f}'.format(auc*100))

정확도 : 35.97


In [16]:
params = { "penalty" : ["l2", "l1"], "C" : [0.01, 0.1, 1, 5, 10] } 
lr = LogisticRegression() 
gs = GridSearchCV(lr, param_grid=params, cv=5, n_jobs=-1, scoring="accuracy") 
gs.fit(x_train, y_train) 
print("best param : {}".format(gs.best_params_)) 
print("best acc : {}".format(gs.best_score_))



best param : {'C': 5, 'penalty': 'l2'}
best acc : 0.36905078567558247


25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

 0.36905079        nan 0.36805576        nan]
STOP: TOTAL NO. of

In [17]:
pred = gs.predict(x_test)
accuracy_score(pred, y_test)

0.35968379446640314