### 01. 신용등급 예측
- 신용등급을 예측하는 모델을 만들고, 평가데이터에 적용해 얻은 예측값을 result.csv 파일로 저장하라
- target : Credit_Score (Good, Standard, Poor)
- 제출형식 : pred (1개 컬럼)
- 평가 : f1 macro score

In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the credit-score training and evaluation CSVs (train first, then test)
# from the lovedlim/bigdata_analyst_cert GitHub repository.
train, test = (
    pd.read_csv(url)
    for url in (
        "https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch7/score_train.csv",
        "https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch7/score_test.csv",
    )
)

# Sanity check: dimensions plus the first two rows of each frame.
print(train.shape, test.shape)
display(train.head(2), test.head(2))

(4198, 21) (1499, 20)


Unnamed: 0,Delay_from_due_date,Num_of_Delayed_Payment,Num_Credit_Inquiries,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Amount_invested_monthly,Monthly_Balance,Credit_Mix,Payment_Behaviour,...,Annual_Income,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Monthly_Inhand_Salary,Changed_Credit_Limit,Outstanding_Debt,Total_EMI_per_month,Credit_Score
0,26.0,10.0,5.0,30.358905,200.0,NM,192.3418,321.431503,Good,Low_spent_Small_value_payments,...,36963.89,0.0,4.0,1.0,2.0,2902.324167,6.47,1195.96,46.459114,Standard
1,6.0,17.0,4.0,24.589796,70.0,Yes,53.110239,427.404738,Standard,High_spent_Medium_value_payments,...,34599.8,8.0,4.0,16.0,3.0,3130.316667,16.53,851.37,82.516689,Standard


Unnamed: 0,Delay_from_due_date,Num_of_Delayed_Payment,Num_Credit_Inquiries,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Amount_invested_monthly,Monthly_Balance,Credit_Mix,Payment_Behaviour,Age,Annual_Income,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Monthly_Inhand_Salary,Changed_Credit_Limit,Outstanding_Debt,Total_EMI_per_month
0,19.0,20.0,11.0,28.893968,57.0,Yes,126.895309,217.045508,Bad,Low_spent_Small_value_payments,21.0,16644.79,7.0,6.0,24.0,9.0,1547.065833,9.93,3519.49,100.765767
1,6.0,6.0,7.0,34.443437,366.0,No,217.005753,572.710742,Good,Low_spent_Medium_value_payments,22.0,83358.3,5.0,3.0,5.0,3.0,6896.525,2.66,727.88,169.936005


In [21]:
############################# EDA #####################################################
# No missing values; object columns: Payment_of_Min_Amount, Credit_Mix, Payment_Behaviour
# Target classes are imbalanced; train/test object columns share the same category values
#######################################################################################

# Split the target column off the training frame
target = train.pop('Credit_Score')
# One-hot encode the object columns
train = pd.get_dummies(train)
test = pd.get_dummies(test)
# Encoding each frame separately only yields matching columns while both frames
# contain exactly the same categories. Re-index the evaluation frame onto the
# training columns so a missing/extra category cannot change the feature set
# (missing dummies are filled with 0, unseen ones are dropped).
test = test.reindex(columns=train.columns, fill_value=0)
# Hold out a stratified 20% validation set
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(train, target, test_size=0.2, stratify=target, random_state=42)
# Scaling: fit on the training split only, then apply the same transform everywhere
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(test)

# Modelling: class-weighted random forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=120, n_estimators=300, class_weight='balanced')
rf.fit(X_train, y_train)

# Evaluation on the validation split (macro F1 is the task metric)
from sklearn.metrics import f1_score, classification_report
rf_pred = rf.predict(X_valid)
score = f1_score(y_valid, rf_pred, average='macro')
report = classification_report(y_valid, rf_pred)

print('f1 score(macro) = ', score)
print('Report = ', report)

f1 score(macro) =  0.7033306547551269
Report =                precision    recall  f1-score   support

        Good       0.60      0.64      0.62       148
        Poor       0.73      0.73      0.73       247
    Standard       0.76      0.75      0.76       445

    accuracy                           0.72       840
   macro avg       0.70      0.71      0.70       840
weighted avg       0.73      0.72      0.73       840



In [22]:
# Submission: predict the scaled evaluation features and write them out as a
# single 'pred' column without the index.
pred = rf.predict(X_test)
result = pd.DataFrame(pred, columns=['pred'])
result.to_csv('result.csv', index=False)

---
#### 02. 다중분류 : 약물의 종류를 예측하라
- 타겟 : Drug (DrugY, drugX, drugA, drugC, drugB)
- 평가 : f1-macro
- 제출 : result.csv ("pred")

In [44]:
# Read the drug training and evaluation CSVs (train first, then test)
# from the lovedlim/bigdata_analyst_cert GitHub repository.
train, test = (
    pd.read_csv(url)
    for url in (
        "https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch7/drug_train.csv",
        "https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch7/drug_test.csv",
    )
)

# Sanity check: dimensions plus the first two rows of each frame.
print(train.shape, test.shape)
display(train.head(2), test.head(2))

(100, 6) (100, 5)


Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,70,M,HIGH,HIGH,9.849,drugB
1,36,M,LOW,NORMAL,11.424,drugX


Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K
0,74,F,LOW,HIGH,20.942
1,65,M,HIGH,NORMAL,34.997


In [45]:
########## EDA ###################################
# No missing values; object columns = Sex, BP, Cholesterol
# The target (Drug) is heavily imbalanced
# Object-column categories are the same in train and test
# This solution uses cross_val_score, so train_test_split and scaling are not needed.
##################################################

# Split the target column off the training frame
target = train.pop('Drug')
# One-hot encode the object columns
train = pd.get_dummies(train)
test = pd.get_dummies(test)
# Guarantee the evaluation frame has exactly the training columns, in the same
# order, even if a category is absent from one of the two files.
test = test.reindex(columns=train.columns, fill_value=0)

# cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

rf = RandomForestClassifier(random_state=120)
# Keep the fold scores in their own name: binding them to `f1_score` would
# overwrite the metric function imported above.
cv_scores = cross_val_score(rf, train, target, cv=3, scoring='f1_macro')
# Fit on all training rows so rf can be used for the submission prediction.
rf.fit(train, target)

# eval
print('f1 score(macro) = ', cv_scores)
print('f1 score(macro) mean = ', cv_scores.mean())

f1 score(macro) =  [1.         0.93777778 0.8567619 ]
f1 score(macro) mean =  0.9315132275132275


In [47]:
# Submission: predict the encoded evaluation frame and write the labels out as
# a single 'pred' column without the index.
pred = rf.predict(test)
result = pd.Series(pred, name='pred').to_frame()
result.to_csv('result.csv', index=False)

---
#### 03. 유리종류 예측
- target : Type (1,2,3,5,6,7)
- 평가 : f1 weighted
- 제출 : result.csv (컬럼 pred)

In [98]:
# Read the glass training and evaluation CSVs (train first, then test)
# from the lovedlim/bigdata_analyst_cert GitHub repository.
train, test = (
    pd.read_csv(url)
    for url in (
        "https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch7/glass_train.csv",
        "https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch7/glass_test.csv",
    )
)

# Sanity check: dimensions plus the first two rows of each frame.
print(train.shape, test.shape)
display(train.head(2), test.head(2))

(149, 10) (65, 9)


Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.51829,14.46,2.24,1.62,72.38,0.0,9.26,0.0,0.0,6
1,1.5161,13.33,3.53,1.34,72.67,0.56,8.33,0.0,0.0,3


Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe
0,1.51748,12.86,3.56,1.27,73.21,0.54,8.38,0.0,0.17
1,1.52058,12.85,1.61,2.17,72.18,0.76,9.7,0.24,0.51


In [99]:
############# EDA ##################################
# No missing values; every feature is numeric
# Target distribution is imbalanced; the features contain many outliers
####################################################

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# Separate the label column, then hold out 20% of the rows for validation.
target = train.pop('Type')
X_train, X_valid, y_train, y_valid = train_test_split(
    train,
    target,
    random_state=42,
    test_size=0.2,
)

# Scaling is skipped because the model is tree-based.

# Class-weighted random forest with a fixed seed.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    class_weight='balanced',
    random_state=120,
)
rf.fit(X_train, y_train)

# Weighted F1 (the task metric) on the held-out rows.
rf_pred = rf.predict(X_valid)
score = f1_score(y_valid, rf_pred, average='weighted')
print('f1 score = ', score)

f1 score =  0.6612698412698413


In [93]:
# Hyperparameter optimisation
from sklearn.model_selection import RandomizedSearchCV

# Candidate values the randomized search samples from.
param_dist = dict(
    n_estimators=[200, 400, 600, 800],
    max_depth=[5, 10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 5],
    max_features=['sqrt', 'log2', None],
    class_weight=['balanced'],
)

# Fresh, unfitted base estimator for the search.
rf = RandomForestClassifier(random_state=120)

# 20 sampled candidates, 3-fold CV each, ranked by weighted F1.
random_search = RandomizedSearchCV(
    rf,
    param_dist,
    random_state=42,
    verbose=2,
    n_jobs=-1,
    cv=3,
    scoring='f1_weighted',
    n_iter=20,
)
random_search.fit(X_train, y_train)

print("Best Params:", random_search.best_params_)
print("Best CV f1_score:", random_search.best_score_)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Params: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 10, 'class_weight': 'balanced'}
Best CV f1_score: 0.7800281216229492
