# 1. 신용 등급 예측

### 1. 베이스라인

In [1]:
import pandas as pd

# Load the credit-score training and test tables from the working directory.
train, test = (pd.read_csv(path) for path in ('score_train.csv', 'score_test.csv'))
# Last expression of the cell: shows the (rows, columns) shape of both frames.
train.shape, test.shape

((4198, 21), (1499, 20))

In [5]:
# Print a per-column summary (non-null count and dtype) of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4198 entries, 0 to 4197
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Delay_from_due_date       4198 non-null   float64
 1   Num_of_Delayed_Payment    4198 non-null   float64
 2   Num_Credit_Inquiries      4198 non-null   float64
 3   Credit_Utilization_Ratio  4198 non-null   float64
 4   Credit_History_Age        4198 non-null   float64
 5   Payment_of_Min_Amount     4198 non-null   object 
 6   Amount_invested_monthly   4198 non-null   float64
 7   Monthly_Balance           4198 non-null   float64
 8   Credit_Mix                4198 non-null   object 
 9   Payment_Behaviour         4198 non-null   object 
 10  Age                       4198 non-null   float64
 11  Annual_Income             4198 non-null   float64
 12  Num_Bank_Accounts         4198 non-null   float64
 13  Num_Credit_Card           4198 non-null   float64
 14  Interest

In [9]:
# Total number of missing cells in each frame (per-column counts summed again).
train.isnull().sum().sum(), test.isnull().sum().sum()

(0, 0)

In [10]:
# Data preprocessing and one-hot encoding.
# Pop the label so that `train` holds feature columns only.
target = train.pop('Credit_Score')

train = pd.get_dummies(train)
test = pd.get_dummies(test)
# The two frames are encoded independently, so a category level present in
# only one file would produce different dummy columns. Re-index the test frame
# onto the training columns (missing dummies are filled with 0, levels unseen
# in training are dropped) so predict() receives exactly the fitted features.
test = test.reindex(columns=train.columns, fill_value=0)

# Hold out 20% of the training rows for validation.
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size = 0.2, random_state = 0)

# Train the model and evaluate it on the hold-out split.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state = 0)
rf.fit(X_train, y_train)
pred = rf.predict(X_val)

from sklearn.metrics import f1_score
score = f1_score(y_val, pred, average = 'macro')
print(score)

# Predict the test set and write the result file.
pred = rf.predict(test)
submit = pd.DataFrame({'pred' : pred})
submit.to_csv('result.csv', index= False)

# Read the file back to check what was written.
print(pd.read_csv('result.csv').head())

0.7004593488873695
       pred
0      Poor
1      Good
2  Standard
3      Good
4  Standard


In [13]:
import pandas as pd

# Reload the raw data so this cell does not depend on earlier preprocessing.
train = pd.read_csv('score_train.csv')
test = pd.read_csv('score_test.csv')

# Data preprocessing: pop the label so that `train` holds features only.
target = train.pop('Credit_Score')

# Scaling: standardize the numeric columns. The scaler is fitted on the
# training frame and the same transform is then applied to the test frame.
# NOTE: the fit uses all training rows, including those that later form the
# validation split.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
cols = train.select_dtypes(include = ['int', 'float']).columns
train[cols] = scaler.fit_transform(train[cols])
test[cols] = scaler.transform(test[cols])

# One-hot encoding.
train = pd.get_dummies(train)
test= pd.get_dummies(test)
# The two frames are encoded independently, so a category level present in
# only one file would produce different dummy columns. Re-index the test frame
# onto the training columns (missing dummies are filled with 0, levels unseen
# in training are dropped) so predict() receives exactly the fitted features.
test = test.reindex(columns=train.columns, fill_value=0)

# Hold out 20% of the training rows for validation.
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size = 0.2, random_state = 0)

# Train the model and evaluate it on the hold-out split.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state = 0)
rf.fit(X_train, y_train)
pred = rf.predict(X_val)

from sklearn.metrics import f1_score
score = f1_score(y_val, pred, average = 'macro')
print(score)

# Predict the test set and write the result file.
pred = rf.predict(test)
submit = pd.DataFrame({'pred' : pred})
submit.to_csv('result.csv', index = False)


0.7020460066061173


# 2. 약물 종류 예측

### 1. 베이스라인

In [8]:
# Import the library and load the data.
import pandas as pd

# Read the drug training and test tables from the working directory.
train, test = map(pd.read_csv, ('drug_train.csv', 'drug_test.csv'))
# Last expression of the cell: shows the (rows, columns) shape of both frames.
train.shape, test.shape

((100, 6), (100, 5))

In [9]:
# Print a per-column summary (non-null count and dtype) of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          100 non-null    int64  
 1   Sex          100 non-null    object 
 2   BP           100 non-null    object 
 3   Cholesterol  100 non-null    object 
 4   Na_to_K      100 non-null    float64
 5   Drug         100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB


In [10]:
# Print a per-column summary (non-null count and dtype) of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          100 non-null    int64  
 1   Sex          100 non-null    object 
 2   BP           100 non-null    object 
 3   Cholesterol  100 non-null    object 
 4   Na_to_K      100 non-null    float64
dtypes: float64(1), int64(1), object(3)
memory usage: 4.0+ KB


In [11]:
# Total number of missing cells in each frame (per-column counts summed again).
train.isnull().sum().sum(), test.isnull().sum().sum()

(0, 0)

In [12]:
# Data preprocessing: pop the label, then one-hot encode the features.
target = train.pop('Drug')
train = pd.get_dummies(train)
test= pd.get_dummies(test)
# The two frames are encoded independently, so a category level present in
# only one file would produce different dummy columns. Re-index the test frame
# onto the training columns (missing dummies are filled with 0, levels unseen
# in training are dropped) so predict() receives exactly the fitted features.
test = test.reindex(columns=train.columns, fill_value=0)

# Hold out 20% of the training rows for validation.
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size = 0.2, random_state = 0)

# Train the model and evaluate it on the hold-out split.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state = 0)
rf.fit(X_train, y_train)
pred = rf.predict(X_val)
from sklearn.metrics import f1_score
score = f1_score(y_val, pred, average= 'macro')
print(score)

# Predict the test set and write the result file.
pred = rf.predict(test)
submit = pd.DataFrame({'pred' : pred})
submit.to_csv('result2.csv', index=  False)

1.0


In [14]:
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

# 3-fold cross-validation of the random forest on the full encoded training
# frame, scored with macro-averaged F1 (one score per fold).
f1_scores = cross_val_score(estimator=rf, X=train, y=target, scoring='f1_macro', cv=3)
print(f1_scores)
# Last expression of the cell: the mean of the fold scores.
f1_scores.mean()

[1.         0.93777778 0.78461538]


0.9074643874643874

### 2. 성능 개선

In [15]:
# Import the library and load the data.
import pandas as pd
train = pd.read_csv('drug_train.csv')
test = pd.read_csv('drug_test.csv')

# Extract the target column.
target = train.pop('Drug')

# One-hot encoding.
train = pd.get_dummies(train)
test = pd.get_dummies(test)
# The two frames are encoded independently, so a category level present in
# only one file would produce different dummy columns. Re-index the test frame
# onto the training columns (missing dummies are filled with 0, levels unseen
# in training are dropped) so predict() receives exactly the fitted features.
test = test.reindex(columns=train.columns, fill_value=0)

# Cross-validation: 3 folds, macro-averaged F1.
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state = 0)
f1_scores = cross_val_score(rf,train,target, cv=3, scoring = 'f1_macro')
print(f1_scores.mean())

# Fit on the full training frame (no hold-out split here), then predict the
# test set and write the result file.
rf.fit(train, target)
pred = rf.predict(test)
submit = pd.DataFrame({'pred' : pred})
submit.to_csv('result2.csv', index= False)

0.9074643874643874


# 3. 유리 종류 예측 

### 1. 베이스라인

In [16]:
import pandas as pd

# Read the glass training and test tables from the working directory.
train, test = (pd.read_csv(path) for path in ('glass_train.csv', 'glass_test.csv'))
# Last expression of the cell: shows the (rows, columns) shape of both frames.
train.shape, test.shape

((149, 10), (65, 9))

In [17]:
# Print a per-column summary (non-null count and dtype) of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149 entries, 0 to 148
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RI      149 non-null    float64
 1   Na      149 non-null    float64
 2   Mg      149 non-null    float64
 3   Al      149 non-null    float64
 4   Si      149 non-null    float64
 5   K       149 non-null    float64
 6   Ca      149 non-null    float64
 7   Ba      149 non-null    float64
 8   Fe      149 non-null    float64
 9   Type    149 non-null    int64  
dtypes: float64(9), int64(1)
memory usage: 11.8 KB


In [18]:
# Print a per-column summary (non-null count and dtype) of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65 entries, 0 to 64
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RI      65 non-null     float64
 1   Na      65 non-null     float64
 2   Mg      65 non-null     float64
 3   Al      65 non-null     float64
 4   Si      65 non-null     float64
 5   K       65 non-null     float64
 6   Ca      65 non-null     float64
 7   Ba      65 non-null     float64
 8   Fe      65 non-null     float64
dtypes: float64(9)
memory usage: 4.7 KB


In [19]:
# Total number of missing cells in each frame (per-column counts summed again).
train.isnull().sum().sum(), test.isnull().sum().sum()

(0, 0)

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# Data preprocessing: separate the label column from the feature columns.
target = train.pop('Type')

# Reserve 20% of the rows as a validation split (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    train,
    target,
    test_size=0.2,
    random_state=0,
)

# Fit a random forest and report the weighted F1 on the validation split.
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
pred = rf.predict(X_val)
score = f1_score(y_true=y_val, y_pred=pred, average='weighted')
print(score)

# Predict the test rows and save them as a single-column CSV.
pred = rf.predict(test)
submit = pd.DataFrame(data={'pred': pred})
submit.to_csv('result3.csv', index=False)

0.611980176686059


### 2. 성능 개선

In [22]:
# Import the libraries and load the data.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

train, test = pd.read_csv('glass_train.csv'), pd.read_csv('glass_test.csv')

# Data preprocessing: separate the label column from the feature columns.
target = train.pop('Type')

# Reserve 20% of the rows as a validation split (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    train,
    target,
    test_size=0.2,
    random_state=0,
)

# Fit a random forest of 200 trees with depth capped at 5, then report the
# weighted F1 on the validation split.
rf = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=0)
rf = rf.fit(X_train, y_train)
pred = rf.predict(X_val)
score = f1_score(y_true=y_val, y_pred=pred, average='weighted')
print(score)

# Predict the test rows and save them as a single-column CSV.
pred = rf.predict(test)
submit = pd.DataFrame(data={'pred': pred})
submit.to_csv('result3.csv', index=False)


0.6507936507936507
