In [2]:
import pandas as pd
from glob import glob
from tqdm import tqdm
import warnings


In [2]:
warnings.filterwarnings(action = 'ignore')

In [3]:
def _read_csv_frames(pattern):
    """Read every CSV matching ``pattern`` into a list of DataFrames, in sorted path order.

    Args:
        pattern: glob pattern selecting the CSV files to load.

    Returns:
        A list with one DataFrame per matching file.

    Raises:
        FileNotFoundError: if no file matches, so a missing or unmounted data
            directory fails here instead of as an IndexError in a later cell.
    """
    paths = sorted(glob(pattern))
    if not paths:
        raise FileNotFoundError(f'no CSV files match {pattern!r}')
    return [pd.read_csv(path) for path in paths]


# One DataFrame per CSV. Later cells pair these lists positionally with the
# list of years, so the sorted file order matters.
know_train = _read_csv_frames('/Volumes/USB/데이콘/know_data/train/*.csv')
know_test = _read_csv_frames('/Volumes/USB/데이콘/know_data/test/*.csv')

In [5]:
# Drop the 'Unnamed: 0' column from every loaded frame (presumably the index
# column written out when the CSVs were saved — TODO confirm).
# errors='ignore' keeps this cell safe to re-run once the column is already gone,
# and the comprehensions replace eight copy-pasted in-place drops.
know_train = [df.drop(columns=['Unnamed: 0'], errors='ignore') for df in know_train]
know_test = [df.drop(columns=['Unnamed: 0'], errors='ignore') for df in know_test]

### 데이터 라벨 인코딩

In [7]:
from sklearn.preprocessing import LabelEncoder
years = ['2017', '2018', '2019', '2020']

# year -> {column name -> fitted LabelEncoder}; the test-set encoding cell looks
# encoders up here so train and test share the same category codes.
year_encoder = {}

for year, df in zip(years, know_train):
    print(year)
    encoders = {}

    for col in df.columns:
        if col == 'ID':
            continue

        try:
            # Columns whose every value converts with int() are stored as ints.
            df[col] = df[col].map(int)
        except (ValueError, TypeError, OverflowError):
            # Only conversion failures (non-numeric text, NaN/None, infinities)
            # fall back to label encoding. A bare `except:` would also swallow
            # KeyboardInterrupt and unrelated bugs and silently encode the column.
            encoder = LabelEncoder()
            df[col] = encoder.fit_transform(df[col].map(str))
            encoders[col] = encoder

    year_encoder[year] = encoders

2017
2018
2019
2020


In [8]:
# Split each year's training frame into features and target:
# 'X' drops the first column (ID) and the last column, 'y' is the last column.
train_data = {
    year: {'X': df.iloc[:, 1:-1], 'y': df.iloc[:, -1]}
    for year, df in zip(years, know_train)
}

In [9]:
years = ['2017', '2018', '2019', '2020']

# Encode the test frames with the encoders fitted on the matching training year.
for year, df in zip(years, know_test):
    print(year)

    for col in df.columns:
        # Mirror the training loop, which leaves the ID column untouched.
        if col == 'ID':
            continue

        try:
            df[col] = df[col].map(int)
        except (ValueError, TypeError, OverflowError):
            # Only conversion failures fall back to the training encoder; a bare
            # `except:` would also swallow KeyboardInterrupt and unrelated bugs.
            try:
                encoder = year_encoder[year][col]
            except KeyError:
                raise KeyError(
                    f"column {col!r} ({year}) is not integer-like in the test set "
                    "but has no encoder fitted on the training set"
                ) from None
            category_map = {category: idx for idx, category in enumerate(encoder.classes_)}
            # Categories never seen in the training set are mapped to -1 (UNK).
            df[col] = df[col].map(str).map(category_map).fillna(-1).astype('int64')

2017
2018
2019
2020


### RandomForest 진행

for문으로 네 연도를 한 번에 학습하면 메모리 부족 문제가 발생하여, 반복문을 풀어 연도별로 셀을 나누어 학습과 예측을 진행했습니다.

In [10]:
# Per-year test features: every column after the first one.
test_data = {
    year: {'X': df.iloc[:, 1:]}
    for year, df in zip(years, know_test)
}

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Fit the 2017 forest (900 trees here; the other years use 700).
dt_models = {}
X_2017 = train_data['2017']['X']
y_2017 = train_data['2017']['y']

model = RandomForestClassifier(
    n_estimators=900,
    criterion="entropy",
    max_depth=16,
    random_state=123456,
)
model.fit(X_2017, y_2017)

# Accuracy on the same rows the model was fitted on: a training-set score,
# not a validation score.
preds = model.predict(X_2017)
print(accuracy_score(y_2017, preds))
dt_models['2017'] = model
    

1.0


In [12]:
# Running list of test predictions; later cells append 2018, 2019 and 2020 in order.
dt_predicts = []

pred = dt_models['2017'].predict(test_data['2017']['X'])
dt_predicts += list(pred)

In [13]:
# Rebinding dt_models to a fresh dict drops this variable's reference to the
# previous year's model (presumably part of the memory workaround — TODO confirm).
dt_models = {}
X_2018 = train_data['2018']['X']
y_2018 = train_data['2018']['y']

model = RandomForestClassifier(
    n_estimators=700,
    criterion="entropy",
    max_depth=16,
    random_state=123456,
)
model.fit(X_2018, y_2018)

# Training-set accuracy (scored on the rows used for fitting).
preds = model.predict(X_2018)
print(accuracy_score(y_2018, preds))
dt_models['2018'] = model

1.0


In [14]:
# Append the 2018 test predictions after the 2017 ones.
pred = dt_models['2018'].predict(test_data['2018']['X'])
dt_predicts += list(pred)

In [15]:
# Rebinding dt_models to a fresh dict drops this variable's reference to the
# previous year's model (presumably part of the memory workaround — TODO confirm).
dt_models = {}
X_2019 = train_data['2019']['X']
y_2019 = train_data['2019']['y']

model = RandomForestClassifier(
    n_estimators=700,
    criterion="entropy",
    max_depth=16,
    random_state=123456,
)
model.fit(X_2019, y_2019)

# Training-set accuracy (scored on the rows used for fitting).
preds = model.predict(X_2019)
print(accuracy_score(y_2019, preds))
dt_models['2019'] = model

1.0


In [16]:
# Append the 2019 test predictions after the 2018 ones.
pred = dt_models['2019'].predict(test_data['2019']['X'])
dt_predicts += list(pred)

In [17]:
# Rebinding dt_models to a fresh dict drops this variable's reference to the
# previous year's model (presumably part of the memory workaround — TODO confirm).
dt_models = {}
X_2020 = train_data['2020']['X']
y_2020 = train_data['2020']['y']

model = RandomForestClassifier(
    n_estimators=700,
    criterion="entropy",
    max_depth=16,
    random_state=123456,
)
model.fit(X_2020, y_2020)

# Training-set accuracy (scored on the rows used for fitting).
preds = model.predict(X_2020)
print(accuracy_score(y_2020, preds))
dt_models['2020'] = model

1.0


In [18]:
# Append the 2020 test predictions last, completing the 2017 -> 2020 order.
pred = dt_models['2020'].predict(test_data['2020']['X'])
dt_predicts += list(pred)

### 결과 도출 단계

sample submission 파일을 불러와 진행하였습니다

In [3]:
# Load the sample submission file; its rows are filled with predictions in a later cell.
sample_submission_path = '/Users/home/Library/Mobile Documents/com~apple~CloudDocs/Downloads/KNOW_data/sample_submission.csv'
submission = pd.read_csv(sample_submission_path)

In [22]:
# Store the accumulated per-year predictions in the 'knowcode' column, then
# write the frame to submission.csv without the DataFrame index.
submission = submission.assign(knowcode=dt_predicts)
submission.to_csv('submission.csv', index=False)

In [23]:
# Display the submission frame as this cell's output.
submission

Unnamed: 0,idx,knowcode
0,0,29401
1,1,9999999
2,2,9999999
3,3,121102
4,4,412003
...,...,...
35226,35244,851101
35227,35245,411104
35228,35246,833001
35229,35247,140101


In [59]:
# Show only the rows whose predicted knowcode equals 9999999.
submission.loc[submission['knowcode'].eq(9999999)]

Unnamed: 0,idx,knowcode
1,1,9999999
2,2,9999999
7,7,9999999
8,8,9999999
12,12,9999999
...,...,...
9463,9463,9999999
9471,9471,9999999
9475,9475,9999999
9476,9476,9999999
