## Import

In [1]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(37) # Seed 고정

## Data Load

In [38]:
train_df = pd.read_csv('./train_filled2.csv')
test_df = pd.read_csv('./test.csv')

In [39]:
train_x = train_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
train_y = train_df['Y_Class']

test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

In [47]:
train_y

0      1
1      2
2      1
3      2
4      1
      ..
593    1
594    0
595    0
596    1
597    1
Name: Y_Class, Length: 598, dtype: int64

## Data Pre-processing

In [41]:
# train_x = train_x.fillna(0)
test_x = test_x.fillna(0)

In [42]:
# qualitative to quantitative
qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])
    
    for label in np.unique(test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_x[i] = le.transform(test_x[i]) 
print('Done.')

Done.


In [43]:
train_x

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,2,0,2.480969,95.048443,0.0,45.0,10.401384,0.0,48.858131,10.048443,...,39.340,40.89,32.560,34.09,77.77,1.0,0.0,0.0,0.0,0.0
1,3,0,2.000000,95.000000,0.0,45.0,10.000000,0.0,50.000000,10.000000,...,38.890,42.82,43.920,35.34,72.55,1.0,0.0,0.0,0.0,0.0
2,2,0,2.480969,95.048443,0.0,45.0,10.401384,0.0,48.858131,10.048443,...,39.190,36.65,42.470,36.53,78.35,1.0,0.0,0.0,0.0,0.0
3,3,0,2.000000,95.000000,0.0,45.0,10.000000,0.0,50.000000,10.000000,...,37.740,39.17,52.170,30.58,71.78,1.0,0.0,0.0,0.0,0.0
4,2,0,2.480969,95.048443,0.0,45.0,10.401384,0.0,48.858131,10.048443,...,38.700,41.89,46.930,33.09,76.97,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,5,2,2.000000,95.000000,0.0,45.0,10.000000,0.0,50.000000,10.000000,...,53.810,56.14,51.310,56.06,64.90,1.0,0.0,0.0,0.0,0.0
594,2,0,2.000000,97.000000,0.0,45.0,10.000000,0.0,45.000000,10.000000,...,49.470,53.07,50.890,55.10,66.49,1.0,0.0,0.0,0.0,0.0
595,2,0,2.000000,97.000000,0.0,45.0,10.000000,0.0,45.000000,10.000000,...,53.385,55.79,52.775,56.52,64.76,1.0,0.0,0.0,0.0,0.0
596,4,1,40.000000,94.000000,0.0,45.0,11.000000,0.0,45.000000,10.000000,...,53.810,56.14,51.310,56.06,64.90,1.0,0.0,0.0,0.0,0.0


## Classification Model Fit

In [44]:
RF = RandomForestClassifier(random_state=37).fit(train_x, train_y)
print('Done.')

Done.


## Inference

In [45]:
preds = RF.predict(test_x)
print('Done.')

Done.


In [46]:
print(preds)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


## Submit

In [28]:
submit = pd.read_csv('./sample_submission.csv')

In [29]:
submit['Y_Class'] = preds

In [30]:
submit.to_csv('./baseline_submission.csv', index=False)