 # Project 1. 자율주행 센서의 안테나 성능 예측 
### Background
- 생산 공정 데이터를 활용하여 Radar 센서의 안테나 성능 예측을 위한 AI 모델 개발 (데이콘 competition)

### Summary
1. Import Libraries / Packages
2. Data Manipulation
3. Define Model)
4. Model Traing
5. Validation
6. Prediction

## 1. Import Libraries / Packages

In [2]:
import pandas as pd
import random
import os
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error , r2_score
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Seed 고정

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42)

## 2. Data manipulation

In [4]:
#raw data 불러오기
train_df = pd.read_csv('./train.csv') # features
test_df = pd.read_csv('./test.csv') # test

In [5]:
# feature, label 나누기
X_train = train_df.filter(regex='X') # Input : X Featrue
y_train = train_df.filter(regex='Y') # Output : Y Feature

In [6]:
# train셋 안에서 훈련용/검증용 나누기
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=156 )

## 3. Define Model

In [7]:
#Define the estimator
estimator = RandomForestRegressor(max_depth = 30, 
                                  random_state = 156,
                                  n_estimators = 800,
                                  n_jobs = -1,
                                  min_samples_leaf = 8,
                                  min_samples_split = 8,
                                 )

In [8]:
# Define the model : MultiOutputRegressor
my_model = MultiOutputRegressor(estimator = estimator, n_jobs = -1).fit(X_tr, y_tr) # n_jobs = -1 -> 사용 가능한 모든 프로세스 / 스레드를 사용함을 의미

## 4. Model Training

In [10]:
# 테스트 데이터 준비
X_test = test_df.drop(columns=['ID'])
# test_x_scale = scaler.fit_transform(test_x[:])
# test_x_scale

In [11]:
# 1차 predict
preds = my_model.predict(X_val)
print('Done.')


KeyboardInterrupt



## 5. Validation

In [None]:
# 1차 검증
mse = mean_squared_error(y_val, preds) #평가척도
rmse = np.sqrt(mse) #mean_squared_error함수의 squared=False 옵션으로도 구할 수 있음.

print('MSE : {0:.3f} , RMSE : {1:.3F}'.format(mse , rmse))
print('Variance score : {0:.3f}'.format(r2_score(y_val, preds))) #결정계수

In [12]:
# feature 중요도
importances_values = estimator.feature_importances_
importances = pd.Series(importances_values, index=X_train.columns)
top20 = importances.sort_values(ascending=False)[:20]
plt.figure(figsize=(8, 6))
plt.title('Feature importances Top 20')
sns.barplot(x = top20, y = top20.index)
plt.show()

NotFittedError: This RandomForestRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

## 6. Prediction

In [None]:
#최종 예측
my_fmodel = MultiOutputRegressor(estimator = estimator, n_jobs = -1).fit(X_train, y_train) # n_jobs = -1 -> 사용 가능한 모든 프로세스 / 스레드를 사용함을 의미
preds = my_fmodel.predict(X_test)
print('Done.')

In [36]:
preds

array([[  1.4710082 ,   1.19035672,   1.11763683, ..., -26.11446507,
        -26.06228634, -26.10070942],
       [  1.42689635,   1.22007794,   1.16513542, ..., -26.14551808,
        -26.13475381, -26.13242849],
       [  1.38813154,   1.09650931,   1.04128168, ..., -25.88910959,
        -25.91937843, -25.92646029],
       ...,
       [  1.2359692 ,   0.93676707,   0.95752368, ..., -26.49986265,
        -26.471607  , -26.49624231],
       [  1.17877135,   0.85173444,   0.89065956, ..., -26.46338186,
        -26.46640503, -26.49613365],
       [  1.30163586,   0.97258776,   0.98211198, ..., -26.4670594 ,
        -26.47936004, -26.48218177]])

## 7. submission

In [30]:
submit = pd.read_csv('./sample_submission.csv')

In [31]:
for idx, col in enumerate(submit.columns):
    if col=='ID':
        continue
    submit[col] = preds[:,idx-1]
print('Done.')


Done.


In [32]:
submit

Unnamed: 0,ID,Y_01,Y_02,Y_03,Y_04,Y_05,Y_06,Y_07,Y_08,Y_09,Y_10,Y_11,Y_12,Y_13,Y_14
0,TEST_00001,1.476261,1.210293,1.110736,14.039317,31.066633,16.650865,3.168970,-26.103182,-26.202742,-22.431839,24.467194,-26.095528,-26.069312,-26.086740
1,TEST_00002,1.451316,1.191355,1.154898,13.580468,30.991087,16.650387,3.172205,-26.238353,-26.184104,-22.266456,24.350268,-26.133199,-26.108978,-26.124071
2,TEST_00003,1.379797,1.132904,1.077127,13.832803,31.999874,16.608557,3.168516,-26.007367,-26.075518,-22.269052,24.298057,-25.964950,-25.996908,-25.998729
3,TEST_00004,1.454906,1.149447,1.094785,14.885575,32.118500,17.083437,3.100685,-25.707326,-25.752381,-21.836812,24.775836,-25.687304,-25.685976,-25.686804
4,TEST_00005,1.431904,1.044507,0.990575,14.922984,31.832361,17.027708,3.169441,-25.660324,-25.615455,-22.040079,24.755540,-25.543595,-25.651403,-25.640309
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39603,TEST_39604,1.257105,0.974964,1.013157,12.860907,30.748744,16.724698,3.166933,-26.540364,-26.501342,-22.959006,24.365608,-26.445116,-26.440617,-26.452671
39604,TEST_39605,1.260489,0.874111,0.930772,14.289367,31.152853,16.687768,3.184797,-26.490015,-26.489505,-22.947160,24.389693,-26.463371,-26.419499,-26.424081
39605,TEST_39606,1.257871,0.921194,0.944207,13.056239,31.033331,16.531991,3.162037,-26.527971,-26.537844,-22.815616,24.296947,-26.508609,-26.492190,-26.465053
39606,TEST_39607,1.198016,0.862122,0.856929,14.017519,30.607618,16.748117,3.170478,-26.489510,-26.579465,-22.922726,24.436565,-26.482114,-26.494048,-26.462697


In [33]:
submit.to_csv('./submit_0826_1.csv', index=False)