In [1]:
from scipy.stats import skew, kurtosis
from sklearn.manifold import TSNE

import pandas as pd
import numpy as np
import missingno
import seaborn as sns
import os
from tqdm import tqdm
# Allow up to 500 rows to be rendered when a DataFrame is displayed.
pd.set_option('display.max_rows', 500)

# Read the four CSV inputs in a fixed order and bind them in one step.
train, train_labels, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/train_features.csv',
        './data/train_labels.csv',
        './data/test_features.csv',
        './data/sample_submission.csv',
    )
)

In [2]:
train.shape

(1875000, 8)

In [3]:
train

Unnamed: 0,id,time,acc_x,acc_y,acc_z,gy_x,gy_y,gy_z
0,0,0,1.206087,-0.179371,-0.148447,-0.591608,-30.549010,-31.676112
1,0,1,1.287696,-0.198974,-0.182444,0.303100,-39.139103,-24.927216
2,0,2,1.304609,-0.195114,-0.253382,-3.617278,-44.122565,-25.019629
3,0,3,1.293095,-0.230366,-0.215210,2.712986,-53.597843,-27.454013
4,0,4,1.300887,-0.187757,-0.222523,4.286707,-57.906561,-27.961234
...,...,...,...,...,...,...,...,...
1874995,3124,595,-0.712530,-0.658357,0.293707,-29.367857,-104.013664,-76.290437
1874996,3124,596,-0.683037,-0.658466,0.329223,-30.149089,-101.796809,-76.625087
1874997,3124,597,-0.664730,-0.666625,0.364114,-27.873095,-98.776072,-79.365125
1874998,3124,598,-0.630534,-0.682565,0.373696,-23.636550,-99.139495,-80.259478


In [2]:
# Per-row Euclidean norm of the three acc_* columns and of the three gy_* columns,
# written to both frames as 'acc_t' and 'gy_t'.
for frame in (train, test):
    frame['acc_t'] = np.sqrt(frame['acc_x']**2 + frame['acc_y']**2 + frame['acc_z']**2)
    frame['gy_t'] = np.sqrt(frame['gy_x']**2 + frame['gy_y']**2 + frame['gy_z']**2)

In [2]:
# Recomputes the same 'acc_t' / 'gy_t' norms as the previous cell (identical expressions),
# then adds 'total' as their per-row sum.
train['acc_t'] = np.sqrt(train['acc_x']**2 + train['acc_y']**2 + train['acc_z']**2)
test['acc_t'] = np.sqrt(test['acc_x']**2 + test['acc_y']**2 + test['acc_z']**2)

train['gy_t'] = np.sqrt(train['gy_x']**2 + train['gy_y']**2 + train['gy_z']**2)
test['gy_t'] = np.sqrt(test['gy_x']**2 + test['gy_y']**2 + test['gy_z']**2)

# 'total' adds the two norms directly; no rescaling is applied before the sum.
train['total'] = train['acc_t'] + train['gy_t']
test['total'] = test['acc_t'] + test['gy_t']

In [2]:
# For each column prefix ('acc', 'gy') add the 3-axis norm '<prefix>_t' and the three
# 2-axis norms '<prefix>_xy', '<prefix>_yz', '<prefix>_xz' to both frames.
for prefix in ('acc', 'gy'):
    col_x, col_y, col_z = (f'{prefix}_{axis}' for axis in 'xyz')

    for frame in (train, test):
        frame[f'{prefix}_t'] = np.sqrt(frame[col_x]**2 + frame[col_y]**2 + frame[col_z]**2)

    for first, second in ((col_x, col_y), (col_y, col_z), (col_x, col_z)):
        # Suffix is built from the axis letters of the two source columns, e.g. 'xy'.
        pair_name = f'{prefix}_{first[-1]}{second[-1]}'
        for frame in (train, test):
            frame[pair_name] = np.sqrt(frame[first]**2 + frame[second]**2)

In [3]:
def ft_trans(name, train, test, fmax=50, n_samples=600):
    """Build per-id FFT amplitude features for the column ``name``.

    For every ``id`` the mean of ``name`` is subtracted, the signal is
    Fourier-transformed and scaled by ``dt = 1 / fmax``, and the magnitudes of
    the first ``n_samples // 2 + 1`` bins are kept. The skewness and the
    (Fisher) kurtosis of those magnitudes are appended as two extra features.

    Parameters
    ----------
    name : str
        Column of ``train`` / ``test`` to transform.
    train, test : pandas.DataFrame
        Long-format frames with an ``id`` column and the column ``name``.
        Neither frame is modified.
    fmax : float, default 50
        Rate used to derive the time step ``dt = 1 / fmax`` that scales the FFT.
    n_samples : int, default 600
        Number of rows every ``id`` must have.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        One row per id, in order of first appearance. Columns are ``'id'``
        followed by ``f"{name}__1"`` ... ``f"{name}__{n_samples // 2 + 3}"``;
        the last two columns hold skewness and kurtosis.

    Raises
    ------
    ValueError
        If any id does not have exactly ``n_samples`` rows.
    """
    n_bins = n_samples // 2 + 1
    n_cols = n_bins + 3  # id + amplitudes + skew + kurtosis
    dt = 1 / fmax

    def _spectrum_rows(frame):
        # One pass over the frame instead of a boolean mask per id.
        grouped = frame.groupby('id', sort=False)[name]
        rows = np.zeros((grouped.ngroups, n_cols))

        for i, (num, signal) in enumerate(tqdm(grouped, total=grouped.ngroups)):
            if len(signal) != n_samples:
                raise ValueError(
                    "id {!r} has {} rows for column {!r}; expected {}".format(
                        num, len(signal), name, n_samples
                    )
                )

            centered = (signal - signal.mean()).values
            # Full FFT then slice, so values match the one-sided half exactly.
            amplitude = np.abs(np.fft.fft(centered)[:n_bins] * dt)

            rows[i, 0] = num
            rows[i, 1:1 + n_bins] = amplitude
            rows[i, -2:] = skew(amplitude), kurtosis(amplitude, fisher=True)

        return rows

    # Naming kept as '<name>__<k>' (double underscore) for compatibility with
    # column names already used downstream.
    columns = ['id'] + ['{}__{}'.format(name, k) for k in range(1, n_cols)]

    train_df = pd.DataFrame(_spectrum_rows(train), columns=columns)
    test_df = pd.DataFrame(_spectrum_rows(test), columns=columns)

    train_df['id'] = train_df['id'].astype('int')
    test_df['id'] = test_df['id'].astype('int')

    return train_df, test_df

In [4]:
# NOTE: the next cell assigns train_fft/test_fft again (for 'gy_t'), so when the notebook
# runs top to bottom this 'acc_t' result is overwritten before it is used.
train_fft, test_fft = ft_trans('acc_t', train, test)

100%|██████████| 3125/3125 [00:17<00:00, 177.75it/s]
100%|██████████| 782/782 [00:02<00:00, 324.98it/s]


In [5]:
# FFT amplitude features of the 'gy_t' column; these are the frames sliced into
# train__fft_target / test__fft_target below.
train_fft, test_fft = ft_trans('gy_t', train, test)

100%|██████████| 3125/3125 [00:17<00:00, 182.61it/s]
100%|██████████| 782/782 [00:02<00:00, 323.10it/s]


In [7]:
train_fft.shape

(3125, 304)

In [8]:
test_fft.shape

(782, 304)

In [6]:
# Keep only the amplitude columns: drop the first column ('id') and the last two
# (skewness and kurtosis appended by ft_trans).
train__fft_target = train_fft.iloc[:,1:-2]

In [7]:
train__fft_target.shape

(3125, 301)

In [8]:
# Same slice as for train: drop 'id' (first) and skewness/kurtosis (last two columns).
test__fft_target = test_fft.iloc[:,1:-2]

In [9]:
test__fft_target.shape

(782, 301)

In [9]:
# Per-id max/min/mean/std of the raw axes, the 3-axis norms and the pairwise-axis norms.
# NOTE: the two cells that follow reassign features/X_train/X_test, so on a
# top-to-bottom run this result is replaced.
features = ['id', 'acc_x', 'acc_y', 'acc_z', 'gy_x', 'gy_y', 'gy_z', 'acc_t', 'gy_t', 'acc_xy', 'acc_yz', 'acc_xz', 'gy_xy', 'gy_yz', 'gy_xz']
X_train = train[features].groupby('id').agg(['max', 'min', 'mean', 'std'])
X_test = test[features].groupby('id').agg(['max', 'min', 'mean', 'std'])

In [3]:
# Per-id max/min/mean/std of the raw axes plus 'acc_t' and 'gy_t' only.
# NOTE: the following cell reassigns features/X_train/X_test again.
features = ['id', 'acc_x', 'acc_y', 'acc_z', 'gy_x', 'gy_y', 'gy_z', 'acc_t', 'gy_t']
X_train = train[features].groupby('id').agg(['max', 'min', 'mean', 'std'])
X_test = test[features].groupby('id').agg(['max', 'min', 'mean', 'std'])

In [3]:
# Per-id max/min/mean/std of the raw axes plus 'acc_t', 'gy_t' and 'total'.
# This is the last assignment of X_train/X_test before the FFT columns are appended.
# The result is indexed by 'id' and has two-level (column, statistic) column labels.
features = ['id', 'acc_x', 'acc_y', 'acc_z', 'gy_x', 'gy_y', 'gy_z', 'acc_t', 'gy_t', 'total']
X_train = train[features].groupby('id').agg(['max', 'min', 'mean', 'std'])
X_test = test[features].groupby('id').agg(['max', 'min', 'mean', 'std'])

In [6]:
# Target vector. Assumes train_labels rows are ordered by id the same way as the
# rows of X_train — TODO confirm.
y_train = train_labels['label']

In [12]:
# X_train is indexed by 'id' (from groupby) while train__fft_target carries a positional
# 0..n-1 index. Reset to a positional index first so the column-wise join pairs rows by
# position instead of relying on the ids happening to equal 0..n-1.
X_train = pd.concat([X_train.reset_index(drop=True), train__fft_target], axis=1)

In [13]:
# Discard the 'id' index and switch to a positional 0..n-1 index in one step.
# The earlier reset_index() + drop(['id'], axis=1) did the same thing but had to drop a
# label from two-level columns (emitting a pandas warning) and raised KeyError if the
# cell was run a second time.
X_test = X_test.reset_index(drop=True)

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [14]:
# Column-wise join of aggregate features and FFT amplitudes. Rows are matched on the
# index, so X_test must already carry a positional 0..n-1 index like test__fft_target.
X_test = pd.concat([X_test,test__fft_target],axis=1)

In [15]:
X_test

Unnamed: 0,"(acc_x, max)","(acc_x, min)","(acc_x, mean)","(acc_x, std)","(acc_y, max)","(acc_y, min)","(acc_y, mean)","(acc_y, std)","(acc_z, max)","(acc_z, min)",...,gy_t__292,gy_t__293,gy_t__294,gy_t__295,gy_t__296,gy_t__297,gy_t__298,gy_t__299,gy_t__300,gy_t__301
0,-0.275446,-1.564000,-1.018731,0.236232,0.228040,-0.470937,-0.019574,0.091641,0.286182,-0.573836,...,2.409821,2.257558,1.843405,0.911961,1.191994,1.389685,0.774847,1.264183,1.169758,1.337552
1,0.627571,-1.929033,-0.522843,0.539688,1.708743,-0.200678,0.612161,0.333015,0.671876,-1.212052,...,2.500361,2.397393,0.550313,2.571511,1.384850,2.078676,2.527210,2.137230,1.282196,3.678066
2,2.972063,-0.792916,0.506947,0.219934,1.941820,0.219008,0.903819,0.191485,0.644154,-0.484614,...,1.108236,1.253278,0.182205,1.402802,0.434885,0.360935,0.196780,1.298469,0.729166,0.754254
3,0.337281,-1.045889,-0.577603,0.431713,-0.258476,-1.294482,-0.610557,0.233601,0.702574,-0.469924,...,0.192490,0.081635,0.292015,0.089924,0.314402,0.390531,0.367464,0.252793,0.295148,0.570445
4,0.015642,-2.153047,-0.738640,0.305797,1.562602,-0.860883,0.182535,0.314294,1.037876,-0.631258,...,0.612393,2.527394,5.850573,1.284003,3.527688,1.852169,0.622027,0.856973,4.097299,0.298385
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
777,0.427159,-2.050254,-0.907299,0.352604,3.057501,-1.414874,0.488264,0.370471,0.237183,-1.517656,...,7.312410,7.587643,7.413809,6.735101,6.874923,6.987798,5.091053,4.371228,6.408055,5.192909
778,1.659451,-1.709527,-0.608731,0.663522,1.549890,-1.247963,0.371269,0.406506,0.713875,-0.996954,...,2.743801,1.582575,3.099657,2.920597,0.820636,1.055903,1.240361,3.047443,1.282748,3.114722
779,-0.085249,-2.124959,-0.753193,0.252666,1.236138,-0.443533,0.199782,0.300552,0.939270,-0.332901,...,0.358470,1.831463,1.332555,1.075824,2.164495,1.224490,1.277325,1.651631,1.011536,2.111658
780,1.438345,0.536568,0.958903,0.164880,0.076427,-0.580191,-0.320975,0.120706,0.097833,-0.774209,...,3.139355,1.750927,1.372878,3.047922,0.959172,1.743673,2.232256,2.384429,0.467780,2.318288


In [4]:
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max ranges on the training features only, then apply them to both sets.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

# X_test_scaled is produced here because the prediction cells further down read it
# before the cell that originally defined it; without this a fresh top-to-bottom run
# stops with NameError.
X_test_scaled = scaler.transform(X_test)

In [28]:
from sklearn.model_selection import train_test_split

# 80/20 shuffled hold-out split of the scaled training features.
# NOTE: the scaler was fitted on all of X_train, so the hold-out rows contributed to
# the scaling ranges.
dx_train, dx_test, dy_train, dy_test = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42, shuffle=True)

# SVC

In [142]:
from sklearn.svm import SVC

# RBF-kernel SVC; probability=True enables predict_proba, which is called on this model later.
svm = SVC(kernel= 'rbf', C=180, gamma=0.25, probability=True)

In [143]:
# Fit on the 80% training part of the hold-out split.
svm.fit(dx_train, dy_train)

SVC(C=180, gamma=0.25, probability=True)

In [163]:
# Class-label predictions for the scaled test features; stacked with the other
# models' predictions in the Ensemble section.
svm_pred = svm.predict(X_test_scaled)

In [30]:
# Accuracy of the SVC on the 20% hold-out part, printed as a percentage.
print("Accuracy: {}%".format(svm.score(dx_test, dy_test) * 100 ))

Accuracy: 80.64%


Accuracy: 80.32000000000001% 1300 0.1 acc 푸리에변환

Accuracy: 79.2% C=2150, gamma=0.005 acc + gy 푸리에변환

80.80000000000001% SVC(C=100, gamma=0.35, probability=True)   t만

Accuracy: 80.47999999999999%  SVC(C=150, gamma=0.28, probability=True)  2차원도

Accuracy: 80.64% SVC(C=180, gamma=0.25, probability=True) total

# Bagging + SVC

In [145]:
from sklearn.ensemble import BaggingClassifier

In [146]:
from sklearn.ensemble import BaggingClassifier
# Bagging ensemble of 150 estimators built from the `svm` object defined above.
# oob_score=True is requested, but no visible cell reads the out-of-bag score.
bagging = BaggingClassifier(svm, n_estimators=150, oob_score=True, n_jobs=-1, random_state=42)

In [147]:
# Fit on the training part and report accuracy on the hold-out part as a percentage.
bagging.fit(dx_train, dy_train)
print("Accuracy: {}%".format(bagging.score(dx_test, dy_test) * 100 ))

Accuracy: 77.12%


In [164]:
# Class-label predictions for the scaled test features; used as a stacking input later.
bagging_pred = bagging.predict(X_test_scaled)

Accuracy: 81.44% — bagging = BaggingClassifier(SVC(C=150, gamma=0.28), n_estimators=180, oob_score=True, n_jobs=-1, random_state=42)

# K-Fold + Bagging + SVC

In [67]:
from sklearn.model_selection import cross_val_score

# Cross-validate the bagging model on the full scaled training set with the default
# cv setting (the recorded output shows five fold scores), then print the per-fold
# scores and their mean.
scores = cross_val_score(bagging, X_train_scaled, y_train)
print("교차 검증 점수: {}".format(scores))
print("교차 검증 평균 점수: {:.2f}".format(scores.mean()))

교차 검증 점수: [0.7872 0.7968 0.8    0.8048 0.8032]
교차 검증 평균 점수: 0.80


교차 검증 점수: [0.8096 0.7728 0.8064 0.8016 0.8096]
교차 검증 평균 점수: 0.80  bagging

교차 검증 점수: [0.8128 0.7728 0.8096 0.8032 0.8144]
교차 검증 평균 점수: 0.80
svm

교차 검증 점수: [0.8096 0.7696 0.808  0.7968 0.8096]
교차 검증 평균 점수: 0.80 lgbm

# RandomForest

In [149]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 2000 entropy-criterion trees; n_jobs=20 is a fixed worker count.
rf = RandomForestClassifier(criterion='entropy', n_estimators=2000, n_jobs=20, random_state=42)

In [150]:
# Fit on the training part of the hold-out split.
rf.fit(dx_train, dy_train)

RandomForestClassifier(criterion='entropy', n_estimators=2000, n_jobs=20,
                       random_state=42)

In [165]:
# Class-label predictions for the scaled test features; used as a stacking input later.
rf_pred = rf.predict(X_test_scaled)

In [105]:
# Accuracy of the random forest on the hold-out part, printed as a percentage.
print("Accuracy: {}%".format(rf.score(dx_test, dy_test) * 100 ))

Accuracy: 75.36%


Accuracy: 68.32000000000001% 800

# LGBM

In [20]:
from catboost import CatBoostClassifier, Pool
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold

In [152]:
from sklearn.metrics import log_loss
import time
from sklearn.model_selection import StratifiedKFold, cross_val_score
from bayes_opt import BayesianOptimization

In [153]:
# LightGBM classifier with fixed hyperparameters (no search is run in the visible cells).
ml = LGBMClassifier(learning_rate=0.05, max_depth=10, n_estimators=1300, num_leaves=1024)

In [154]:
# Fit on the training part of the hold-out split.
# NOTE(review): eval_metric is given but no eval_set is passed, so presumably nothing is
# evaluated with it during training — confirm against the LightGBM documentation.
ml.fit(dx_train, dy_train, eval_metric = 'logloss')

LGBMClassifier(learning_rate=0.05, max_depth=10, n_estimators=1300,
               num_leaves=1024)

In [166]:
# Class-label predictions for the scaled test features; used as a stacking input later.
lgb_pred = ml.predict(X_test_scaled)

In [28]:
# Accuracy of the LightGBM model on the hold-out part, printed as a percentage.
print("Accuracy: {}%".format(ml.score(dx_test, dy_test) * 100 ))

Accuracy: 77.44%


LGBMClassifier(learning_rate=0.05, max_depth=10, n_estimators=1300,
               num_leaves=1024)
Accuracy: 77.12%

In [11]:
# Apply the scaler fitted on the training features to the test features.
# NOTE: cells above already read X_test_scaled, so the variable has to exist before
# they run; this assignment alone comes too late on a top-to-bottom run.
X_test_scaled = scaler.transform(X_test)

In [39]:
# Class probabilities from the single SVC for the test set. No later visible cell reads
# y_pred; the submission is filled from `final` instead.
y_pred = svm.predict_proba(X_test_scaled)

In [40]:
y_pred.shape

(782, 61)

# Ensemble

In [167]:
# Stack the four base models' test-set class predictions, one model per row
# (order: random forest, SVC, LightGBM, bagging).
pred = np.array([rf_pred, svm_pred, lgb_pred, bagging_pred])
print(pred.shape)

(4, 782)


In [168]:
# Transpose to one row per test sample and one column per base model.
# Not idempotent: running this cell twice flips the matrix back.
pred = np.transpose(pred)
print(pred.shape)

(782, 4)


In [158]:
# Meta-model for stacking. Despite the `lr_` prefix this is an RBF-kernel SVC with the
# same hyperparameters as the base `svm`.
lr_final = SVC(kernel= 'rbf', C=180, gamma=0.25, probability=True)

In [159]:
# Train the meta-model on the base models' predictions for the hold-out rows, which are
# the rows that dy_test labels. `pred` cannot be used here: it holds predictions for the
# competition test set and has a different number of rows than dy_test.
# Column order matches `pred`: random forest, SVC, LightGBM, bagging.
val_pred = np.column_stack([
    rf.predict(dx_test),
    svm.predict(dx_test),
    ml.predict(dx_test),
    bagging.predict(dx_test),
])
lr_final.fit(val_pred, dy_test)

SVC(C=180, gamma=0.25, probability=True)

In [171]:
# Meta-model class probabilities for every test row; written into the submission below.
final = lr_final.predict_proba(pred)

In [161]:
from sklearn.metrics import accuracy_score

In [162]:
# Score the meta-model on the hold-out rows. `final` is a probability matrix for the
# competition test set, so it can be compared with dy_test neither in shape nor in kind;
# accuracy needs class labels predicted for the same rows that dy_test describes.
# NOTE: lr_final is fitted against dy_test as well, so this figure is optimistic.
val_pred = np.column_stack([
    rf.predict(dx_test),
    svm.predict(dx_test),
    ml.predict(dx_test),
    bagging.predict(dx_test),
])
print('최종 메타 모델의 예측 정확도: {0:.4f}'.format(accuracy_score(dy_test , lr_final.predict(val_pred))))

최종 메타 모델의 예측 정확도: 0.9056


In [172]:
# Overwrite every column after the first ('id') with the predicted probabilities.
# Assumes the column order of predict_proba matches the submission's class columns and
# that every class column has a counterpart in `final` — TODO confirm.
submission.iloc[:,1:] = final
submission

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,51,52,53,54,55,56,57,58,59,60
0,3125,0.013972,0.023891,0.018355,0.006857,0.009440,0.013553,0.023414,0.028291,0.026208,...,0.008401,0.018673,0.013141,0.008309,0.020276,0.009534,0.008103,0.008394,0.003294,0.019987
1,3126,0.002628,0.001868,0.001993,0.001230,0.000948,0.003082,0.004890,0.004966,0.003461,...,0.000987,0.001807,0.001492,0.000895,0.001672,0.000858,0.001029,0.001017,0.000535,0.007372
2,3127,0.013442,0.024030,0.017931,0.006240,0.008934,0.012875,0.023129,0.028197,0.025841,...,0.007784,0.018271,0.012617,0.007813,0.019590,0.009044,0.007668,0.008080,0.003030,0.019275
3,3128,0.002628,0.001868,0.001993,0.001230,0.000948,0.003082,0.004890,0.004966,0.003461,...,0.000987,0.001807,0.001492,0.000895,0.001672,0.000858,0.001029,0.001017,0.000535,0.007372
4,3129,0.002628,0.001868,0.001993,0.001230,0.000948,0.003082,0.004890,0.004966,0.003461,...,0.000987,0.001807,0.001492,0.000895,0.001672,0.000858,0.001029,0.001017,0.000535,0.007372
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
777,3902,0.002628,0.001868,0.001993,0.001230,0.000948,0.003082,0.004890,0.004966,0.003461,...,0.000987,0.001807,0.001492,0.000895,0.001672,0.000858,0.001029,0.001017,0.000535,0.007372
778,3903,0.002628,0.001868,0.001993,0.001230,0.000948,0.003082,0.004890,0.004966,0.003461,...,0.000987,0.001807,0.001492,0.000895,0.001672,0.000858,0.001029,0.001017,0.000535,0.007372
779,3904,0.002628,0.001868,0.001993,0.001230,0.000948,0.003082,0.004890,0.004966,0.003461,...,0.000987,0.001807,0.001492,0.000895,0.001672,0.000858,0.001029,0.001017,0.000535,0.007372
780,3905,0.014662,0.022862,0.019056,0.007341,0.009575,0.013203,0.020970,0.024595,0.021717,...,0.008612,0.018528,0.014223,0.008564,0.018890,0.009581,0.008349,0.008468,0.003647,0.018605


In [173]:
# Make sure the target directory exists; to_csv does not create missing directories.
os.makedirs('./output', exist_ok=True)
submission.to_csv('./output/Fourier_MinMaxScaler_SVM_+gy+total_std_all_stacking.csv', index=False)