In [1]:
import random
import pandas as pd
import numpy as np
import os
import librosa
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from tqdm.auto import tqdm

import warnings
warnings.filterwarnings(action='ignore') 

CFG = {
    'SR':30000,
    'N_MFCC':128, # MFCC 벡터를 추출할 개수 (<=128)
    'SEED':41
}

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(CFG['SEED']) # Seed 고정

train_df = pd.read_csv('train.csv') # 모두 정상 Sample
test_df = pd.read_csv('test.csv')

def get_mfcc_feature(df):
    features = []
    for path in tqdm(df['SAMPLE_PATH']):
        # librosa패키지를 사용하여 wav 파일 load
        y, sr = librosa.load(path, sr=CFG['SR'])
        
        # librosa패키지를 사용하여 mfcc 추출
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CFG['N_MFCC'])

        y_feature = []
        # 추출된 MFCC들의 평균을 Feature로 사용
        for e in mfcc:
            y_feature.append(np.mean(e))
        features.append(y_feature)
    return features

train_features = get_mfcc_feature(train_df)
test_features = get_mfcc_feature(test_df)

import os
import glob
import re
import seaborn as sns
from numpy.fft import *
import pandas as pd
from scipy import signal
from scipy import stats
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
# from IPython.core.display import display, HTML
import warnings
warnings.filterwarnings(action="ignore")

def calc(data, window_size = 128):
#     data = data.drop(['Time (s)','Volt_R','Volt_S','Volt_T'],axis = 1)
    n = len(data)
    m = len(data.columns)
    i = 0
    df = pd.DataFrame()
    while i<n:
        df0 = data[i:i+window_size]
        indx = pd.DataFrame([i//window_size]*m,columns = ['indx'])
        
        
        kurtosis = pd.DataFrame(df0.kurtosis(),columns = ['kurtosis'])
        skew = pd.DataFrame(df0.skew(),columns = ['skew'])
        rms = pd.DataFrame(df0.apply(lambda d: np.sqrt((d ** 2).sum()/d.size)),columns = ['rms'])
        std = pd.DataFrame(df0.std(),columns = ['std'])
        mx = pd.DataFrame(df0.apply(lambda d: d.max()),columns = ['max'])
        mn = pd.DataFrame(df0.apply(lambda d: d.min()),columns = ['min'])
        mean = pd.DataFrame(df0.mean(),columns = ['mean'])
        
        crest = pd.DataFrame(df0.apply(lambda d: abs(d.max())/ np.sqrt ((d ** 2).sum()/d.size)), columns = ['crest'])
        shape = pd.DataFrame(df0.apply(lambda d: np.sqrt ((d ** 2).sum()/d.size) / abs(d.mean())), columns = ['shape'])
        impulse = pd.DataFrame(df0.apply(lambda d: abs(d.max()) / abs(d.mean())), columns = ['impulse'])
        p2p = pd.DataFrame(df0.apply(lambda d: abs(d.max()) + abs(d.min())), columns = ['p2p'])
        
        clearence = pd.DataFrame(df0.apply(lambda d: ((np.sqrt(d.abs())).sum()/len(d))**2),columns = ['clearence'])
        entropy = pd.DataFrame(df0.apply(lambda d: stats.entropy(pd.cut(d, 500).value_counts())),columns = ['entropy'])
        df1 = pd.concat([kurtosis, skew, rms, std, mn, mx, 
                         mean, clearence, entropy, crest, shape, 
                         impulse, p2p], axis=1).reset_index().rename(columns={"index": "factor"})
        df2 = pd.concat([indx,df1],axis = 1)
        df = pd.concat([df,df2])
        i+=window_size

    a = df.pivot(index='indx', columns='factor', values=['kurtosis','skew','rms','std','max','min',
                                                         'mean', "clearence", "entropy", "crest", "shape",
                                                        "impulse", "p2p"])
    a.columns = a.columns.swaplevel(0, 1)
    a.sort_index(axis=1, level=0, inplace=True)
    return a

  0%|          | 0/1279 [00:00<?, ?it/s]

  0%|          | 0/1514 [00:00<?, ?it/s]

In [2]:
train_df[train_df["FAN_TYPE"] == 2].reset_index(drop = True)

Unnamed: 0,SAMPLE_ID,SAMPLE_PATH,FAN_TYPE,LABEL
0,TRAIN_0000,./train/TRAIN_0000.wav,2,0
1,TRAIN_0003,./train/TRAIN_0003.wav,2,0
2,TRAIN_0004,./train/TRAIN_0004.wav,2,0
3,TRAIN_0005,./train/TRAIN_0005.wav,2,0
4,TRAIN_0006,./train/TRAIN_0006.wav,2,0
...,...,...,...,...
635,TRAIN_1272,./train/TRAIN_1272.wav,2,0
636,TRAIN_1274,./train/TRAIN_1274.wav,2,0
637,TRAIN_1275,./train/TRAIN_1275.wav,2,0
638,TRAIN_1276,./train/TRAIN_1276.wav,2,0


In [3]:
train_df[train_df["FAN_TYPE"] == 0].reset_index(drop = True)

Unnamed: 0,SAMPLE_ID,SAMPLE_PATH,FAN_TYPE,LABEL
0,TRAIN_0001,./train/TRAIN_0001.wav,0,0
1,TRAIN_0002,./train/TRAIN_0002.wav,0,0
2,TRAIN_0008,./train/TRAIN_0008.wav,0,0
3,TRAIN_0009,./train/TRAIN_0009.wav,0,0
4,TRAIN_0013,./train/TRAIN_0013.wav,0,0
...,...,...,...,...
634,TRAIN_1267,./train/TRAIN_1267.wav,0,0
635,TRAIN_1269,./train/TRAIN_1269.wav,0,0
636,TRAIN_1271,./train/TRAIN_1271.wav,0,0
637,TRAIN_1273,./train/TRAIN_1273.wav,0,0


In [4]:
train_0 = train_df[train_df["FAN_TYPE"] == 0].reset_index(drop = True)
train_2 = train_df[train_df["FAN_TYPE"] == 2].reset_index(drop = True)

test_0 = test_df[test_df["FAN_TYPE"] == 0].reset_index(drop = True)
test_2 = test_df[test_df["FAN_TYPE"] == 2].reset_index(drop = True)

train_0_features = get_mfcc_feature(train_0)
train_2_features = get_mfcc_feature(train_2)
test_0_features = get_mfcc_feature(test_0)
test_2_features = get_mfcc_feature(test_2)


  0%|          | 0/639 [00:00<?, ?it/s]

  0%|          | 0/640 [00:00<?, ?it/s]

  0%|          | 0/779 [00:00<?, ?it/s]

  0%|          | 0/735 [00:00<?, ?it/s]

In [5]:
print(" train_0_features.shape:", len(train_0_features),
     "\n train_2_features.shape:", len(train_2_features),
     "\n test_0_features.shape:", len(test_0_features),
     "\n test_2_features.shape:", len(test_2_features))

 train_0_features.shape: 639 
 train_2_features.shape: 640 
 test_0_features.shape: 779 
 test_2_features.shape: 735


In [6]:
######################### TRAIN_0 ##########################

feature_frame = pd.DataFrame()

feature_frame["col"] = train_0_features[0]
f_data = calc(feature_frame)

for i in tqdm(range(len(train_0_features)-1)):
    feature_frame["col"] = train_0_features[i+1]
    calc_data = calc(feature_frame)
    
    f_data = pd.concat([f_data, calc_data])

train_0_features = f_data

######################### TRAIN_2 ##########################

feature_frame = pd.DataFrame()

feature_frame["col"] = train_2_features[0]
f_data = calc(feature_frame)

for i in tqdm(range(len(train_2_features)-1)):
    feature_frame["col"] = train_2_features[i+1]
    calc_data = calc(feature_frame)
    
    f_data = pd.concat([f_data, calc_data])

train_2_features = f_data

######################### TEST_0 ##########################

feature_frame = pd.DataFrame()

feature_frame["col"] = test_0_features[0]
f_data = calc(feature_frame)

for i in tqdm(range(len(test_0_features)-1)):
    feature_frame["col"] = test_0_features[i+1]
    calc_data = calc(feature_frame)
    
    f_data = pd.concat([f_data, calc_data])

test_0_features = f_data

######################### TEST_2 ##########################

feature_frame = pd.DataFrame()

feature_frame["col"] = test_2_features[0]
f_data = calc(feature_frame)

for i in tqdm(range(len(test_2_features)-1)):
    feature_frame["col"] = test_2_features[i+1]
    calc_data = calc(feature_frame)
    
    f_data = pd.concat([f_data, calc_data])

test_2_features = f_data

  0%|          | 0/638 [00:00<?, ?it/s]

  0%|          | 0/639 [00:00<?, ?it/s]

  0%|          | 0/778 [00:00<?, ?it/s]

  0%|          | 0/734 [00:00<?, ?it/s]

In [11]:
model_0 = IsolationForest(n_estimators=1000, max_samples=1024, contamination='auto', random_state=CFG['SEED'], max_features = 13, verbose=0)
model_0.fit(train_0_features) # model train

model_2 = IsolationForest(n_estimators=1000, max_samples=1024, contamination='auto', random_state=CFG['SEED'], max_features = 13, verbose=0)
model_2.fit(train_2_features) # model train

def get_pred_label(model_pred):
    # IsolationForest 모델 출력 (1:정상, -1:불량) 이므로 (0:정상, 1:불량)로 Label 변환
    model_pred = np.where(model_pred == 1, 0, model_pred)
    model_pred = np.where(model_pred == -1, 1, model_pred)
    return model_pred

test_0_pred = model_0.predict(test_0_features) # model prediction
test_0_pred = get_pred_label(test_0_pred)

test_2_pred = model_2.predict(test_2_features) # model prediction
test_2_pred = get_pred_label(test_2_pred)

counts = 0
for i in test_0_pred:
    if i == 1:
        counts += 1
for i in test_2_pred:
    if i == 1:
        counts += 1

        
pred = []
count_0 = 0
count_2 = 0

for i in range(len(test_df)):
    if test_df["FAN_TYPE"][i] == 0:
        pred.append(test_0_pred[count_0])
        count_0 += 1
    elif test_df["FAN_TYPE"][i] == 2:
        pred.append(test_2_pred[count_2])
        count_2 += 1
print("불량의 개수:",counts)

submit = pd.read_csv('sample_submission.csv')

submit['LABEL'] = pred

submit.to_csv("convert04.csv",index=False)

불량의 개수: 773


In [50]:
print(len(test_0_pred), len(test_2_pred), len(test_df))

779 735 1514


In [61]:
pred = []
count_0 = 0
count_2 = 0

for i in range(len(test_df)):
    if test_df["FAN_TYPE"][i] == 0:
        pred.append(test_0_pred[count_0])
        count_0 += 1
    elif test_df["FAN_TYPE"][i] == 2:
        pred.append(test_2_pred[count_2])
        count_2 += 1
pred = np.array(pred)

In [62]:
test_pred = get_pred_label(pred)

submit = pd.read_csv('sample_submission.csv')

submit['LABEL'] = test_pred

submit.to_csv("convert04.csv",index=False)

In [28]:
test_pred.shape

(1514,)

In [33]:
test_pred

array([1, 1, 1, ..., 1, 1, 1])

# normal

In [2]:
train_features

factor,col,col,col,col,col,col,col,col,col,col,col,col,col
Unnamed: 0_level_1,clearence,crest,entropy,impulse,kurtosis,max,mean,min,p2p,rms,shape,skew,std
indx,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
0,2.081300,3.136436,2.449938,42.798183,106.291641,96.714211,-2.259774,-332.822968,429.537170,30.835705,13.645483,-9.550573,30.873629
0,2.108886,3.476965,2.026843,77.943680,103.992744,142.282898,-1.825458,-438.514435,580.797363,40.921580,22.417160,-9.284210,41.041477
0,2.288417,3.179535,2.277892,68.531120,107.446182,123.302338,-1.799217,-419.292419,542.594727,38.779989,21.553817,-9.626598,38.890438
0,1.964656,3.150285,2.414482,45.536301,106.261932,97.461487,-2.140303,-333.860535,431.322021,30.937354,14.454660,-9.545494,30.984501
0,1.897056,2.925884,2.360550,33.174187,107.532234,90.000252,-2.712960,-333.146606,423.146851,30.760022,11.338176,-9.697346,30.760544
...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,1.843895,2.739916,2.341461,29.761150,108.934677,83.516144,-2.806214,-331.278717,414.794861,30.481280,10.862067,-9.839322,30.471092
0,2.001020,3.140083,2.448231,45.116367,106.127670,96.614929,-2.141461,-331.944550,428.559479,30.768268,14.367889,-9.539426,30.814259
0,2.142347,2.984763,2.453548,34.475525,107.480064,92.459663,-2.681893,-335.417572,427.877228,30.977224,11.550507,-9.676153,30.982174
0,2.015984,3.036482,2.445422,35.713036,107.057167,94.095650,-2.634770,-335.176880,429.272522,30.988380,11.761321,-9.631348,30.997490


In [3]:
train_features.shape

(1279, 13)

# 불량 개수 726개 -> 정확도 90%

In [2]:
model = IsolationForest(n_estimators=200, max_samples=1024, contamination='auto', random_state=CFG['SEED'], max_features = 8, verbose=0)
model.fit(train_features)

def get_pred_label(model_pred):
    # IsolationForest 모델 출력 (1:정상, -1:불량) 이므로 (0:정상, 1:불량)로 Label 변환
    model_pred = np.where(model_pred == 1, 0, model_pred)
    model_pred = np.where(model_pred == -1, 1, model_pred)
    return model_pred

test_pred = model.predict(test_features) # model prediction


counts = 0
for i in test_pred:
    if i == 1:
        counts += 1
print("불량의 개수:",counts)
        
test_pred = get_pred_label(test_pred)

submit = pd.read_csv('sample_submission.csv')

submit['LABEL'] = test_pred

submit.to_csv("ThresholdAndExtractFeaturesVer2_4.csv",index=False)

불량의 개수: 726


# 불량 개수 728개 -> 정확도 91%

# 불량 개수 744개 -> 정확도 75%

In [103]:
model = IsolationForest(n_estimators=200, max_samples=2048, contamination='auto', random_state=CFG['SEED'], max_features = 13, verbose=0)
model.fit(train_features)

def get_pred_label(model_pred):
    # IsolationForest 모델 출력 (1:정상, -1:불량) 이므로 (0:정상, 1:불량)로 Label 변환
    model_pred = np.where(model_pred == 1, 0, model_pred)
    model_pred = np.where(model_pred == -1, 1, model_pred)
    return model_pred

# test_pred = model.predict(test_features) # model prediction

test_pred_proba = model.decision_function(test_features)

threshold = 0.052

pred = (test_pred_proba < threshold)*1

counts = 0
for i in pred:
    if i == 1:
        counts += 1
print("불량의 개수:",counts)


submit = pd.read_csv('sample_submission.csv')

submit['LABEL'] = pred

submit.to_csv("Adjustthreshold.csv",index=False)

불량의 개수: 825


In [81]:
pred

array([1, 0, 0, ..., 0, 0, 1])

In [None]:

test_pred = get_pred_label(test_pred)

submit = pd.read_csv('sample_submission.csv')

submit['LABEL'] = test_pred

submit.to_csv("ThresholdAndExtractFeaturesVer2_5.csv",index=False)

# 불량 개수 679개 -> 정확도 89%

In [8]:
model = IsolationForest(n_estimators=200, max_samples=512, contamination='auto', random_state=CFG['SEED'], max_features = 13, verbose=0)
model.fit(train_features)

def get_pred_label(model_pred):
    # IsolationForest 모델 출력 (1:정상, -1:불량) 이므로 (0:정상, 1:불량)로 Label 변환
    model_pred = np.where(model_pred == 1, 0, model_pred)
    model_pred = np.where(model_pred == -1, 1, model_pred)
    return model_pred

test_pred = model.predict(test_features) # model prediction

counts = 0
for i in test_pred:
    if i == 1:
        counts += 1
print("불량의 개수:",counts)
        
test_pred = get_pred_label(test_pred)

submit = pd.read_csv('sample_submission.csv')

submit['LABEL'] = test_pred

submit.to_csv("ThresholdAndExtractFeaturesVer2_6.csv",index=False)

불량의 개수: 679


# 불량 개수 656개 -> 정확도 88%

In [11]:
model = IsolationForest(n_estimators=200, max_samples=256, contamination='auto', random_state=CFG['SEED'], max_features = 13, verbose=0)
model.fit(train_features)

def get_pred_label(model_pred):
    # IsolationForest 모델 출력 (1:정상, -1:불량) 이므로 (0:정상, 1:불량)로 Label 변환
    model_pred = np.where(model_pred == 1, 0, model_pred)
    model_pred = np.where(model_pred == -1, 1, model_pred)
    return model_pred

test_pred = model.predict(test_features) # model prediction

counts = 0
for i in test_pred:
    if i == 1:
        counts += 1
print("불량의 개수:",counts)
        
test_pred = get_pred_label(test_pred)

submit = pd.read_csv('sample_submission.csv')

submit['LABEL'] = test_pred

submit.to_csv("ThresholdAndExtractFeaturesVer2_7.csv",index=False)

불량의 개수: 656


# adjust threshold threshold to 0.0001

In [5]:
test_pred_proba = model.decision_function(test_features)

threshold = 0.0001
pred = (test_pred_proba<threshold)*1

submit = pd.read_csv('sample_submission.csv')

submit['LABEL'] = pred

submit.to_csv("Ver2Threshold000001.csv",index=False)

# adjust threshold threshold to 0.00005

In [6]:
test_pred_proba = model.decision_function(test_features)

threshold = 0.00005
pred = (test_pred_proba<threshold)*1

submit = pd.read_csv('sample_submission.csv')

submit['LABEL'] = pred

submit.to_csv("Ver2Threshold0000005.csv",index=False)

# basic threshold

In [7]:
test_pred_proba = model.decision_function(test_features)

threshold = 0.1
pred = (test_pred_proba<threshold)*1

submit = pd.read_csv('sample_submission.csv')

submit['LABEL'] = pred

submit.to_csv("Ver2Threshold.csv",index=False)

In [8]:
pred

array([1, 1, 1, ..., 1, 1, 1])

In [10]:
import pandas as pd

## converting an array to pandas Series
pred = pd.Series(pred)

pred.value_counts()

1    1483
0      31
dtype: int64