In [24]:
import gc
import pandas as pd
import numpy as np
import collections
import xgboost as xgb
from pathlib import Path
import soundfile as sf
from tqdm.auto import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression


# path = Path('/content/drive/MyDrive/TZ/stc/input/wav_data')
path = Path('/content/drive/MyDrive/TZ/stc/input/')
SEED = 13

In [3]:
def find_length() -> int:
    """
    we need make all waves equal length and we need find mean
    return: mean(all length waves) * 8000   
    """
    tmp = []
    for p in (path / 'wav_data').glob('*.wav'):
        d, sr = sf.read(p)
        tmp.append(d.shape[0]/sr)

    print(f'Length waves max : {np.max(tmp)}, min: {np.min(tmp)}, mean: {np.mean(tmp)}')    
    return int(sr * np.mean(tmp))


def make_data(path: Path, length: int) -> list:
    """
    path: Path, path to wave file
    length: int, set length to make eq. len waves
            if > we cut, < expand zeros
    after each waves make FFT, reshape and get mean features
    return: list    
    """
    d, _ = sf.read(p)   
    if d.shape[0] < length:
        d = np.append(d, [0]*(length-d.shape[0]), axis = 0)
    else:
        d = d[:length]
    d_f = np.fft.fft(d)[:len(d)//2]
    return d_f.reshape(352, 54).mean(axis = 1)


def make_target(name: str) ->pd.DataFrame:
    """
    name: str, f.txt
    """
    data = pd.read_csv(path / name, header = None)
    data['target'] = 0
    data.columns = ['id', 'target']
    data['id'] = data['id'].str.split('.').str[0]
    return data


length = find_length()
dct = collections.defaultdict(list)
for p in tqdm((path / 'wav_data').glob('*.wav')):
    dct[p.stem].append(make_data(p, length))
    dct[p.stem] = np.ravel(dct[p.stem])

Length waves max : 10.6565, min: 2.4035, mean: 4.752068014705882


0it [00:00, ?it/s]

In [4]:
#make target
f = make_target('f.txt')
m = make_target('m.txt')

#make data
result = pd.DataFrame.from_dict(
    dct,
    orient='index',
    dtype = 'float64'
)

# columns
result = result.reset_index()
result.columns = ['id'] +  [f'col_{i}' for i in range(352)]  

#merge data&target by id
data = result.merge(f, on='id', how='left')
data.fillna(1, inplace= True)
#check correct
assert all(sorted(data[data.target == 0]['id'].values) == f.id.values) == True, 'merge not correct for woman'
assert all(sorted(data[data.target == 1]['id'].values) == m.id.values) == True, 'merge not correct for man'

  subarr = np.array(values, dtype=dtype, copy=copy)


In [6]:
data.head()

Unnamed: 0,id,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19,col_20,col_21,col_22,col_23,col_24,col_25,col_26,col_27,col_28,col_29,col_30,col_31,col_32,col_33,col_34,col_35,col_36,col_37,col_38,...,col_313,col_314,col_315,col_316,col_317,col_318,col_319,col_320,col_321,col_322,col_323,col_324,col_325,col_326,col_327,col_328,col_329,col_330,col_331,col_332,col_333,col_334,col_335,col_336,col_337,col_338,col_339,col_340,col_341,col_342,col_343,col_344,col_345,col_346,col_347,col_348,col_349,col_350,col_351,target
0,A30000A2,0.608733,0.211616,0.052127,0.038498,-0.008882,-0.013668,0.006393,-0.019994,-0.021558,-0.001814,-0.010876,-0.018198,0.000682,-0.016977,-0.02116,-0.028615,0.025883,0.047539,-0.261646,0.029137,-0.067078,-0.308794,0.432776,0.15552,-0.741885,0.210501,0.021954,-0.083125,0.034002,-0.043001,0.023855,-0.01099,0.01049,-0.13762,-0.028034,-0.128635,-0.054013,0.099095,-0.062081,...,0.00975,-0.011083,0.00657,0.007731,0.00093,0.007507,0.001487,0.002958,-0.001566,0.005,0.002226,0.00179,-0.003101,0.0067,-0.001355,0.004891,-0.001753,0.004209,0.002078,0.002472,0.003935,0.000162,0.002193,0.006261,0.005018,0.000216,0.000548,-0.000327,0.003644,0.005405,0.004511,0.00495,-0.002255,0.000717,0.005937,0.003952,0.000701,0.000792,0.001419,0.0
1,A30000D3,0.223829,-0.004449,-0.020257,0.040592,0.021104,-0.04074,-0.005802,-0.021012,-0.022852,-0.016162,-0.015511,-0.018825,-0.024549,0.015027,-0.003884,0.02661,0.005097,-0.22311,0.404322,-0.122968,0.508957,0.09995,-1.528821,1.85882,-1.30297,0.12161,0.128881,0.037571,0.05489,-0.113391,0.107514,-0.051356,0.061173,0.020392,0.036249,-0.085959,0.093457,-0.320196,0.144801,...,-0.004497,0.003132,-0.001003,0.002135,0.002156,-0.001054,0.005195,0.004149,-0.004758,-0.001093,-0.001985,0.002715,-0.003402,0.001412,-0.001697,-0.000251,-0.005371,-0.002451,-0.002761,0.001001,0.000201,0.004199,-0.005072,0.004478,-0.000271,0.002013,-0.007799,0.002562,-0.000307,0.000252,-0.001681,-0.004992,0.001953,-0.000756,-0.000291,0.001325,-0.003386,0.004058,-0.003248,0.0
2,A30000A1,-0.015923,-0.007111,0.016331,0.002326,0.058463,-0.017597,-0.017817,0.017739,0.017942,-1.3e-05,-0.012193,-0.01639,-0.009737,0.007668,0.002225,0.046502,-0.159088,0.267481,-0.146186,-0.011652,-0.040167,0.275212,-0.180232,-0.071587,0.142944,0.07867,-0.068931,0.068759,-0.201813,0.068275,-0.042007,0.075226,-0.096158,0.165772,-0.071677,-0.069861,0.031927,-0.264425,0.099084,...,-0.000867,-0.002974,0.001234,0.003142,-0.001154,-0.001672,0.001865,0.000432,0.003641,-0.00347,0.003474,-7.4e-05,0.001337,0.001783,0.001581,-0.005745,0.006262,0.003316,0.000395,-0.001784,-0.00105,-0.000615,0.000749,0.000931,-0.00102,-0.001159,0.000423,0.001099,0.002585,0.001473,0.003402,0.002369,0.00045,-0.003123,0.004382,-0.001683,0.001782,0.000166,0.000146,0.0
3,A30000C3,0.175443,-0.044406,-0.033658,-0.043898,0.030009,0.027941,-0.046713,0.003748,-0.023657,0.046384,-0.013095,0.021587,0.008911,0.044094,0.029605,-0.000518,0.153857,0.562847,1.118692,-2.607494,1.599095,-1.138622,0.296654,-0.097137,0.320428,-0.54186,0.083076,-0.468505,0.112439,-0.274769,0.040204,-0.257311,0.035019,0.138029,0.164149,-0.155004,0.738321,0.404725,-1.606853,...,-0.034137,-0.010549,-0.026222,-0.003771,-0.025131,-0.013349,-0.021232,-0.009299,-0.02098,-0.001254,-0.019947,-0.00473,-0.011724,-0.006409,-0.018005,-0.002885,-0.024039,-0.017516,0.002641,-0.040071,-0.015169,-0.002349,-0.03198,-0.003227,-0.015735,-0.009834,-0.014872,-0.015621,-0.016058,-0.014425,-0.00449,-0.023531,-0.005287,-0.018372,-0.022267,-0.000578,-0.020915,-0.011531,-0.015112,0.0
4,A30000L1,0.243987,0.022751,0.014529,-0.01787,-0.012017,-0.04583,0.001703,-0.016831,0.033343,0.024066,0.004597,0.013056,0.019139,0.027395,-0.051763,0.112559,-0.164154,0.03074,0.096613,0.173386,-0.096644,-0.179253,-0.837252,0.958825,-0.115018,0.117855,0.114437,-0.168785,0.023737,0.028901,-0.080174,-0.000564,0.085063,0.061854,-0.183014,0.013965,0.171683,-0.06956,-0.231829,...,-0.001602,0.002368,-0.001981,-0.005146,0.006962,-0.002078,-0.002608,0.004656,-0.0026,0.005069,0.002904,-0.004428,-0.001707,-0.006277,-0.003018,0.000722,-0.003917,0.000518,0.000617,-0.001829,-0.006767,0.006508,-0.002572,-0.002928,-0.002175,-0.003765,0.003022,0.009693,-0.004339,0.009469,-0.000309,0.004705,0.001026,0.003306,-0.005454,0.001741,-0.002313,0.003034,0.001057,0.0


In [7]:
data = data.sample(frac=1).reset_index(drop=True)
y = data.target.values
X = data.drop(['id','target'], axis = 1)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=SEED)
clf = LogisticRegression(random_state=SEED).fit(X_train, y_train)
y_ = clf.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_)

0.41848299912816045

In [9]:
model = xgb.XGBClassifier(
    n_estimators=1000,
    max_depth=4,
    learning_rate=0.05,
    verbosity=0,
    objective='binary:logistic',
    subsample=0.95,
    colsample_bytree=0.95,
    random_state=SEED,
    n_jobs=2,
    )
model.fit(X_train, y_train)
y_ = model.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_)

0.8831734960767218

In [49]:
sk = StratifiedKFold(random_state=SEED)
tmp = []
for tidx,vidx in sk.split(X, y):
    tr, vl = X.iloc[tidx], X.iloc[vidx]
    tr_y, vl_y = y[tidx], y[vidx]
    print(tr.shape, len(tr_y), vl.shape, len(vl_y))
    model = xgb.XGBClassifier(
        n_estimators=500,
        max_depth=4,
        learning_rate=0.05,
        verbosity=0,
        objective='binary:logistic',
        subsample=0.5,
        colsample_bytree=0.5,
        random_state=SEED,
        n_jobs=2,
        )
    model.fit(tr, tr_y)
    y_ = model.predict_proba(vl)[:, 1]
    score = roc_auc_score(vl_y, y_)
    tmp.append(score)
    del model
    gc.collect()   



(163, 352) 163 (41, 352) 41
(163, 352) 163 (41, 352) 41
(163, 352) 163 (41, 352) 41
(163, 352) 163 (41, 352) 41
(164, 352) 164 (40, 352) 40


In [50]:
tmp

[0.9214285714285714,
 0.9190476190476191,
 0.9571428571428572,
 0.9476190476190476,
 0.9525]

In [26]:
tmp

[0.9166666666666666,
 0.9166666666666666,
 0.969047619047619,
 0.9047619047619047,
 0.9325]