In [1]:
import gc
import pandas as pd
import numpy as np
import collections
import xgboost as xgb
from pathlib import Path
import soundfile as sf
from tqdm.auto import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression

from IPython.display import Audio

# path = Path('/content/drive/MyDrive/TZ/stc/input/wav_data')
path = Path('/content/drive/MyDrive/TZ/stc/input/')
SEED = 13

In [2]:
def find_length() -> int:
    """
    we need make all waves equal length and we need find mean
    return: mean(all length waves) * 8000   
    """
    tmp = []
    for p in (path / 'wav_data').glob('*.wav'):
        d, sr = sf.read(p)
        tmp.append(d.shape[0]/sr)

    print(f'Length waves max : {np.max(tmp)}, min: {np.min(tmp)}, mean: {np.mean(tmp)}')    
    return int(sr * np.mean(tmp))


def make_data(path: Path, length: int) -> list:
    """
    path: Path, path to wave file
    length: int, set length to make eq. len waves
            if > we cut, < expand zeros
    after each waves make FFT, reshape and get mean features
    return: list    
    """
    d, _ = sf.read(p)   
    if d.shape[0] < length:
        d = np.append(d, [0]*(length-d.shape[0]), axis = 0)
    else:
        d = d[:length]
    d_f = np.fft.fft(d)[:len(d)//2]
    return d_f.reshape(352, 54).mean(axis = 1)


def make_target(name: str, value: int) ->pd.DataFrame:
    """
    name: str, f.txt
    value: int, taget value [0, 1]
    """
    data = pd.read_csv(path / name, header = None)
    data['target'] = value
    data.columns = ['id', 'target']
    data['id'] = data['id'].str.split('.').str[0]
    return data


def rand_path_test(
    count: int,
    man: pd.DataFrame,
    woman: pd.DataFrame
):
    dct = {}
    for _ in range(count):
        id_man = np.random.choice(man.id.values)
        id_woman = np.random.choice(woman.id.values)
        dct[id_man] = man[man.id == id_man].target.values[0]
        dct[id_woman] = woman[woman.id == id_woman].target.values[0]
    return dct

In [3]:
#make target
f = make_target('f.txt', 0)
m = make_target('m.txt', 1)

test_dct = rand_path_test(
    count = 4,
    man=f,
    woman=m
)
length = find_length()
dct = collections.defaultdict(list)
for p in tqdm((path / 'wav_data').glob('*.wav')):
    if not p.stem in test_dct.keys():
        dct[p.stem].append(make_data(p, length))
        dct[p.stem] = np.ravel(dct[p.stem])

Length waves max : 10.6565, min: 2.4035, mean: 4.752068014705882


0it [00:00, ?it/s]

In [4]:
f[f.id == 'A30002S9'].target.values[0]

0

In [5]:
#make data
result = pd.DataFrame.from_dict(
    dct,
    orient='index',
    dtype = 'float64'
)
for k in test_dct.keys():
    assert (k in result.index) == False, f'error test id in train data id:{k}'


# columns
result = result.reset_index()
result.columns = ['id'] +  [f'col_{i}' for i in range(352)]  

#merge data&target by id
data = result.merge(f, on='id', how='left')
data.fillna(1, inplace= True)
# drop test ids
f = f[~f.id.isin(test_dct.keys())].reset_index(drop=True)
m = m[~m.id.isin(test_dct.keys())].reset_index(drop=True)
#check correct
assert all(sorted(data[data.target == 0]['id'].values) == f.id.values) == True, 'merge not correct for woman'
assert all(sorted(data[data.target == 1]['id'].values) == m.id.values) == True, 'merge not correct for man'

  subarr = np.array(values, dtype=dtype, copy=copy)


In [6]:
data = data.sample(frac=1).reset_index(drop=True)
y = data.target.values
X = data.drop(['id','target'], axis = 1)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=SEED)
clf = LogisticRegression(random_state=SEED).fit(X_train, y_train)
y_ = clf.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, y_)

0.4807692307692308

In [8]:
model = xgb.XGBClassifier(
    n_estimators=1000,
    max_depth=4,
    learning_rate=0.05,
    verbosity=0,
    objective='binary:logistic',
    subsample=0.95,
    colsample_bytree=0.95,
    random_state=SEED,
    n_jobs=2,
    )
model.fit(X_train, y_train)
y_ = model.predict_proba(X_test)[:, 1]
model.save_model('xgb_model.json')
roc_auc_score(y_test, y_)

0.7528846153846154

In [9]:
sk = StratifiedKFold(n_splits=3, random_state=SEED)
tmp = []
for i, (tidx,vidx) in enumerate(sk.split(X, y)):
    tr, vl = X.iloc[tidx], X.iloc[vidx]
    tr_y, vl_y = y[tidx], y[vidx]
    model = xgb.XGBClassifier(
        n_estimators=500,
        max_depth=4,
        learning_rate=0.05,
        objective='binary:logistic',
        subsample=0.5,
        colsample_bytree=0.5,
        random_state=SEED,
        n_jobs=2,
        )
    model.fit(tr, tr_y)
    y_ = model.predict_proba(vl)[:, 1]
    score = roc_auc_score(vl_y, y_)
    print(f'Fold: {i + 1}: {score}')
    tmp.append(score)
    del model
    gc.collect()   

print('Overall Score:', np.mean(tmp), np.std(tmp))



Fold: 1: 0.8567493112947658
Fold: 2: 0.8769513314967861
Fold: 3: 0.7253787878787878
Overall Score: 0.8196931435567798 0.067198356479291


In [10]:
# %%writefile test_gender.py
import pathlib

def test_gender(
    path_file: str,
    path_model: str,
    lenght: int = 38016,
    target: int = None,
) -> None:
    """
    path_file : str, path to wave file
    path_model: str, path to pretrain model(xgb)


    return: score&(man or woman)
    """
    model = xgb.XGBClassifier()
    model.load_model(path_model)

    if not isinstance(path_file, pathlib.PosixPath):
        path_file = pathlib.Path(path_file)
        
    d, _ = sf.read(path_file)   
    if d.shape[0] < length:
        d = np.append(d, [0]*(length-d.shape[0]), axis = 0)
    else:
        d = d[:length]
    d_f = np.fft.fft(d)[:len(d)//2]
    d_f = d_f.reshape(352, 54).mean(axis = 1)
    y_ = model.predict_proba(d_f.astype('float64').reshape((1,-1)))
    idx = np.argmax(y_)
    if idx == 1: gender = 'Man'
    else: gender = 'Woman'
    result_str = f'Predict id: {path_file.stem}, score: {y_[0][idx]}, prob: {y_[0]},  gender: {gender}'
    ans = False
    if target == 0 or target == 1:
        if idx == target:
            ans = True           
            result_str  = result_str + f' Target: {ans}'
        else:           
            result_str  = result_str + f' Target: {ans}'
    print(result_str)

In [11]:
for k, v in test_dct.items(): 
    test_gender(
        path_file = path / 'wav_data' / f'{k}.wav',
        path_model = '/content/xgb_model.json',
        lenght = 38016,
        target = v
        )

Predict id: A30000X4, score: 0.9014936685562134, prob: [0.90149367 0.09850636],  gender: Woman Target: True
Predict id: A30001S4, score: 0.9645574688911438, prob: [0.03544253 0.96455747],  gender: Man Target: True
Predict id: A30000C3, score: 0.8821460604667664, prob: [0.88214606 0.11785392],  gender: Woman Target: True
Predict id: A30001A6, score: 0.8514837026596069, prob: [0.1485163 0.8514837],  gender: Man Target: True
Predict id: A30000O1, score: 0.8342952132225037, prob: [0.8342952 0.1657048],  gender: Woman Target: True
Predict id: A30003S2, score: 0.5480421781539917, prob: [0.45195782 0.5480422 ],  gender: Man Target: True
Predict id: A30000L3, score: 0.841105043888092, prob: [0.84110504 0.15889496],  gender: Woman Target: True




In [12]:
d, sr = sf.read('/content/drive/MyDrive/TZ/stc/input/wav_data/A30000X1.wav')
Audio(d, rate=sr)


In [13]:
# %load test_gender.py

In [14]:
test_gender(
    path_file = '/content/drive/MyDrive/TZ/stc/input/wav_data/A30000X1.wav',
    path_model = '/content/xgb_model.json',
    lenght = 38016,
    target = None
    )

Predict id: A30000X1, score: 0.9809482097625732, prob: [0.9809482  0.01905181],  gender: Woman


