In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import joblib

# Runtime switch: True when running inside a Kaggle kernel, False for a local checkout.
env_kaggle=False

# Resolve the three directory roots used by the later cells for the chosen environment.
if env_kaggle:
    kaggle_input="/kaggle/input/hms-harmful-brain-activity-classification"
    kaggle_working="/kaggle/working"
    model_dir='/kaggle/input/gitred/kaggle/temp'
else:
    kaggle_input="./kaggle/input"
    kaggle_working="./kaggle/working"
    model_dir='./kaggle/temp'

# Temporary files share the working directory.
kaggle_temp=kaggle_working

# Notebook display settings: show at most 10 rows, but never truncate columns.
for option_name,option_value in (('display.max_rows',10),('display.max_columns',None)):
    pd.set_option(option_name,option_value)

# Fraction of the input rows kept by the formatting cell (used as `len(f_data)*plen`).
plen=0.01


In [4]:
# Format one split into a flat feature table and write it to kaggle_working.
#   fi == 0 -> train split: vote columns (converted to probabilities) + savg features
#   fi == 1 -> test split:  eeg_id + savg features
train_data=pd.read_csv(f"{kaggle_input}/train.csv")
test_data=pd.read_csv(f"{kaggle_input}/test.csv")
f_names=['f_train.csv','f_test.csv']
fi=1

f_data=train_data.copy() if fi==0 else test_data.copy()
# Keep only the leading fraction `plen` of the rows. `.loc` slicing is label based
# and inclusive, so the row whose label equals the bound is kept as well.
f_data=f_data.loc[:len(f_data)*plen,:]

# Windowed averages of one spectrogram column, 60 features per recording.
split_name='train' if fi==0 else 'test'
spect_col='LL_0.98'
n_feats=60
savg_cols=[f'savg{t}' for t in range(n_feats)]
feature_rows=[]
for spect_id in f_data['spectrogram_id']:
    spect_data=pd.read_parquet(f"{kaggle_input}/{split_name}_spectrograms/{spect_id}.parquet")
    # NOTE(review): `.loc[start:start+5]` is inclusive, so each window sums 6 rows but
    # divides by 5. Left unchanged on purpose: the saved models were presumably trained
    # on features computed exactly this way - confirm before "fixing" it.
    avgs=[sum(spect_data.loc[start:start+5,spect_col])/5
          for start in range(0,min(len(spect_data)-6,300),5)]
    # Pad short recordings with NaN instead of '' so the columns stay numeric;
    # to_csv writes NaN as an empty field, the same as the empty string did.
    feature_rows.append(avgs[:n_feats]+[float('nan')]*(n_feats-len(avgs)))

# Attach all 60 feature columns at once rather than one cell at a time, which
# fragments the frame and is slow.
savg_frame=pd.DataFrame(feature_rows,columns=savg_cols,index=f_data.index,dtype=float)
f_data=pd.concat([f_data,savg_frame],axis=1)

# keeping cols
vote_cols=['seizure_vote','lpd_vote','gpd_vote','lrda_vote','grda_vote','other_vote']
keep_cols=(vote_cols if fi==0 else ['eeg_id'])+savg_cols
# .copy() so the assignment below writes to an independent frame, not a slice.
f_data=f_data[keep_cols].copy()
if fi==0:
    # converting votes to probs (row-wise); rows with zero total votes become NaN
    vote_totals=f_data[vote_cols].sum(axis=1)
    f_data[vote_cols]=f_data[vote_cols].div(vote_totals.where(vote_totals!=0),axis=0)

# saving to csv
f_data.to_csv(f'{kaggle_working}/{f_names[fi]}',index=False)



In [5]:
# Predict the vote distribution for every row of the formatted test set and
# write the submission file.
f_test=pd.read_csv(f"{kaggle_working}/f_test.csv")
# Fill gaps from the next row, then the previous row (both column-wise, axis=0),
# then with 0. bfill()/ffill() replace the deprecated fillna(method=...) calls.
f_test=f_test.bfill().ffill().fillna(0)


# load models: one regressor per vote column
# NOTE(review): joblib.load unpickles arbitrary objects - only load trusted files.
vote_cols=['seizure_vote','lpd_vote','gpd_vote','lrda_vote','grda_vote','other_vote']
models=[joblib.load(f'{model_dir}/models/{col}.joblib') for col in vote_cols]

# predicting: every column except the identifier is a model input
train_cols=[col for col in f_test.columns if col not in ['eeg_id']]
features=f_test[train_cols]
p_votes=pd.DataFrame(
    {col:model.predict(features) for col,model in zip(vote_cols,models)},
    index=f_test.index,
)

# generalizing probs: scale each row to sum to 1. A row whose predictions sum to
# zero cannot be scaled, so it falls back to a uniform distribution instead of
# putting NaN/inf in the submission.
row_totals=p_votes.sum(axis=1)
p_votes=p_votes.div(row_totals.where(row_totals!=0),axis=0).fillna(1/len(vote_cols))

# joining with eeg_id
p_votes=pd.concat([f_test['eeg_id'],p_votes],axis=1)

# writing sub file
p_votes.to_csv(f'{kaggle_working}/submission.csv',index=False)
print(p_votes)
print('---------------------------done------------------------------')


       eeg_id  seizure_vote  lpd_vote  gpd_vote  lrda_vote  grda_vote  \
0  3911565283      0.088635  0.154679  0.247601    0.07534   0.052989   

   other_vote  
0    0.380755  
---------------------------done------------------------------
