In [5]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import joblib
import os

kaggle_input="../kaggle/input"
kaggle_working="../kaggle/working"

pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)

plen=1


In [6]:
# format train and test data
train_data=pd.read_csv(f"{kaggle_input}/train.csv")
test_data=pd.read_csv(f"{kaggle_input}/test.csv")
f_names=['f_train.csv','f_test.csv']
for fi in range(len(f_names)):
    f_data=train_data if fi==0 else test_data
    f_data=f_data.loc[:len(f_data)*plen,:]
    # embedding averages for spectogram
    for i in range(len(f_data)):
        spect_id=f_data.loc[i,'spectrogram_id']
        spect_data=pd.read_parquet(f"{kaggle_input}/{'train' if fi==0 else 'test'}_spectrograms/{spect_id}.parquet")
        spect_col='LL_0.98'
        avgs=[sum(spect_data.loc[i:i+5,spect_col])/5 for i in range(0,min(len(spect_data),300),5)]
        for t in range(len(avgs)):
            f_data.loc[i,f'savg{t}']=avgs[t]

    # keeping cols
    vote_cols=['seizure_vote','lpd_vote','gpd_vote','lrda_vote','grda_vote','other_vote']
    savg_cols=[f'savg{i}' for i in range(60)]
    f_data=f_data[[col for col in vote_cols+savg_cols] if fi==0 else [col for col in ['eeg_id']+savg_cols]]
    if fi==0:
        # converting votes to probs
        for i in range(len(f_data)):
            sum_votes=sum(f_data.loc[i,vote_cols])
            f_data.loc[i,vote_cols]=f_data.loc[i,vote_cols].apply(lambda x: (x/sum_votes) )

    # saving to csv
    f_data.to_csv(f'{kaggle_working}/{f_names[fi]}',index=False)
    

    





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  f_data.loc[i,f'savg{t}']=avgs[t]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  f_data.loc[i,f'savg{t}']=avgs[t]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  f_data.loc[i,f'savg{t}']=avgs[t]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer]

In [9]:
# train the models
os.mkdir(f'{kaggle_working}/models')

f_train=pd.read_csv(f"{kaggle_working}/f_train.csv")
# f_train=f_train.iloc[:int(len(f_train)*plen),:]
f_train.fillna(method='bfill',inplace=True)
vote_cols=['seizure_vote','lpd_vote','gpd_vote','lrda_vote','grda_vote','other_vote']
models=[RandomForestRegressor(n_estimators=10, random_state=0, oob_score=True) for i in range(len(vote_cols))]
train_cols=[col for col in f_train.columns if col not in vote_cols]

for i in range(len(vote_cols)):
    print(f'training {vote_cols[i]} model')
    models[i].fit(f_train[train_cols],f_train[vote_cols[i]])
    joblib.dump(models[i],f'{kaggle_working}/models/{vote_cols[i]}.joblib')
    print(f'trained and saved at {kaggle_working}/models/{vote_cols[i]}.joblib')


training seizure_vote model
trained and saved at ./kaggle/working/models/seizure_vote.joblib
training lpd_vote model
trained and saved at ./kaggle/working/models/lpd_vote.joblib
training gpd_vote model
trained and saved at ./kaggle/working/models/gpd_vote.joblib
training lrda_vote model
trained and saved at ./kaggle/working/models/lrda_vote.joblib
training grda_vote model
trained and saved at ./kaggle/working/models/grda_vote.joblib
training other_vote model
trained and saved at ./kaggle/working/models/other_vote.joblib


In [10]:
# predict the test set votes
f_test=pd.read_csv(f"{kaggle_working}/f_test.csv")
# load models
vote_cols=['seizure_vote','lpd_vote','gpd_vote','lrda_vote','grda_vote','other_vote']
models=[f'model{i}' for i in range(len(vote_cols))]
for i in range(len(vote_cols)):
    models[i]=joblib.load(f'{kaggle_working}/models/{vote_cols[i]}.joblib')

# predicting
train_cols=[col for col in f_test.columns if col not in ['eeg_id']]
p_votes=pd.DataFrame(columns=vote_cols)
for i in range(len(models)):
    p_votes[vote_cols[i]]=models[i].predict(f_test[train_cols])
p_votes

# generalizing probs
for i in range(len(p_votes)):
   p_votes.loc[i,vote_cols]=p_votes.loc[i,vote_cols].apply(lambda x: (x/sum(p_votes.loc[i,vote_cols])) )
p_votes

# joining with eeg_id
p_votes=pd.concat([f_test['eeg_id'],p_votes],axis=1)

# writing sub file
p_votes.to_csv(f'{kaggle_working}/submission.csv',index=False)

print("done")