In [None]:
import pandas as pd
import numpy as np

In [None]:
# Test Data: measurement/subject IDs for the held-out test recordings
cis_pd_testing_id=pd.read_csv('test_data_Id/cis-pd.CIS-PD_Test_Data_IDs.csv')
real_pd_testing_id=pd.read_csv('test_data_Id/real-pd.REAL-PD_Test_Data_IDs.csv')

# Training Data: IDs together with their labels
cis_pd_training_id=pd.read_csv('data_labels/CIS-PD_Training_Data_IDs_Labels.csv')
real_pd_training_id=pd.read_csv('data_labels/REAL-PD_Training_Data_IDs_Labels.csv')

# Ancillary Data: additional labelled recordings
cis_pd_ancillary_id=pd.read_csv('data_labels/CIS-PD_Ancillary_Data_IDs_Labels.csv')
real_pd_ancillary_id=pd.read_csv('data_labels/REAL-PD_Ancillary_Data_IDs_Labels.csv')

# Stack training and ancillary labels into one table per study.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
cis_pd_id=pd.concat([cis_pd_training_id,cis_pd_ancillary_id],ignore_index=True)
real_pd_id=pd.concat([real_pd_training_id,real_pd_ancillary_id],ignore_index=True)



In [None]:
## Modeling

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from sklearn.feature_selection import RFE
from lightgbm import LGBMRegressor

import lightgbm as lgb
from catboost import CatBoostRegressor, FeaturesData, Pool
from sklearn.metrics import mean_squared_error

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


#cispd=================================================================================
def SelectFromModel_lightgbm(df,label,max_feature,rn):
    """Select feature columns for one label with SelectFromModel over LightGBM.

    Rows whose `label` value is missing are dropped, a stratified 70% training
    split is drawn with `random_state=rn`, and a SelectFromModel wrapper around
    an LGBMRegressor (``max_features=max_feature``) is fitted on that split.

    Returns:
        list[str]: the five id/label column names followed by the names of the
        feature columns the selector kept.
    """
    id_and_label_cols = ['subject_id','measurement_id','on_off','dyskinesia','tremor']

    labelled = df.copy()
    labelled = labelled[labelled[label].notnull()]

    # Only the training part of the split is used for selection.
    features_fit, _, target_fit, _ = train_test_split(
        labelled.drop(id_and_label_cols, axis = 1),
        labelled[label],
        train_size=0.7,
        stratify = labelled[label],
        random_state=rn,
    )

    selector = SelectFromModel(
        LGBMRegressor(n_estimators=150, learning_rate=0.001, num_leaves=32, colsample_bytree=0.2,
                      reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40),
        max_features=max_feature,
    )
    selector.fit(features_fit, target_fit)

    kept = features_fit.columns[selector.get_support()].tolist()
    return id_and_label_cols + kept



def model_result_cispd(df_train,num,prin=0):
    """Print hold-out MSEs of LightGBM and CatBoost for the three CIS-PD labels.

    Feature subsets are first chosen per label with SelectFromModel_lightgbm
    (`num` is passed as its max_feature, seed 1). When `prin == 1` the models
    are also scored on the full frame before being scored on the subsets.
    Nothing is returned; results are printed.
    """
    targets = ['tremor','dyskinesia','on_off']
    selected = [SelectFromModel_lightgbm(df_train, target, num, 1) for target in targets]

    def _report(frame, target):
        # Score both models with the fixed seed and print their MSEs and mean.
        seed = 1
        lgb_mse = lightgbm(frame, target, seed)
        cat_mse = catboost(frame, target, seed)
        print('lightgbm test_MSE '+target+' :', lgb_mse)
        print('catboost test_MSE '+target+' :', cat_mse)
        print('Average test_MSE '+target+' :', (lgb_mse+cat_mse)/2)
        print('')

    if prin == 1:
        print('===============All Features=================')
        for target in targets:
            _report(df_train, target)

    print('==============Selected Features==============')
    for target, columns in zip(targets, selected):
        _report(df_train[columns], target)




#realpd=================================================================================
def SelectFromModel_lightgbm_realpd(df,label,max_feature,rn):
    """Select feature columns for one REAL-PD label with SelectFromModel over LightGBM.

    Rows with ``tremor == 4`` and rows whose `label` value is missing are
    dropped, a stratified 70% training split is drawn with `random_state=rn`,
    and a SelectFromModel wrapper around an LGBMRegressor
    (``max_features=max_feature``) is fitted on that split.

    Returns:
        list[str]: the five id/label column names followed by the names of the
        feature columns the selector kept.
    """
    id_and_label_cols = ['subject_id','measurement_id','on_off','dyskinesia','tremor']

    labelled = df.copy()
    labelled = labelled[labelled.tremor != 4]
    labelled = labelled[labelled[label].notnull()]

    # Only the training part of the split is used for selection.
    features_fit, _, target_fit, _ = train_test_split(
        labelled.drop(id_and_label_cols, axis = 1),
        labelled[label],
        train_size=0.7,
        stratify = labelled[label],
        random_state=rn,
    )

    selector = SelectFromModel(
        LGBMRegressor(n_estimators=150, learning_rate=0.001, num_leaves=32, colsample_bytree=0.2,
                      reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40),
        max_features=max_feature,
    )
    selector.fit(features_fit, target_fit)

    kept = features_fit.columns[selector.get_support()].tolist()
    return id_and_label_cols + kept

def label_realpd(data):
    """Attach REAL-PD labels and clinical covariates to a feature frame.

    Reads the notebook globals `real_pd_training_id`, `real_pd_ancillary_id`
    and `realpd_clinical`.

    Args:
        data: feature frame with a 'measurement_id' column.

    Returns:
        DataFrame of labels + clinical columns + features (pd.merge default
        join on 'subject_id' then 'measurement_id'), with 'subject_id'
        replaced by LabelEncoder integer codes.
    """
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    label=pd.concat([real_pd_training_id,real_pd_ancillary_id],ignore_index=True)
    label=pd.merge(label,realpd_clinical,on='subject_id')
    df_train=pd.merge(label,data,on='measurement_id')
    df_train['subject_id']=preprocessing.LabelEncoder().fit(df_train['subject_id']).transform(df_train['subject_id'])
    return df_train


def model_result_realpd(df_train,num,prin=0):
    """Print hold-out MSEs of LightGBM and CatBoost for the three REAL-PD labels.

    Feature subsets are first chosen per label with
    SelectFromModel_lightgbm_realpd (`num` is passed as its max_feature,
    seed 1). When `prin == 1` the models are also scored on the full frame
    before being scored on the subsets. Nothing is returned; results are printed.
    """
    targets = ['tremor','dyskinesia','on_off']
    selected = [SelectFromModel_lightgbm_realpd(df_train, target, num, 1) for target in targets]

    def _report(frame, target):
        # Score both models with the fixed seed and print their MSEs and mean.
        seed = 1
        lgb_mse = lightgbm_real(frame, target, seed)
        cat_mse = catboost_real(frame, target, seed)
        print('lightgbm test_MSE '+target+' :', lgb_mse)
        print('catboost test_MSE '+target+' :', cat_mse)
        print('Average test_MSE '+target+' :', (lgb_mse+cat_mse)/2)
        print('')

    if prin == 1:
        print('===============All Features=================')
        for target in targets:
            _report(df_train, target)

    print('==============Selected Features==============')
    for target, columns in zip(targets, selected):
        _report(df_train[columns], target)



## Recursive Feature Elimination ################################################################################################



#cispd=========================================
def rfem(df,label,rn,rang):
    """Grid-search RFE feature counts for one CIS-PD label and print hold-out MSEs.

    Rows with a missing `label` are dropped and a stratified 70% training
    split is drawn (`random_state=rn`). For every feature count in `rang`, an
    RFE selector over an LGBMRegressor (step=10) is fitted on that split, and
    LightGBM and CatBoost are then scored on `df` restricted to the selected
    columns.

    Args:
        df: frame containing the id/label columns plus feature columns.
        label: one of the label column names ('tremor', 'dyskinesia', 'on_off').
        rn: random seed used for every split.
        rang: iterable of `n_features_to_select` values to try.

    Returns:
        None; results are printed.
    """
    id_and_label_cols=['subject_id','measurement_id','on_off','dyskinesia','tremor']
    data1=df.copy()
    train=data1[~data1[label].isnull()]

    # Only the training part of the split is used for feature elimination.
    x_train, _, y_train, _ = train_test_split(train.drop(id_and_label_cols, axis = 1), train[label],
                                              train_size=0.7,
                                              stratify = train[label],
                                              random_state=rn)

    estimator=LGBMRegressor(n_estimators=150, learning_rate=0.001, num_leaves=32, colsample_bytree=0.2,
                    reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)

    print('Label: ',label)
    print('')
    for index in rang:
        rfe_selector = RFE(estimator=estimator, n_features_to_select=index, step=10, verbose=False)
        rfe_selector.fit(x_train, y_train)
        rfe_feature = x_train.loc[:,rfe_selector.get_support()].columns.tolist()
        print(str(len(rfe_feature)), 'selected features')
        rfe_feature=id_and_label_cols+rfe_feature
        # Score on the frame that was passed in. The previous code read the
        # notebook-global `data` here, silently ignoring the `df` argument.
        a=lightgbm(df[rfe_feature],label,rn)
        b=catboost(df[rfe_feature],label,rn)
        print('lightgbm test_MSE '+label+' :',a)
        print('catboost test_MSE '+label+' :',b)
        print('Average test_MSE '+label+' :',(a+b)/2)
        print('')
        
def rfem2(df,label,nfeat,rn):
    """Run RFE once for a CIS-PD label, print hold-out MSEs, return the columns.

    Rows with a missing `label` are dropped, a stratified 70% training split is
    drawn (`random_state=rn`), and an RFE selector over an LGBMRegressor
    (step=10, `n_features_to_select=nfeat`) is fitted on it. LightGBM and
    CatBoost are then scored on `df` restricted to the selected columns.

    Returns:
        list[str]: the five id/label column names followed by the selected
        feature names.
    """
    id_and_label_cols = ['subject_id','measurement_id','on_off','dyskinesia','tremor']

    labelled = df.copy()
    labelled = labelled[labelled[label].notnull()]

    features_fit, _, target_fit, _ = train_test_split(
        labelled.drop(id_and_label_cols, axis = 1),
        labelled[label],
        train_size=0.7,
        stratify = labelled[label],
        random_state=rn,
    )

    base_model = LGBMRegressor(n_estimators=150, learning_rate=0.001, num_leaves=32, colsample_bytree=0.2,
                               reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)

    print('Label: ',label)
    print('')

    eliminator = RFE(estimator=base_model, n_features_to_select=nfeat, step=10, verbose=False)
    eliminator.fit(features_fit, target_fit)
    kept = features_fit.columns[eliminator.get_support()].tolist()
    print(str(len(kept)), 'selected features')

    chosen = id_and_label_cols + kept
    lgb_mse = lightgbm(df[chosen],label,rn)
    cat_mse = catboost(df[chosen],label,rn)
    print('lightgbm test_MSE '+label+' :',lgb_mse)
    print('catboost test_MSE '+label+' :',cat_mse)
    print('Average test_MSE '+label+' :',(lgb_mse+cat_mse)/2)
    print('')
    return chosen


#realpd===========================================
def rfem_real(df,label,rn,rangel):
    """Grid-search RFE feature counts for one REAL-PD label and print hold-out MSEs.

    Rows with ``tremor == 4`` or a missing `label` are dropped and a stratified
    70% training split is drawn (`random_state=rn`). For every feature count in
    `rangel`, an RFE selector over an LGBMRegressor (step=10) is fitted on that
    split, then LightGBM and CatBoost are scored on `df` restricted to the
    selected columns. Nothing is returned; results are printed.
    """
    id_and_label_cols = ['subject_id','measurement_id','on_off','dyskinesia','tremor']

    labelled = df.copy()
    labelled = labelled[labelled.tremor != 4]
    labelled = labelled[labelled[label].notnull()]

    features_fit, _, target_fit, _ = train_test_split(
        labelled.drop(id_and_label_cols, axis = 1),
        labelled[label],
        train_size=0.7,
        stratify = labelled[label],
        random_state=rn,
    )

    base_model = LGBMRegressor(n_estimators=150, learning_rate=0.001, num_leaves=32, colsample_bytree=0.2,
                               reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)

    print('Label: ',label)
    print('')
    for n_keep in rangel:
        eliminator = RFE(estimator=base_model, n_features_to_select=n_keep, step=10, verbose=False)
        eliminator.fit(features_fit, target_fit)
        kept = features_fit.columns[eliminator.get_support()].tolist()
        print(str(len(kept)), 'selected features')

        chosen = id_and_label_cols + kept
        lgb_mse = lightgbm_real(df[chosen],label,rn)
        cat_mse = catboost_real(df[chosen],label,rn)
        print('lightgbm test_MSE '+label+' :',lgb_mse)
        print('catboost test_MSE '+label+' :',cat_mse)
        print('Average test_MSE '+label+' :',(lgb_mse+cat_mse)/2)
        print('')

    
    
def rfem2_real(df,label,nfeat,rn):
    """Run RFE once for a REAL-PD label, print hold-out MSEs, return the columns.

    Rows with ``tremor == 4`` or a missing `label` are dropped, a stratified
    70% training split is drawn (`random_state=rn`), and an RFE selector over
    an LGBMRegressor (step=10, `n_features_to_select=nfeat`) is fitted on it.
    LightGBM and CatBoost are then scored on `df` restricted to the selected
    columns.

    Returns:
        list[str]: the five id/label column names followed by the selected
        feature names.
    """
    id_and_label_cols = ['subject_id','measurement_id','on_off','dyskinesia','tremor']

    labelled = df.copy()
    labelled = labelled[labelled.tremor != 4]
    labelled = labelled[labelled[label].notnull()]

    features_fit, _, target_fit, _ = train_test_split(
        labelled.drop(id_and_label_cols, axis = 1),
        labelled[label],
        train_size=0.7,
        stratify = labelled[label],
        random_state=rn,
    )

    base_model = LGBMRegressor(n_estimators=150, learning_rate=0.001, num_leaves=32, colsample_bytree=0.2,
                               reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)

    print('Label: ',label)
    print('')

    eliminator = RFE(estimator=base_model, n_features_to_select=nfeat, step=10, verbose=False)
    eliminator.fit(features_fit, target_fit)
    kept = features_fit.columns[eliminator.get_support()].tolist()
    print(str(len(kept)), 'selected features')

    chosen = id_and_label_cols + kept
    lgb_mse = lightgbm_real(df[chosen],label,rn)
    cat_mse = catboost_real(df[chosen],label,rn)
    print('lightgbm test_MSE '+label+' :',lgb_mse)
    print('catboost test_MSE '+label+' :',cat_mse)
    print('Average test_MSE '+label+' :',(lgb_mse+cat_mse)/2)
    print('')
    return chosen
    
# Modeling########################################################################################

#CisPD===========================================
def lightgbm(df4,label,rn):
    """Train LightGBM on a stratified 70% split and return the hold-out MSE.

    Args:
        df4: frame containing the id/label columns plus feature columns.
        label: name of the label column to predict; rows where it is missing
            are dropped.
        rn: random seed for the train/test split.

    Returns:
        Mean squared error on the held-out 30%, rounded to 3 decimals.
    """
    train=df4[~df4[label].isnull()]

    # Splitting the dataset into the Training set and Test set
    x_train, x_test, y_train, y_test = train_test_split(train.drop(['subject_id','measurement_id','on_off','dyskinesia','tremor'], axis = 1), train[label],
                                                        train_size=0.7,
                                                        stratify = train[label],
                                                        random_state=rn)

    d_train = lgb.Dataset(x_train, label=y_train)

    params = {
        'learning_rate': 0.03,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'max_depth': 5,
        'feature_fraction': 0.8,
    }
    clf = lgb.train(params, d_train, 150)  # 150 boosting rounds

    # Only the test-set prediction is needed; the training-set prediction the
    # original computed was never used.
    y_pred=clf.predict(x_test)
    return round(mean_squared_error(y_test, y_pred),3)

def rf(df4,label,rn):
    """Train a random forest on a stratified 70% split and return the hold-out MSE.

    Args:
        df4: frame containing the id/label columns plus feature columns.
        label: name of the label column to predict; rows where it is missing
            are dropped.
        rn: random seed for the train/test split (the forest itself always
            uses random_state=42).

    Returns:
        Mean squared error on the held-out 30%, rounded to 3 decimals.
    """
    train=df4[~df4[label].isnull()]

    # Splitting the dataset into the Training set and Test set
    x_train, x_test, y_train, y_test = train_test_split(train.drop(['subject_id','measurement_id','on_off','dyskinesia','tremor'], axis = 1), train[label],
                                                        train_size=0.7,
                                                        stratify = train[label],
                                                        random_state=rn)

    clf = RandomForestRegressor(n_estimators=100,random_state = 42)
    clf.fit(x_train, y_train)

    # Only the test-set prediction is needed; the unused training-set
    # prediction has been removed.
    y_pred=clf.predict(x_test)
    return round(mean_squared_error(y_test, y_pred),3)
def categorical_index(df,cols):
    """Return the positional indexes of the columns in `cols` that exist in `df`.

    Args:
        df: DataFrame whose columns are searched.
        cols: iterable of column names.

    Returns:
        list of `df.columns.get_loc(c)` results, in the order of `cols`;
        names that are not columns of `df` are skipped.
    """
    cat=[]
    for c in cols:
        try:
            cat.append(df.columns.get_loc(c))
        except (KeyError, TypeError):
            # Missing (or unhashable) name: skip it. A bare `except:` would
            # also have hidden unrelated errors and KeyboardInterrupt.
            continue
    return cat

def catboost(df4,label,rn):
    """Train CatBoost on a stratified 70% split and return the hold-out MSE.

    Args:
        df4: frame containing the id/label columns plus feature columns.
        label: name of the label column to predict; rows where it is missing
            are dropped.
        rn: random seed for the train/test split.

    Returns:
        Mean squared error on the held-out 30%, rounded to 3 decimals.
    """
    train=df4[~df4[label].isnull()]
    X_train, X_test, y_train, y_test = train_test_split(train.drop(['subject_id','measurement_id','on_off','dyskinesia','tremor'], axis = 1), train[label],
                                                        train_size=0.7,
                                                        stratify = train[label],
                                                        random_state=rn)

    # categorical_index(..., ['']) only matches a column literally named '',
    # so in practice no categorical features are declared.
    train_dataset = Pool(data=X_train,
                         label=y_train,
                         cat_features=categorical_index(X_train,['']))

    eval_dataset = Pool(data=X_test,
                        label=y_test,
                        cat_features=categorical_index(X_train,['']))

    model = CatBoostRegressor(iterations=50, learning_rate=0.1, depth=6,verbose=False)
    model.fit(train_dataset)

    # Only the test-set prediction is needed; the unused training-set
    # prediction has been removed.
    preds = model.predict(eval_dataset)
    return round(mean_squared_error(y_test, preds),3)


#RealPD===========================================
def lightgbm_real(df2,label,rn):
    """Train LightGBM on a stratified 70% REAL-PD split and return the hold-out MSE.

    Args:
        df2: frame containing the id/label columns plus feature columns.
        label: name of the label column to predict; rows where it is missing
            are dropped.
        rn: random seed for the train/test split.

    Returns:
        Mean squared error on the held-out 30%, rounded to 3 decimals.
    """
    df4=df2.copy()
    df4=df4[df4.tremor!=4]  # rows with tremor == 4 are excluded from REAL-PD modeling
    train=df4[~df4[label].isnull()]

    # Splitting the dataset into the Training set and Test set
    x_train, x_test, y_train, y_test = train_test_split(train.drop(['subject_id','measurement_id','on_off','dyskinesia','tremor'], axis = 1), train[label],
                                                        train_size=0.7,
                                                        stratify = train[label],
                                                        random_state=rn)

    d_train = lgb.Dataset(x_train, label=y_train)

    params = {
        'learning_rate': 0.03,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'max_depth': 5,
        'feature_fraction': 0.8,
    }
    clf = lgb.train(params, d_train, 150)  # 150 boosting rounds

    # Only the test-set prediction is needed; the unused training-set
    # prediction has been removed.
    y_pred=clf.predict(x_test)
    return round(mean_squared_error(y_test, y_pred),3)

def rf_real(df2,label,rn):
    """Train a random forest on a stratified 70% REAL-PD split and return the hold-out MSE.

    Args:
        df2: frame containing the id/label columns plus feature columns.
        label: name of the label column to predict; rows where it is missing
            are dropped.
        rn: random seed for the train/test split (the forest itself always
            uses random_state=42).

    Returns:
        Mean squared error on the held-out 30%, rounded to 3 decimals.
    """
    df4=df2.copy()
    df4=df4[df4.tremor!=4]  # rows with tremor == 4 are excluded from REAL-PD modeling
    train=df4[~df4[label].isnull()]

    # Splitting the dataset into the Training set and Test set
    x_train, x_test, y_train, y_test = train_test_split(train.drop(['subject_id','measurement_id','on_off','dyskinesia','tremor'], axis = 1), train[label],
                                                        train_size=0.7,
                                                        stratify = train[label],
                                                        random_state=rn)

    clf = RandomForestRegressor(n_estimators=100,random_state = 42)
    clf.fit(x_train, y_train)

    # Only the test-set prediction is needed; the unused training-set
    # prediction has been removed.
    y_pred=clf.predict(x_test)
    return round(mean_squared_error(y_test, y_pred),3)
def categorical_index(df,cols):
    """Return the positional indexes of the columns in `cols` that exist in `df`.

    Args:
        df: DataFrame whose columns are searched.
        cols: iterable of column names.

    Returns:
        list of `df.columns.get_loc(c)` results, in the order of `cols`;
        names that are not columns of `df` are skipped.
    """
    cat=[]
    for c in cols:
        try:
            cat.append(df.columns.get_loc(c))
        except (KeyError, TypeError):
            # Missing (or unhashable) name: skip it. A bare `except:` would
            # also have hidden unrelated errors and KeyboardInterrupt.
            continue
    return cat

def catboost_real(df2,label,rn):
    """Train CatBoost on a stratified 70% REAL-PD split and return the hold-out MSE.

    Args:
        df2: frame containing the id/label columns plus feature columns.
        label: name of the label column to predict; rows where it is missing
            are dropped.
        rn: random seed for the train/test split.

    Returns:
        Mean squared error on the held-out 30%, rounded to 3 decimals.
    """
    df4=df2.copy()
    df4=df4[df4.tremor!=4]  # rows with tremor == 4 are excluded from REAL-PD modeling
    train=df4[~df4[label].isnull()]

    X_train, X_test, y_train, y_test = train_test_split(train.drop(['subject_id','measurement_id','on_off','dyskinesia','tremor'], axis = 1), train[label],
                                                        train_size=0.7,
                                                        stratify = train[label],
                                                        random_state=rn)

    # categorical_index(..., ['']) only matches a column literally named '',
    # so in practice no categorical features are declared.
    train_dataset = Pool(data=X_train,
                         label=y_train,
                         cat_features=categorical_index(X_train,['']))

    eval_dataset = Pool(data=X_test,
                        label=y_test,
                        cat_features=categorical_index(X_train,['']))

    model = CatBoostRegressor(iterations=50, learning_rate=0.1, depth=6,verbose=False)
    model.fit(train_dataset)

    # Only the test-set prediction is needed; the unused training-set
    # prediction has been removed.
    preds = model.predict(eval_dataset)
    return round(mean_squared_error(y_test, preds),3)



#cispd============================================================
def get_lgbm_varimp(df4,label, max_vars=50):
    """Fit LightGBM for one CIS-PD label and plot its top feature importances.

    Rows with a missing `label` are dropped, a stratified 70% training split
    (seed 1) is used to train a 150-round LightGBM model, and the `max_vars`
    most important features are drawn as a bar plot. The figure is shown and
    written to 'lgbm_importances-01.png' (the same path on every call).

    Returns:
        pandas Series of feature names ordered by decreasing importance.
    """
    id_and_label_cols = ['subject_id','measurement_id','on_off','dyskinesia','tremor']
    labelled = df4[df4[label].notnull()]

    features_fit, _, target_fit, _ = train_test_split(
        labelled.drop(id_and_label_cols, axis = 1),
        labelled[label],
        train_size=0.7,
        stratify = labelled[label],
        random_state=1,
    )

    lgb_params = {
        'learning_rate': 0.03,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'max_depth': 5,
        'feature_fraction': 0.8,
    }
    booster = lgb.train(lgb_params, lgb.Dataset(features_fit, label=target_fit), 150)

    importance_table = pd.DataFrame({'Value':booster.feature_importance(),'Feature':features_fit.columns})
    feature_imp = importance_table.sort_values(by='Value',ascending=False)
    # The plotted rows are re-sorted exactly as before so tie ordering is unchanged.
    top_rows = feature_imp.sort_values(by="Value", ascending=False)[0:max_vars]

    plt.figure(figsize=(40, 40))
    sns.set(font_scale = 5)
    sns.barplot(x="Value", y="Feature", data=top_rows)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances-01.png')
    plt.show()
    return feature_imp.Feature



#realpd============================================================
def get_lgbm_varimp_realpd(df4,label, max_vars=50):
    """Fit LightGBM for one REAL-PD label and plot its top feature importances.

    Rows with ``tremor == 4`` or a missing `label` are dropped, a stratified
    70% training split (seed 1) is used to train a 150-round LightGBM model,
    and the `max_vars` most important features are drawn as a bar plot. The
    figure is shown and written to 'lgbm_importances-01.png' (the same path on
    every call).

    Returns:
        pandas Series of feature names ordered by decreasing importance.
    """
    id_and_label_cols = ['subject_id','measurement_id','on_off','dyskinesia','tremor']

    labelled = df4.copy()
    labelled = labelled[labelled.tremor != 4]
    labelled = labelled[labelled[label].notnull()]

    features_fit, _, target_fit, _ = train_test_split(
        labelled.drop(id_and_label_cols, axis = 1),
        labelled[label],
        train_size=0.7,
        stratify = labelled[label],
        random_state=1,
    )

    lgb_params = {
        'learning_rate': 0.03,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'max_depth': 5,
        'feature_fraction': 0.8,
    }
    booster = lgb.train(lgb_params, lgb.Dataset(features_fit, label=target_fit), 150)

    importance_table = pd.DataFrame({'Value':booster.feature_importance(),'Feature':features_fit.columns})
    feature_imp = importance_table.sort_values(by='Value',ascending=False)
    # The plotted rows are re-sorted exactly as before so tie ordering is unchanged.
    top_rows = feature_imp.sort_values(by="Value", ascending=False)[0:max_vars]

    plt.figure(figsize=(40, 40))
    sns.set(font_scale = 5)
    sns.barplot(x="Value", y="Feature", data=top_rows)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances-01.png')
    plt.show()
    return feature_imp.Feature
    

# Cis PD

In [None]:
# Part 1: first CIS-PD training feature file (file name suggests the features
# up to the Hurst exponent -- TODO confirm). Merged with the other parts on
# 'measurement_id' in the "Combined Features" cell.
data1=pd.read_csv('analysis2_cispd_comp_training_abhiroop_tillhurst.csv')
print(data1.shape)
data1.head()

In [None]:
# Part 2: wavelet features for the CIS-PD training recordings.
data2=pd.read_csv('cispd_wavelet_training_features.csv')
print(data2.shape)
data2.head()

In [None]:
# Part 3: last set of features for the CIS-PD training recordings.
data3=pd.read_csv('cispd_comp_training_abhiroop_lastfeatures.csv')
print(data3.shape)
data3.head()

In [None]:
# Preprocessed clinical features for CIS-PD; joined on 'subject_id' in the
# "Combined Features" cell.
data4=pd.read_csv('cispd_clinical_preprocessed.csv')
print(data4.shape)
data4.head()

In [None]:
# Combined Features: join the three feature parts on 'measurement_id', attach
# the labels (cis_pd_id = training + ancillary), then add per-subject clinical
# features. pd.merge defaults to an inner join, so only keys present on both
# sides of each merge are kept.
data=pd.merge(data1,data2,on=['measurement_id'])
data=pd.merge(data,data3,on=['measurement_id'])
data=pd.merge(cis_pd_id,data,on='measurement_id')
data=pd.merge(data,data4,on=['subject_id'])

# CIS-PD Modeling

### Optimal feature sets for tremor, dyskinesia and on_off on CIS-PD, selected with Recursive Feature Elimination

In [None]:
# Note: the feature counts used here (125 / 150 / 250) were chosen with a
# Recursive Feature Elimination grid search. Each rfem2 call prints hold-out
# MSEs and returns the id/label columns followed by the selected features
# (seed 1).
feat_tremor_cispd=rfem2(data,'tremor',125,1)
feat_dyskinesia_cispd=rfem2(data,'dyskinesia',150,1)
feat_on_off_cispd=rfem2(data,'on_off',250,1)

### Feature importance graphs



In [None]:
# Plot the 50 most important features per label, using each label's
# RFE-selected columns. Every call saves to the same file
# ('lgbm_importances-01.png'), so only the last plot remains on disk.
imp1=get_lgbm_varimp(data[feat_tremor_cispd],'tremor', max_vars=50)

imp2=get_lgbm_varimp(data[feat_dyskinesia_cispd],'dyskinesia', max_vars=50)

imp3=get_lgbm_varimp(data[feat_on_off_cispd],'on_off', max_vars=50)

# Real PD

In [None]:
# Preprocessed clinical data for REAL-PD; label_realpd() reads this global and
# joins it on 'subject_id'.
realpd_clinical=pd.read_csv('realpd_clinical_preprocessed.csv')
print(realpd_clinical.shape)
realpd_clinical.head()

In [None]:
# Part 1 (smartphone): first REAL-PD training feature file (file name suggests
# the features up to the Hurst exponent -- TODO confirm).
realpd_1_smartphone=pd.read_csv('analysis2_realpd_comp_training_abhiroop_tillhurst_smartphone.csv')
print(realpd_1_smartphone.shape)

# Part 1 (smartwatch): same feature family computed from the smartwatch data.
realpd_1_smartwatch=pd.read_csv('analysis2_realpd_comp_training_abhiroop_tillhurst_smartwatch.csv')
print(realpd_1_smartwatch.shape)

In [None]:
# Part 2 (smartphone): wavelet features for the REAL-PD training recordings.
realpd_2_smartphone=pd.read_csv('realpd_wavelet_features_smartphone_training.csv')
print(realpd_2_smartphone.shape)

# Part 2 (smartwatch): wavelet features for the REAL-PD training recordings.
realpd_2_smartwatch=pd.read_csv('realpd_wavelet_features_smartwatch_training.csv')
print(realpd_2_smartwatch.shape)

In [None]:
# Part 3 (smartphone): last set of features for the REAL-PD training recordings.
realpd_3_smartphone=pd.read_csv('realpd_comp_training_abhiroop_lastfeatures_smartphone.csv')
print(realpd_3_smartphone.shape)

# Part 3 (smartwatch): last set of features for the REAL-PD training recordings.
realpd_3_smartwatch=pd.read_csv('realpd_comp_training_abhiroop_lastfeatures_smartwatch.csv')
print(realpd_3_smartwatch.shape)

In [None]:
# SmartPhone: join the three smartphone feature parts on 'measurement_id'
realpd_smartphone=pd.merge(realpd_1_smartphone,realpd_2_smartphone,on='measurement_id')
realpd_smartphone=pd.merge(realpd_smartphone,realpd_3_smartphone,on='measurement_id')

# Rename every feature column to '<name>_phone' so it cannot collide with the
# smartwatch features, then put 'measurement_id' back (unsuffixed, last column).
realpd_smartphone_dummy=realpd_smartphone.copy()
ls=list(realpd_smartphone.columns)
ls.remove('measurement_id')
# add_suffix returns a new frame, so the column assignment below no longer
# writes into a slice of the merged frame (SettingWithCopyWarning).
realpd_smartphone=realpd_smartphone[ls].add_suffix('_phone')
realpd_smartphone['measurement_id']=realpd_smartphone_dummy['measurement_id']
print(realpd_smartphone.shape)
realpd_smartphone.head()

In [None]:
# SmartWatch: join the three smartwatch feature parts on measurement + device
realpd_smartwatch=pd.merge(realpd_1_smartwatch,realpd_2_smartwatch,on=['measurement_id','device_id_acc'])
realpd_smartwatch=pd.merge(realpd_smartwatch,realpd_3_smartwatch,on=['measurement_id','device_id_acc'])

# Rename the feature columns to '<name>_watch', then restore the two key columns.
realpd_smartwatch_dummy=realpd_smartwatch.copy()
# NOTE(review): [1:] skips the first column of the merged frame before the
# rename -- confirm which column that is in the CSVs.
ls=list(realpd_smartwatch.columns)[1:]
ls.remove('measurement_id')
# add_suffix returns a new frame, so the column assignments below no longer
# write into a slice of the merged frame (SettingWithCopyWarning).
realpd_smartwatch=realpd_smartwatch[ls].add_suffix('_watch')
realpd_smartwatch[['measurement_id','device_id_acc']]=realpd_smartwatch_dummy[['measurement_id','device_id_acc']]

# Encode the device identifier as integer codes so it can be used as a feature.
realpd_smartwatch['device_id_acc'] = preprocessing.LabelEncoder().fit(realpd_smartwatch['device_id_acc']).transform(realpd_smartwatch['device_id_acc'])

print(realpd_smartwatch.shape)
realpd_smartwatch.head()


In [None]:
# Smartphone and smartwatch features together, joined on 'measurement_id'
# (pd.merge default inner join: only measurements present in both frames).
realpd=pd.merge(realpd_smartphone,realpd_smartwatch,on=['measurement_id'])
print(realpd.shape)
realpd.head()

### Optimal feature sets for tremor, dyskinesia and on_off on the REAL-PD smartwatch, smartphone and combined datasets, selected with Recursive Feature Elimination

In [None]:
# Note: the feature counts used here were chosen with a Recursive Feature
# Elimination grid search. Each labelled frame is built once and reused for
# the three labels instead of re-running label_realpd() for every call;
# rfem2_real works on copies and does not modify the frame it is given.
realpd_watch_labelled=label_realpd(realpd_smartwatch)
feat_tremor_watch=rfem2_real(realpd_watch_labelled,'tremor',75,1)
feat_dyskinesia_watch=rfem2_real(realpd_watch_labelled,'dyskinesia',100,1)
feat_on_off_watch=rfem2_real(realpd_watch_labelled,'on_off',275,1)

realpd_phone_labelled=label_realpd(realpd_smartphone)
feat_tremor_phone=rfem2_real(realpd_phone_labelled,'tremor',175,1)
feat_dyskinesia_phone=rfem2_real(realpd_phone_labelled,'dyskinesia',125,1)
feat_on_off_phone=rfem2_real(realpd_phone_labelled,'on_off',200,1)

realpd_both_labelled=label_realpd(realpd)
feat_tremor_both=rfem2_real(realpd_both_labelled,'tremor',150,1)
feat_dyskinesia_both=rfem2_real(realpd_both_labelled,'dyskinesia',175,1)
feat_on_off_both=rfem2_real(realpd_both_labelled,'on_off',200,1)

# Prediction on test data

In [None]:
#CisPD===========================================
def lightgbm_finalpredicion(df4,df4_test,label):
    """Train LightGBM on all labelled rows and predict `label` for the test frame.

    Args:
        df4: training frame with the id/label columns plus feature columns;
            rows where `label` is missing are dropped.
        df4_test: test frame; its 'subject_id' and 'measurement_id' columns
            are dropped and everything else is passed to the model.
        label: name of the label column to predict.

    Returns:
        The model's predictions for the rows of `df4_test`.
    """
    labelled = df4[df4[label].notnull()].copy()
    holdout = df4_test.copy()

    train_features = labelled.drop(['subject_id','measurement_id','on_off','dyskinesia','tremor'], axis = 1)
    test_features = holdout.drop(['subject_id','measurement_id'], axis = 1)

    lgb_params = {
        'learning_rate': 0.03,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'max_depth': 5,
        'feature_fraction': 0.8,
    }
    booster = lgb.train(lgb_params, lgb.Dataset(train_features, label=labelled[label]), 150)
    return booster.predict(test_features)

def categorical_index(df,cols):
    """Return the positional indexes of the columns in `cols` that exist in `df`.

    Args:
        df: DataFrame whose columns are searched.
        cols: iterable of column names.

    Returns:
        list of `df.columns.get_loc(c)` results, in the order of `cols`;
        names that are not columns of `df` are skipped.
    """
    cat=[]
    for c in cols:
        try:
            cat.append(df.columns.get_loc(c))
        except (KeyError, TypeError):
            # Missing (or unhashable) name: skip it. A bare `except:` would
            # also have hidden unrelated errors and KeyboardInterrupt.
            continue
    return cat

def catboost_finalpredicion(df4,df4_test,label):
    """Train CatBoost on all labelled rows and predict `label` for the test frame.

    Args:
        df4: training frame with the id/label columns plus feature columns;
            rows where `label` is missing are dropped.
        df4_test: test frame; its 'subject_id' and 'measurement_id' columns
            are dropped and everything else is passed to the model.
        label: name of the label column to predict.

    Returns:
        The model's predictions for the rows of `df4_test`.
    """
    labelled = df4[df4[label].notnull()].copy()
    holdout = df4_test.copy()

    train_features = labelled.drop(['subject_id','measurement_id','on_off','dyskinesia','tremor'], axis = 1)
    test_features = holdout.drop(['subject_id','measurement_id'], axis = 1)

    # Both pools take their categorical indexes from the training features.
    train_pool = Pool(data=train_features,
                      label=labelled[label],
                      cat_features=categorical_index(train_features,['']))
    test_pool = Pool(data=test_features,
                     cat_features=categorical_index(train_features,['']))

    regressor = CatBoostRegressor(iterations=50, learning_rate=0.1, depth=6,verbose=False)
    regressor.fit(train_pool)
    return regressor.predict(test_pool)

def label_realpd(data):
    """Attach REAL-PD labels and clinical covariates to a feature frame.

    Reads the notebook globals `real_pd_training_id`, `real_pd_ancillary_id`
    and `realpd_clinical`. This redefinition replaces the earlier
    `label_realpd` in this notebook and, unlike it, leaves 'subject_id'
    unencoded.

    Args:
        data: feature frame with a 'measurement_id' column.

    Returns:
        DataFrame of labels + clinical columns + features (pd.merge default
        join on 'subject_id' then 'measurement_id').
    """
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    label=pd.concat([real_pd_training_id,real_pd_ancillary_id],ignore_index=True)
    label=pd.merge(label,realpd_clinical,on='subject_id')
    df_train=pd.merge(label,data,on='measurement_id')
    return df_train


## Cis PD

In [None]:
### Part 1: first CIS-PD test feature file (test counterpart of data1)
data1_test=pd.read_csv('analysis2_cispd_comp_testing_abhiroop_tillhurst.csv')
print(data1_test.shape)

### Part 2: wavelet features for the CIS-PD test recordings
data2_test=pd.read_csv('cispd_wavelet_testing_features.csv')
print(data2_test.shape)

### Part 3: last set of features for the CIS-PD test recordings
data3_test=pd.read_csv('cispd_comp_testing_abhiroop_lastfeatures.csv')
print(data3_test.shape)

### Clinical features: re-reads the same file as the training section and
### rebinds the global `data4`
data4=pd.read_csv('cispd_clinical_preprocessed.csv')

### Combined features: same merge sequence as for training, with the test IDs
### (cis_pd_testing_id) taking the place of the labelled IDs
data_test=pd.merge(data1_test,data2_test,on=['measurement_id'])
data_test=pd.merge(data_test,data3_test,on=['measurement_id'])
data_test=pd.merge(cis_pd_testing_id,data_test,on='measurement_id')
data_test=pd.merge(data_test,data4,on=['subject_id'])
print(data_test.shape)
data_test.head()

## Realpd

In [None]:
#training smartwatch data
realpd_1_smartwatch_dum=pd.read_csv('analysis2_realpd_comp_training_abhiroop_tillhurst_smartwatch.csv')
realpd_2_smartwatch_dum=pd.read_csv('realpd_wavelet_features_smartwatch_training.csv')
realpd_3_smartwatch_dum=pd.read_csv('realpd_comp_training_abhiroop_lastfeatures_smartwatch.csv')
realpd_smartwatch_dum=pd.merge(realpd_1_smartwatch_dum,realpd_2_smartwatch_dum,on=['measurement_id','device_id_acc'])
realpd_smartwatch_dum=pd.merge(realpd_smartwatch_dum,realpd_3_smartwatch_dum,on=['measurement_id','device_id_acc'])


##Clinical Data
realpd_clinical_test=pd.read_csv('realpd_clinical_preprocessed.csv')
label_test=pd.merge(real_pd_testing_id,realpd_clinical_test,on='subject_id')
print('label: ',label_test.shape)

##Part1: till hurst from smartphone
realpd_1_smartphone_test=pd.read_csv('analysis2_realpd_comp_testing_abhiroop_tillhurst_smartphone.csv')
print('part1_phone: ',realpd_1_smartphone_test.shape)

##Part1: till hurst from smartwatch
realpd_1_smartwatch_test=pd.read_csv('analysis2_realpd_comp_testing_abhiroop_tillhurst_smartwatch.csv')
print('part1_watch: ',realpd_1_smartwatch_test.shape)


##Part2: Wavelet Features from smartphone
realpd_2_smartphone_test=pd.read_csv('realpd_wavelet_features_smartphone_testing.csv')
print('part2_phone: ',realpd_2_smartphone_test.shape)

##Part2: Wavelet Features from smartwatch
realpd_2_smartwatch_test=pd.read_csv('realpd_wavelet_features_smartwatch_testing.csv')
print('part2_watch: ',realpd_2_smartwatch_test.shape)


##Part3: last set of features from smartphone
realpd_3_smartphone_test=pd.read_csv('realpd_comp_testing_abhiroop_lastfeatures_smartphone.csv')
print('part3_phone: ',realpd_3_smartphone_test.shape)

##Part3: last set of features from smartwatch
realpd_3_smartwatch_test=pd.read_csv('realpd_comp_testing_abhiroop_lastfeatures_smartwatch.csv')
print('part3_watch: ',realpd_3_smartwatch_test.shape)


## Combined Data

### SmartPhone==============
# Join the three smartphone feature tables on the measurement key.
realpd_smartphone_test = (
    realpd_1_smartphone_test
    .merge(realpd_2_smartphone_test, on='measurement_id')
    .merge(realpd_3_smartphone_test, on='measurement_id')
)

# Rename columns: every feature column gets a '_phone' suffix, and the
# key is re-attached unsuffixed afterwards, so it ends up as the last column.
realpd_smartphone_dummy_test = realpd_smartphone_test.copy()
ls = realpd_smartphone_test.columns.tolist()
ls.remove('measurement_id')
realpd_smartphone_test = realpd_smartphone_test[ls].add_suffix('_phone')
realpd_smartphone_test['measurement_id'] = realpd_smartphone_dummy_test['measurement_id']
print('phone: ', realpd_smartphone_test.shape)


### SmartWatch================
# Join the three smartwatch feature tables on the (measurement, device) key pair.
realpd_smartwatch_test = (
    realpd_1_smartwatch_test
    .merge(realpd_2_smartwatch_test, on=['measurement_id', 'device_id_acc'])
    .merge(realpd_3_smartwatch_test, on=['measurement_id', 'device_id_acc'])
)

# Rename columns: the first column is dropped by position and 'measurement_id'
# by name; everything that remains gets a '_watch' suffix.
# NOTE(review): the positional [1:] presumably removes 'device_id_acc' -- confirm
# that it really is the first column of the merged frame.
realpd_smartwatch_dummy_test = realpd_smartwatch_test.copy()
ls = realpd_smartwatch_test.columns.tolist()[1:]
ls.remove('measurement_id')
realpd_smartwatch_test = realpd_smartwatch_test[ls].add_suffix('_watch')

# Re-attach both key columns unsuffixed.
realpd_smartwatch_test[['measurement_id', 'device_id_acc']] = (
    realpd_smartwatch_dummy_test[['measurement_id', 'device_id_acc']]
)

# Encode the device id with an encoder fitted on realpd_smartwatch_dum,
# so the integer codes come from that frame's device ids.
realpd_smartwatch_test['device_id_acc'] = (
    preprocessing.LabelEncoder()
    .fit(realpd_smartwatch_dum['device_id_acc'])
    .transform(realpd_smartwatch_test['device_id_acc'])
)
print('watch: ', realpd_smartwatch_test.shape)


### Both SmartPhone & SmartWatch
# Measurements that have features from both devices. This join must run
# before the label joins below, while the per-device frames are still label-free.
realpd_test = realpd_smartphone_test.merge(realpd_smartwatch_test, on=['measurement_id'])
print('combined: ', realpd_test.shape)

# Attach the subject / clinical columns to each of the three test frames.
realpd_smartwatch_test = realpd_smartwatch_test.merge(label_test, on='measurement_id')
realpd_smartphone_test = realpd_smartphone_test.merge(label_test, on='measurement_id')
realpd_test = realpd_test.merge(label_test, on='measurement_id')

print('final_phone: ', realpd_smartphone_test.shape)
print('final_watch: ', realpd_smartwatch_test.shape)
print('final_combined: ', realpd_test.shape)

In [None]:
# Submission templates, one per sub-challenge
sub4_tremor = pd.read_csv('BEAT-PD_SC3_Tremor_Submission_Template.csv')
sub4_dyskinesia = pd.read_csv('BEAT-PD_SC2_Dyskinesia_Submission_Template.csv')
sub4_on_off = pd.read_csv('BEAT-PD_SC1_OnOff_Submission_Template.csv')

# Tag every test measurement with the data source its features come from.
sub_cis = data_test[['measurement_id']].copy()
sub_cis['type'] = 'cispd'

# 'type2' is a temporary marker column used only to flag measurements that
# are present in the combined (phone + watch) frame.
sub_both = realpd_test[['measurement_id']].copy()
sub_both['type2'] = 'both'

sub_watch = realpd_smartwatch_test[['measurement_id']].copy()
sub_watch['type'] = 'watch'

sub_phone = realpd_smartphone_test[['measurement_id']].copy()
sub_phone['type'] = 'phone'

# Stack all test measurements into one lookup table.
# pd.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0; the stacked result is the same.
sub = pd.concat([sub_cis, sub_watch, sub_phone], ignore_index=True)

# Any measurement found in the combined frame is relabelled 'both'. Such a
# measurement appears once from the watch frame and once from the phone frame,
# so after relabelling the rows are identical and drop_duplicates collapses them.
sub = pd.merge(sub, sub_both, on='measurement_id', how='left')
sub.loc[sub['type2'].notna(), 'type'] = 'both'
sub = sub.drop('type2', axis=1)
sub = sub.drop_duplicates()

# Add the source 'type' column to each submission template; template rows
# with no matching test measurement keep a missing 'type'.
sub4_tremor = pd.merge(sub4_tremor, sub, on='measurement_id', how='left')
sub4_dyskinesia = pd.merge(sub4_dyskinesia, sub, on='measurement_id', how='left')
sub4_on_off = pd.merge(sub4_on_off, sub, on='measurement_id', how='left')

## Prediction on test data for cispd

In [None]:
# One final prediction per target for the CIS-PD test set; each call passes
# the target-specific feature subset of `data` together with `data_test`.
cispd_tremor = catboost_finalpredicion(data[feat_tremor_cispd], data_test, 'tremor')
cispd_dyskinesia = catboost_finalpredicion(data[feat_dyskinesia_cispd], data_test, 'dyskinesia')
cispd_on_off = catboost_finalpredicion(data[feat_on_off_cispd], data_test, 'on_off')

# Collect the three predictions next to the measurement key.
cispd_pred = pd.DataFrame({
    'measurement_id': data_test['measurement_id'],
    'tremor_cis': cispd_tremor,
    'dyskinesia_cis': cispd_dyskinesia,
    'on_off_cis': cispd_on_off,
})

## Prediction on test data for realpd

In [None]:
#smartwatch=====================
# One prediction per target, using the smartwatch frames only.
realpd_tremor_watch = catboost_finalpredicion(
    label_realpd(realpd_smartwatch)[feat_tremor_watch], realpd_smartwatch_test, 'tremor'
)
realpd_dyskinesia_watch = catboost_finalpredicion(
    label_realpd(realpd_smartwatch)[feat_dyskinesia_watch], realpd_smartwatch_test, 'dyskinesia'
)
realpd_on_off_watch = catboost_finalpredicion(
    label_realpd(realpd_smartwatch)[feat_on_off_watch], realpd_smartwatch_test, 'on_off'
)

# Collect the three predictions next to the measurement key.
watch_pred = pd.DataFrame({
    'measurement_id': realpd_smartwatch_test['measurement_id'],
    'tremor_watch': realpd_tremor_watch,
    'dyskinesia_watch': realpd_dyskinesia_watch,
    'on_off_watch': realpd_on_off_watch,
})

#smartphone=======================
# One prediction per target, using the smartphone frames only.
realpd_tremor_phone = catboost_finalpredicion(
    label_realpd(realpd_smartphone)[feat_tremor_phone], realpd_smartphone_test, 'tremor'
)
realpd_dyskinesia_phone = catboost_finalpredicion(
    label_realpd(realpd_smartphone)[feat_dyskinesia_phone], realpd_smartphone_test, 'dyskinesia'
)
realpd_on_off_phone = catboost_finalpredicion(
    label_realpd(realpd_smartphone)[feat_on_off_phone], realpd_smartphone_test, 'on_off'
)

# Collect the three predictions next to the measurement key.
phone_pred = pd.DataFrame({
    'measurement_id': realpd_smartphone_test['measurement_id'],
    'tremor_phone': realpd_tremor_phone,
    'dyskinesia_phone': realpd_dyskinesia_phone,
    'on_off_phone': realpd_on_off_phone,
})

#both=======================
# One prediction per target, using the combined phone + watch frames.
realpd_tremor_both = catboost_finalpredicion(
    label_realpd(realpd)[feat_tremor_both], realpd_test, 'tremor'
)
realpd_dyskinesia_both = catboost_finalpredicion(
    label_realpd(realpd)[feat_dyskinesia_both], realpd_test, 'dyskinesia'
)
realpd_on_off_both = catboost_finalpredicion(
    label_realpd(realpd)[feat_on_off_both], realpd_test, 'on_off'
)

# Collect the three predictions next to the measurement key.
both_pred = pd.DataFrame({
    'measurement_id': realpd_test['measurement_id'],
    'tremor_both': realpd_tremor_both,
    'dyskinesia_both': realpd_dyskinesia_both,
    'on_off_both': realpd_on_off_both,
})


### tremor

In [None]:
# Combine the predictions from every data source (cispd, smartwatch,
# smartphone and both) onto the tremor submission template.
# NOTE(review): these are left joins, so a prediction frame that repeats a
# measurement_id would repeat the template row -- confirm the keys are unique.
final_tremor = (
    sub4_tremor
    .merge(cispd_pred, on='measurement_id', how='left')
    .merge(watch_pred, on='measurement_id', how='left')
    .merge(phone_pred, on='measurement_id', how='left')
    .merge(both_pred, on='measurement_id', how='left')
)

# Pick the prediction by source type: 'cispd' rows use the CIS-PD value,
# 'phone' rows the smartphone value, and every remaining row (including
# 'watch' and 'both') the smartwatch value.
final_tremor['prediction2'] = np.where(
    final_tremor['type'] == 'cispd',
    final_tremor['tremor_cis'],
    np.where(
        final_tremor['type'] == 'phone',
        final_tremor['tremor_phone'],
        final_tremor['tremor_watch'],
    ),
)

# Keep only the submission columns and floor non-positive predictions at 0.
final_tremor = (
    final_tremor[['measurement_id', 'prediction2']]
    .rename(columns={'prediction2': 'prediction'})
)
final_tremor['prediction'] = np.where(
    final_tremor['prediction'] <= 0, 0, final_tremor['prediction']
)

### dyskinesia

In [None]:
# Combine the predictions from every data source (cispd, smartwatch,
# smartphone and both) onto the dyskinesia submission template.
# NOTE(review): these are left joins, so a prediction frame that repeats a
# measurement_id would repeat the template row -- confirm the keys are unique.
final_dyskinesia = (
    sub4_dyskinesia
    .merge(cispd_pred, on='measurement_id', how='left')
    .merge(watch_pred, on='measurement_id', how='left')
    .merge(phone_pred, on='measurement_id', how='left')
    .merge(both_pred, on='measurement_id', how='left')
)

# Pick the prediction by source type: 'cispd' rows use the CIS-PD value,
# 'watch' rows the smartwatch value, and every remaining row (including
# 'phone' and 'both') the smartphone value.
final_dyskinesia['prediction2'] = np.where(
    final_dyskinesia['type'] == 'cispd',
    final_dyskinesia['dyskinesia_cis'],
    np.where(
        final_dyskinesia['type'] == 'watch',
        final_dyskinesia['dyskinesia_watch'],
        final_dyskinesia['dyskinesia_phone'],
    ),
)

# Keep only the submission columns and floor non-positive predictions at 0.
final_dyskinesia = (
    final_dyskinesia[['measurement_id', 'prediction2']]
    .rename(columns={'prediction2': 'prediction'})
)
final_dyskinesia['prediction'] = np.where(
    final_dyskinesia['prediction'] <= 0, 0, final_dyskinesia['prediction']
)

### on_off

In [None]:
## Combine the predictions from every data source (cispd, smartwatch,
## smartphone and both) onto the on_off submission template.
# NOTE(review): these are left joins, so a prediction frame that repeats a
# measurement_id would repeat the template row -- confirm the keys are unique.
final_on_off = (
    sub4_on_off
    .merge(cispd_pred, on='measurement_id', how='left')
    .merge(watch_pred, on='measurement_id', how='left')
    .merge(phone_pred, on='measurement_id', how='left')
    .merge(both_pred, on='measurement_id', how='left')
)

# Pick the prediction by source type: 'cispd' -> CIS-PD value, 'both' ->
# combined value, 'watch' -> smartwatch value, and every remaining row
# (including 'phone') -> smartphone value.
final_on_off['prediction2'] = np.where(
    final_on_off['type'] == 'cispd',
    final_on_off['on_off_cis'],
    np.where(
        final_on_off['type'] == 'both',
        final_on_off['on_off_both'],
        np.where(
            final_on_off['type'] == 'watch',
            final_on_off['on_off_watch'],
            final_on_off['on_off_phone'],
        ),
    ),
)

# Keep only the submission columns and floor non-positive predictions at 0.
final_on_off = (
    final_on_off[['measurement_id', 'prediction2']]
    .rename(columns={'prediction2': 'prediction'})
)
final_on_off['prediction'] = np.where(
    final_on_off['prediction'] <= 0, 0, final_on_off['prediction']
)

In [None]:
# Display the shapes of the three submission frames side by side.
(final_tremor.shape, final_dyskinesia.shape, final_on_off.shape)

In [None]:
# Write the predictions for the 3 sub-challenges, one CSV each,
# without the DataFrame index column.
final_tremor.to_csv('final_tremor.csv', index=False)
final_dyskinesia.to_csv('final_dyskinesia.csv', index=False)
final_on_off.to_csv('final_on_off.csv', index=False)