<a href="https://www.kaggle.com/code/averma111/pytorch-amp-prediction?scriptVersionId=128716334" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [76]:
import warnings
import pandas as pd
import numpy as np
import amp_pd_peptide_310
warnings.simplefilter(action='ignore')
import torch 
import copy

In [77]:
ROOT_PATH ='/kaggle/input/amp-parkinsons-disease-progression-prediction/'
train_proteins ='train_proteins.csv'
train_peptides = 'train_peptides.csv'
train_clinical = 'train_clinical_data.csv'

In [78]:
proteins = pd.read_csv(ROOT_PATH+train_proteins)
peptides = pd.read_csv(ROOT_PATH+train_peptides)
clinical = pd.read_csv(ROOT_PATH+train_clinical)

In [79]:
def summary(text, df):
    print(f'{text} shape: {df.shape}')
    summ = pd.DataFrame(df.dtypes, columns=['dtypes'])
    summ['null'] = df.isnull().sum()
    summ['unique'] = df.nunique()
    summ['min'] = df.min()
    summ['median'] = df.median()
    summ['max'] = df.max()
    summ['mean'] = df.mean()
    summ['std'] = df.std()
    #summ['inf'] = np.isinf(df).sum().sum()
    summ['duplicate'] = df.duplicated().sum()
    return summ

In [80]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [81]:
summary('proteins',reduce_mem_usage(proteins))

Memory usage of dataframe is 8.88 MB
Memory usage after optimization is: 2.94 MB
Decreased by 66.9%
proteins shape: (232741, 5)


Unnamed: 0,dtypes,null,unique,min,median,max,mean,std,duplicate
visit_id,category,0,1113,,,,,,0
visit_month,int8,0,15,0.0,24.0,108.0,26.0992,22.87472,0
patient_id,int32,0,248,55.0,29313.0,65043.0,32593.88,18608.48,0
UniProt,category,0,227,,,,,,0
NPX,float32,0,218795,84.6082,113556.0,613851008.0,2712077.0,22241550.0,0


In [82]:
summary('peptides',reduce_mem_usage(peptides))

Memory usage of dataframe is 44.94 MB
Memory usage after optimization is: 14.13 MB
Decreased by 68.6%
peptides shape: (981834, 6)


Unnamed: 0,dtypes,null,unique,min,median,max,mean,std,duplicate
visit_id,category,0,1113,,,,,,0
visit_month,int8,0,15,0.0,24.0,108.0,26.105061,22.9139,0
patient_id,int32,0,248,55.0,29313.0,65043.0,32603.465361,18605.93,0
UniProt,category,0,227,,,,,,0
Peptide,category,0,968,,,,,,0
PeptideAbundance,float32,0,738931,10.9985,74308.296875,178752000.0,642890.25,3377989.0,0


In [83]:
summary('clinical',reduce_mem_usage(clinical))

Memory usage of dataframe is 0.16 MB
Memory usage after optimization is: 0.12 MB
Decreased by 22.9%
clinical shape: (2615, 8)


Unnamed: 0,dtypes,null,unique,min,median,max,mean,std,duplicate
visit_id,category,0,2615,,,,,,0
patient_id,int32,0,248,55.0,29417.0,65043.0,32651.743786,18535.7587,0
visit_month,int8,0,17,0.0,24.0,108.0,31.190822,25.199053,0
updrs_1,float16,1,32,0.0,6.0,33.0,7.113281,5.527344,0
updrs_2,float16,2,36,0.0,5.0,40.0,6.746094,6.320312,0
updrs_3,float16,25,72,0.0,19.0,86.0,19.421875,15.007812,0
updrs_4,float16,1038,19,0.0,0.0,20.0,1.861328,3.021484,0
upd23b_clinical_state_on_medication,category,1327,2,,,,,,0


In [84]:
clinical['updrs_1'].fillna(clinical['updrs_1'].mean(), inplace=True)
clinical['updrs_2'].fillna(clinical['updrs_2'].mean(), inplace=True)
clinical['updrs_3'].fillna(clinical['updrs_3'].mean(), inplace=True)
clinical['updrs_4'].fillna(clinical['updrs_1'].mean(), inplace=True)

In [85]:
clinical.drop(['upd23b_clinical_state_on_medication'],axis = 1, inplace = True)

In [86]:
def check_cycle(merge_data):
    merge_data = merge_data.reset_index(drop=True)
    merge_data = merge_data.drop_duplicates()
    merge_data = merge_data.reset_index(drop=True)

    cycle = []
    same_cycle = 1

    prev_patient_id = merge_data.loc[0, 'patient_id']  
    prev_visit_month = merge_data.loc[0, 'visit_month']  

    for i, j in merge_data.iterrows():
        if((prev_patient_id == j.patient_id) and (prev_visit_month == j.visit_month)):
            cycle.append(same_cycle)
        elif((prev_patient_id == j.patient_id) and (prev_visit_month != j.visit_month)):
            same_cycle = same_cycle + 1
            cycle.append(same_cycle)
        else:
            same_cycle = 1
            cycle.append(same_cycle)

        prev_patient_id = j.patient_id
        prev_visit_month = j.visit_month

    return cycle

In [87]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=999)
scaler = StandardScaler()

In [88]:
not_required = ['O00533','O14498','O15240','O15394','O43505','O60888','P00738','P01034','P01042','P01717',
         'P02452','P02649','P02751','P02753','P02787','P04075','P04156','P04180','P04216','P05060',
         'P05067','P05155','P05408','P05452','P06396','P07195','P07225','P07602','P07711','P07858',
         'P08133','P08253','P08571','P09104','P09486','P09871','P10645','P11142','P13521','P13591',
         'P13611','P13987','P14313','P14618','P17174','P19021','P23083','P23142','P39060','P40925',
         'P43121','P49908','P54289','P55290','P61278','P61769','P61916','P98160','Q02818','Q06481',
         'Q08380','Q12907','Q13332','Q14118','Q14508','Q14515','Q15904','Q16610','Q6UXB8','Q7Z3B1',
         'Q8NBJ4','Q92520','Q92823','Q96KN2','Q99435','Q99674','Q9BY67','Q9NQ79','Q9NYU2','Q9UHG2',
         'P01594','Q13449','Q99829']

In [89]:
def data_prep(clinical, peptides, proteins, train=1):
    
    peptides = peptides[~peptides['UniProt'].isin(not_required)].reset_index()
    proteins = proteins[~proteins['UniProt'].isin(not_required)].reset_index()
    
    merge_data = clinical.merge(proteins, how='left', on=['visit_id', 'visit_month', 'patient_id'])

    #merge_data['UniProt'].fillna("other", inplace=True)
    merge_data['NPX'].fillna((merge_data['NPX'].mean()), inplace=True)

    if(train):
        merge_data[["UniProt"]] = encoder.fit_transform(merge_data[["UniProt"]])
    else :
        merge_data[["UniProt"]] = encoder.transform(merge_data[["UniProt"]])

        
    cycle = check_cycle(merge_data)
    merge_data['cycle'] = cycle
    
    if(train):
        cols_normalize = merge_data.columns.difference(['updrs_1','updrs_2','updrs_3','updrs_4','visit_id','patient_id'])
        norm_train_df = pd.DataFrame(scaler.fit_transform(merge_data[cols_normalize]), 
                             columns=cols_normalize, 
                             index=merge_data.index)
        join_df = merge_data[['updrs_1','updrs_2','updrs_3','updrs_4','visit_id','patient_id']].join(norm_train_df)
        merge_data = join_df.reindex(columns = merge_data.columns)
    else :
        merge_data['month_raw'] =  copy.deepcopy(merge_data.loc[:, 'visit_month'])
        cols_normalize = merge_data.columns.difference(['visit_id','patient_id','updrs_test','row_id','month_raw'])
        norm_train_df = pd.DataFrame(scaler.transform(merge_data[cols_normalize]), 
                             columns=cols_normalize, 
                             index=merge_data.index)
        join_df = merge_data[['visit_id','patient_id','updrs_test','row_id','month_raw']].join(norm_train_df)
        merge_data = join_df.reindex(columns = merge_data.columns)
    
    merge_data.drop(['index'],axis = 1, inplace = True)
    
    return merge_data

In [90]:
train_data =  data_prep(clinical, peptides, proteins, train=1)
train_data = reduce_mem_usage(train_data)
train_data.head()

Memory usage of dataframe is 12.20 MB
Memory usage after optimization is: 8.20 MB
Decreased by 32.8%


Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,UniProt,NPX,cycle
0,55_0,55,-1.171875,10.0,6.0,15.0,7.105469,-1.708008,-0.131836,-1.220703
1,55_0,55,-1.171875,10.0,6.0,15.0,7.105469,-1.683594,-0.130737,-1.220703
2,55_0,55,-1.171875,10.0,6.0,15.0,7.105469,-1.65918,-0.131104,-1.220703
3,55_0,55,-1.171875,10.0,6.0,15.0,7.105469,-1.635742,-0.13208,-1.220703
4,55_0,55,-1.171875,10.0,6.0,15.0,7.105469,-1.611328,-0.128662,-1.220703


In [91]:
submission = pd.read_csv("/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/sample_submission.csv")

test_proteins = pd.read_csv("/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_proteins.csv")
test_proteins.drop(['group_key'],axis = 1, inplace = True)

test_peptides = pd.read_csv("/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_peptides.csv")
test_peptides.drop(['group_key'],axis = 1, inplace = True)

test_clinical = pd.read_csv("/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test.csv")
test_clinical.drop(['group_key'],axis = 1, inplace = True)

In [92]:
test_data = data_prep(test_clinical, test_peptides, test_proteins, train=0)
test_data = reduce_mem_usage(test_data)
test_data.head()

Memory usage of dataframe is 0.12 MB
Memory usage after optimization is: 0.06 MB
Decreased by 51.1%


Unnamed: 0,visit_id,visit_month,patient_id,updrs_test,row_id,UniProt,NPX,cycle,month_raw
0,3342_0,-1.171875,3342,updrs_1,3342_0_updrs_1,,0.006779,-1.220703,0
1,3342_0,-1.171875,3342,updrs_2,3342_0_updrs_2,,0.006779,-1.220703,0
2,3342_0,-1.171875,3342,updrs_3,3342_0_updrs_3,,0.006779,-1.220703,0
3,3342_0,-1.171875,3342,updrs_4,3342_0_updrs_4,,0.006779,-1.220703,0
4,50423_0,-1.171875,50423,updrs_1,50423_0_updrs_1,-1.708008,-0.130981,-1.220703,0
