<a href="https://www.kaggle.com/code/averma111/pytorch-amp-prediction?scriptVersionId=128715095" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [31]:
import warnings
import pandas as pd
import numpy as np
import amp_pd_peptide_310
warnings.simplefilter(action='ignore')
import torch 

In [32]:
# Directory holding the input CSVs. The trailing slash matters: file names are
# appended to it by plain string concatenation.
ROOT_PATH = '/kaggle/input/amp-parkinsons-disease-progression-prediction/'

# File names of the three training tables inside ROOT_PATH.
train_proteins, train_peptides, train_clinical = (
    'train_proteins.csv',
    'train_peptides.csv',
    'train_clinical_data.csv',
)

In [33]:
# Load each training table from ROOT_PATH into its own DataFrame.
proteins = pd.read_csv(f'{ROOT_PATH}{train_proteins}')
peptides = pd.read_csv(f'{ROOT_PATH}{train_peptides}')
clinical = pd.read_csv(f'{ROOT_PATH}{train_clinical}')

In [34]:
def summary(text, df):
    """Print the shape of ``df`` and return a per-column summary table.

    Parameters
    ----------
    text : str
        Label printed in front of the shape (e.g. the table name).
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``dtypes``, ``null``
        (missing-value count), ``unique`` (distinct non-null values), ``min``,
        ``median``, ``max``, ``mean``, ``std`` and ``duplicate``.

        The five statistics are computed for numeric columns only; rows for
        non-numeric columns (category, object, ...) hold NaN there.
        ``duplicate`` is the number of fully duplicated rows in ``df`` and is
        therefore the same value on every row of the summary.
    """
    print(f'{text} shape: {df.shape}')

    summ = df.dtypes.to_frame('dtypes')
    summ['null'] = df.isnull().sum()
    summ['unique'] = df.nunique()

    # numeric_only=True is passed explicitly: since pandas 2.0 the reductions
    # no longer silently skip non-numeric columns and raise TypeError on
    # e.g. unordered categoricals instead.
    for stat in ('min', 'median', 'max', 'mean', 'std'):
        summ[stat] = getattr(df, stat)(numeric_only=True)

    # Scalar count, broadcast to every row of the summary.
    summ['duplicate'] = df.duplicated().sum()
    return summ

In [35]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    Handling per column dtype:

    * ``object`` columns are converted to ``category``.
    * NumPy signed / unsigned integer columns are cast to the narrowest integer
      type of the same signedness whose range contains the column's min and max.
    * NumPy float columns are cast to the narrowest of float16 / float32 whose
      finite range contains the column's min and max.
    * Every other dtype (bool, category, datetime, pandas extension dtypes, ...)
      is left untouched, so the function can be re-run on its own output.

    A column is never widened. Float downcasting is lossy: values are rounded
    to the precision of the target type (float16 keeps roughly three
    significant decimal digits).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    # Candidate dtypes per NumPy kind code, ordered from narrowest to widest.
    candidates = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
        'f': (np.float16, np.float32, np.float64),
    }

    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Skip anything that is not a plain NumPy int/uint/float column.
        # Taking min()/max() of an unordered categorical raises, and bool or
        # datetime data must not be turned into floats.
        if not isinstance(col_type, np.dtype) or col_type.kind not in candidates:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        limits = np.finfo if col_type.kind == 'f' else np.iinfo

        for target in candidates[col_type.kind]:
            target = np.dtype(target)
            if target.itemsize >= col_type.itemsize:
                # No narrower type fits: keep the current dtype.
                break
            info = limits(target)
            # Inclusive bounds: a value equal to the type's limit still fits.
            # NaN min/max (empty or all-NaN column) compares False everywhere,
            # so such a column is left as it is.
            if info.min <= c_min and c_max <= info.max:
                df[col] = df[col].astype(target)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [36]:
# Downcast the proteins table, then display its per-column summary.
proteins = reduce_mem_usage(proteins)
summary('proteins', proteins)

Memory usage of dataframe is 8.88 MB
Memory usage after optimization is: 2.94 MB
Decreased by 66.9%
proteins shape: (232741, 5)


Unnamed: 0,dtypes,null,unique,min,median,max,mean,std,duplicate
visit_id,category,0,1113,,,,,,0
visit_month,int8,0,15,0.0,24.0,108.0,26.0992,22.87472,0
patient_id,int32,0,248,55.0,29313.0,65043.0,32593.88,18608.48,0
UniProt,category,0,227,,,,,,0
NPX,float32,0,218795,84.6082,113556.0,613851008.0,2712077.0,22241550.0,0


In [37]:
# Downcast the peptides table, then display its per-column summary.
peptides = reduce_mem_usage(peptides)
summary('peptides', peptides)

Memory usage of dataframe is 44.94 MB
Memory usage after optimization is: 14.13 MB
Decreased by 68.6%
peptides shape: (981834, 6)


Unnamed: 0,dtypes,null,unique,min,median,max,mean,std,duplicate
visit_id,category,0,1113,,,,,,0
visit_month,int8,0,15,0.0,24.0,108.0,26.105061,22.9139,0
patient_id,int32,0,248,55.0,29313.0,65043.0,32603.465361,18605.93,0
UniProt,category,0,227,,,,,,0
Peptide,category,0,968,,,,,,0
PeptideAbundance,float32,0,738931,10.9985,74308.296875,178752000.0,642890.25,3377989.0,0


In [38]:
# Downcast the clinical table, then display its per-column summary.
clinical = reduce_mem_usage(clinical)
summary('clinical', clinical)

Memory usage of dataframe is 0.16 MB
Memory usage after optimization is: 0.12 MB
Decreased by 22.9%
clinical shape: (2615, 8)


Unnamed: 0,dtypes,null,unique,min,median,max,mean,std,duplicate
visit_id,category,0,2615,,,,,,0
patient_id,int32,0,248,55.0,29417.0,65043.0,32651.743786,18535.7587,0
visit_month,int8,0,17,0.0,24.0,108.0,31.190822,25.199053,0
updrs_1,float16,1,32,0.0,6.0,33.0,7.113281,5.527344,0
updrs_2,float16,2,36,0.0,5.0,40.0,6.746094,6.320312,0
updrs_3,float16,25,72,0.0,19.0,86.0,19.421875,15.007812,0
updrs_4,float16,1038,19,0.0,0.0,20.0,1.861328,3.021484,0
upd23b_clinical_state_on_medication,category,1327,2,,,,,,0


In [39]:
# Impute missing UPDRS scores with the mean of the *same* column
# (updrs_4 was previously filled with the mean of updrs_1).
# The filled column is assigned back to the frame instead of calling
# fillna(..., inplace=True) on clinical[col]: that form is chained assignment
# and leaves `clinical` unchanged when pandas Copy-on-Write is active.
for score_col in ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']:
    clinical[score_col] = clinical[score_col].fillna(clinical[score_col].mean())

In [40]:
# Remove the medication-state column from the clinical table (in place).
clinical.drop(columns=['upd23b_clinical_state_on_medication'], inplace=True)

In [41]:
def check_cycle(merge_data):
    """Number the visits of each patient in row order.

    Exact duplicate rows are dropped first. The remaining rows are then walked
    in order and each one receives a counter that

    * restarts at 1 whenever ``patient_id`` differs from the previous row,
    * increases by 1 when ``patient_id`` is unchanged but ``visit_month``
      differs from the previous row,
    * stays the same when both are unchanged.

    Only adjacent rows are compared, so a patient whose rows are not
    contiguous starts again at 1 each time it reappears.

    Parameters
    ----------
    merge_data : pandas.DataFrame
        Frame with ``patient_id`` and ``visit_month`` columns. It is not
        modified.

    Returns
    -------
    list of int
        One counter per row of the de-duplicated frame (so the list can be
        shorter than ``merge_data`` if it contained duplicate rows). Empty
        input yields an empty list.
    """
    # Work on the parameter itself rather than on a notebook-level global, so
    # the de-duplication below actually applies to the rows being walked.
    merge_data = merge_data.drop_duplicates().reset_index(drop=True)
    if len(merge_data) == 0:
        return []

    cycle = []
    same_cycle = 1

    # Seed "previous" with the first row so that it is numbered 1.
    prev_patient_id = merge_data['patient_id'].iloc[0]
    prev_visit_month = merge_data['visit_month'].iloc[0]

    for patient_id, visit_month in zip(merge_data['patient_id'], merge_data['visit_month']):
        if patient_id != prev_patient_id:
            same_cycle = 1
        elif visit_month != prev_visit_month:
            same_cycle += 1
        cycle.append(same_cycle)

        prev_patient_id = patient_id
        prev_visit_month = visit_month

    return cycle