# Data loading and inspection

In [27]:
%run "utilspro.py"
# Unzipping the provided dataset
with zipfile.ZipFile("data/amp-parkinsons-disease-progression-prediction_2.zip", 'r') as zip_ref:
    zip_ref.extractall("data/amp-parkinsons-disease-progression-prediction_2")

# Load each dataset into a dataframe
train_peptides = pd.read_csv("data/amp-parkinsons-disease-progression-prediction_2/train_peptides.csv")
train_proteins = pd.read_csv("data/amp-parkinsons-disease-progression-prediction_2/train_proteins.csv")
train_clinical_data = pd.read_csv("data/amp-parkinsons-disease-progression-prediction_2/train_clinical_data.csv")
supplemental_clinical_data = pd.read_csv("data/amp-parkinsons-disease-progression-prediction_2/supplemental_clinical_data.csv")


# Inspecting the first few rows of each dataset
dfs = {
    "Train Peptides": train_peptides,
    "Train Proteins": train_proteins,
    "Train Clinical Data": train_clinical_data,
    "Supplemental Clinical Data": supplemental_clinical_data
}

# Displaying the first few rows of each dataframe and setting truncation to False
pd.set_option('display.max_colwidth', None)
for name, df in dfs.items():
    print(name)
    print("/n")
    print(df.head())

Train Peptides
/n
  visit_id  visit_month  patient_id UniProt   
0     55_0            0          55  O00391  \
1     55_0            0          55  O00533   
2     55_0            0          55  O00533   
3     55_0            0          55  O00533   
4     55_0            0          55  O00533   

                                  Peptide  PeptideAbundance  
0                           NEQEQPLGQWHLS           11254.3  
1                             GNPEPTFSWTK          102060.0  
2                         IEIPSSVQQVPTIIK          174185.0  
3  KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK           27278.9  
4                            SMEQNGPGLEYR           30838.7  
Train Proteins
/n
  visit_id  visit_month  patient_id UniProt       NPX
0     55_0            0          55  O00391   11254.3
1     55_0            0          55  O00533  732430.0
2     55_0            0          55  O00584   39585.8
3     55_0            0          55  O14498   41526.9
4     55_0            0          55  O1

# Data cleaning


In [28]:
# Per-dataset Series holding the number of null entries in each column
missing_data_info = {
    label: frame.isnull().sum()
    for label, frame in dfs.items()
}

missing_data_info

{'Train Peptides': visit_id            0
 visit_month         0
 patient_id          0
 UniProt             0
 Peptide             0
 PeptideAbundance    0
 dtype: int64,
 'Train Proteins': visit_id       0
 visit_month    0
 patient_id     0
 UniProt        0
 NPX            0
 dtype: int64,
 'Train Clinical Data': visit_id                                  0
 patient_id                                0
 visit_month                               0
 updrs_1                                   1
 updrs_2                                   2
 updrs_3                                  25
 updrs_4                                1038
 upd23b_clinical_state_on_medication    1327
 dtype: int64,
 'Supplemental Clinical Data': visit_id                                  0
 patient_id                                0
 visit_month                               0
 updrs_1                                 213
 updrs_2                                 214
 updrs_3                                   5
 updrs_4

# Summary of missing values

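1. Train Peptides: No missing values.
2. Train Proteins: No missing values.
3. Train Clinical Data:
    * updrs_1: 1 missing value
    * updrs_2: 2 missing values
    * updrs_3: 25 missing values
    * updrs_4: 1038 missing values
    * upd23b_clinical_state_on_medication: 1327 missing values
4. Supplemental Clinical Data:
    * updrs_1: 213 missing values
    * updrs_2: 214 missing values
    * updrs_3: 5 missing values
    * updrs_4: 928 missing values (2223 rows, of which 1295 are non-null)
    * upd23b_clinical_state_on_medication: see the cell output above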

# Checking for duplicated entries in each dataset


In [31]:
# Number of rows flagged by duplicated() in each dataset
duplicate_data_info = {
    label: frame.duplicated().sum()
    for label, frame in dfs.items()
}

duplicate_data_info

{'Train Peptides': 0,
 'Train Proteins': 0,
 'Train Clinical Data': 0,
 'Supplemental Clinical Data': 0}

None of the datasets contains fully duplicated rows.

# Checking for datatypes

In [33]:
# Column dtypes of every dataset, keyed by its display name
data_types_info = {label: frame.dtypes for label, frame in dfs.items()}
data_types_info

{'Train Peptides': visit_id             object
 visit_month           int64
 patient_id            int64
 UniProt              object
 Peptide              object
 PeptideAbundance    float64
 dtype: object,
 'Train Proteins': visit_id        object
 visit_month      int64
 patient_id       int64
 UniProt         object
 NPX            float64
 dtype: object,
 'Train Clinical Data': visit_id                                object
 patient_id                               int64
 visit_month                              int64
 updrs_1                                float64
 updrs_2                                float64
 updrs_3                                float64
 updrs_4                                float64
 upd23b_clinical_state_on_medication     object
 dtype: object,
 'Supplemental Clinical Data': visit_id                                object
 patient_id                               int64
 visit_month                              int64
 updrs_1                                f

1. Train Peptides:
    * visit_id, UniProt, Peptide: Object (String)
    * visit_month, patient_id: Integer
    * PeptideAbundance: Float
2. Train Proteins:
    * visit_id, UniProt: Object (String)
    * visit_month, patient_id: Integer
    * NPX: Float
3. Train Clinical Data:
    * visit_id, upd23b_clinical_state_on_medication: Object (String)
    * patient_id, visit_month: Integer
    * updrs_1 to updrs_4: Float
4. Supplemental Clinical Data:
    * visit_id, upd23b_clinical_state_on_medication: Object (String)
    * patient_id, visit_month: Integer
    * updrs_1 to updrs_4: Float

All the data types seem appropriate for their respective columns: string columns are represented as objects, and numerical columns are either integers or floats, depending on the data they hold.

# Statistical overview

In [38]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of train_peptides: visit_month, patient_id and PeptideAbundance.
# NOTE(review): patient_id looks like an identifier rather than a measurement,
# so its mean/std are probably not meaningful -- confirm before interpreting.
train_peptides_stats = train_peptides.describe()
train_peptides_stats

Unnamed: 0,visit_month,patient_id,PeptideAbundance
count,981834.0,981834.0,981834.0
mean,26.105061,32603.465361,642890.2
std,22.913897,18605.934422,3377989.0
min,0.0,55.0,10.9985
25%,6.0,16566.0,28174.25
50%,24.0,29313.0,74308.3
75%,48.0,49995.0,221338.8
max,108.0,65043.0,178752000.0


In [39]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of train_proteins: visit_month, patient_id and NPX.
# NOTE(review): patient_id looks like an identifier rather than a measurement,
# so its mean/std are probably not meaningful -- confirm before interpreting.
train_proteins_stats = train_proteins.describe()
train_proteins_stats

Unnamed: 0,visit_month,patient_id,NPX
count,232741.0,232741.0,232741.0
mean,26.099205,32593.881873,2712077.0
std,22.874719,18608.479506,22241550.0
min,0.0,55.0,84.6082
25%,6.0,16566.0,29464.4
50%,24.0,29313.0,113556.0
75%,48.0,49995.0,563894.0
max,108.0,65043.0,613851000.0


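The NPX values vary widely, from about 85 to more than 6 × 10^8, with a mean (≈ 2.7 × 10^6) far above the median (≈ 1.1 × 10^5). Such a heavily right-skewed distribution might be expected given the diverse nature of protein expression levels.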


In [40]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of train_clinical_data: patient_id, visit_month and updrs_1..updrs_4.
# The count row excludes nulls, so it differs per column where values are missing.
train_clinical_data_stats = train_clinical_data.describe()
train_clinical_data_stats

Unnamed: 0,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4
count,2615.0,2615.0,2614.0,2613.0,2590.0,1577.0
mean,32651.743786,31.190822,7.110559,6.74359,19.421236,1.861763
std,18535.7587,25.199053,5.525955,6.32323,15.000289,3.022112
min,55.0,0.0,0.0,0.0,0.0,0.0
25%,16574.0,10.5,3.0,1.0,6.0,0.0
50%,29417.0,24.0,6.0,5.0,19.0,0.0
75%,50611.0,48.0,10.0,10.0,29.0,3.0
max,65043.0,108.0,33.0,40.0,86.0,20.0


The UPDRS scores start at 0 (no symptoms), with higher values indicating more severe symptoms; the observed maxima here are 33 (updrs_1), 40 (updrs_2), 86 (updrs_3) and 20 (updrs_4). It's worth noting the variation in the scores, as this will be crucial when predicting disease progression.

Lastly, let's obtain the basic statistics for the supplemental_clinical_data dataset.

In [41]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of supplemental_clinical_data: patient_id, visit_month and updrs_1..updrs_4.
# The count row excludes nulls, so it differs per column where values are missing.
supplemental_clinical_data_stats = supplemental_clinical_data.describe()
supplemental_clinical_data_stats

Unnamed: 0,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4
count,2223.0,2223.0,2010.0,2009.0,2218.0,1295.0
mean,32478.016194,12.910481,5.68408,6.507715,22.917944,0.840154
std,18637.562796,13.060532,4.366964,4.968132,12.342596,1.860247
min,35.0,0.0,0.0,0.0,0.0,0.0
25%,16703.0,0.0,2.0,2.0,14.0,0.0
50%,32915.0,6.0,5.0,5.0,22.0,0.0
75%,47243.0,24.0,8.0,10.0,31.0,0.0
max,65530.0,36.0,27.0,34.0,72.0,12.0


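Similar to train_clinical_data, the UPDRS scores in the supplemental dataset also start at 0 (no symptoms), with higher values indicating more severe symptoms. Note that the supplemental visits only extend to month 36, compared with month 108 in the training clinical data.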