# Data preprocessing

In [309]:
# Execute utilspro.py inside the notebook's interactive namespace (IPython %run).
# NOTE(review): the cells below use `pd`, the raw train/supplemental tables and helpers such as
# `interpolate_updrs` without defining them - presumably they all come from this script; confirm.
%run utilspro.py

In [372]:
# Stack the main and the supplemental clinical tables row-wise.
combined_clinical_and_suppl = pd.concat([train_clinical_data, supplemental_clinical_data], axis=0)

# Columns used to detect duplicate clinical rows: every column except the
# UPDRS scores and the medication state.
columns_to_consider = [
    col
    for col in combined_clinical_and_suppl.columns
    if not col.startswith('updrs') and col != 'upd23b_clinical_state_on_medication'
]

# Keep the first clinical row for each distinct combination of those columns.
unique_df = combined_clinical_and_suppl.drop_duplicates(subset=columns_to_consider)

# Attach the clinical columns to every peptide row, then add the protein-level
# NPX value for the matching (visit, UniProt) pair. Both are left joins, so
# every peptide row is kept.
enriched_peptide_data = pd.merge(train_peptides, unique_df, on=['visit_id', 'visit_month', 'patient_id'], how='left')
enriched_protein_data = pd.merge(enriched_peptide_data, train_proteins, on=['visit_id', 'visit_month', 'patient_id', 'UniProt'], how='left')

# Keep only the rows that have a updrs_3 value.
# .copy() gives an independent frame, so the .loc assignment below modifies
# this frame itself rather than a slice of the merged one (this avoids
# pandas' SettingWithCopyWarning).
enriched_protein_data = enriched_protein_data[enriched_protein_data['updrs_3'].notnull()].copy()

# Rows from visit months 0-3 with no recorded medication state are set to 'Off'.
early_visit = enriched_protein_data['visit_month'].isin([0, 1, 2, 3])
state_missing = enriched_protein_data['upd23b_clinical_state_on_medication'].isnull()
enriched_protein_data.loc[early_visit & state_missing, 'upd23b_clinical_state_on_medication'] = 'Off'

In [373]:
# Per-column count of missing values in the merged frame.
enriched_protein_data.isna().sum()

visit_id                                    0
visit_month                                 0
patient_id                                  0
UniProt                                     0
Peptide                                     0
PeptideAbundance                            0
updrs_1                                     0
updrs_2                                     0
updrs_3                                     0
updrs_4                                441672
upd23b_clinical_state_on_medication    396104
NPX                                         0
dtype: int64

In [375]:
# Fill missing medication states from the same patient's preceding rows.
# Order the rows by patient and visit month first, so that "preceding row"
# means "same or earlier visit of the same patient".
clinical_data_sorted = enriched_protein_data.sort_values(by=['patient_id', 'visit_month'])

# Forward-fill within each patient_id group. GroupBy.ffill() is the supported
# spelling; groupby(...).fillna(method='ffill') is deprecated in pandas >= 2.1.
clinical_data_sorted['upd23b_clinical_state_on_medication'] = (
    clinical_data_sorted.groupby('patient_id')['upd23b_clinical_state_on_medication'].ffill()
)

# Number of medication-state values still missing after the forward fill.
remaining_nulls = clinical_data_sorted['upd23b_clinical_state_on_medication'].isnull().sum()
remaining_nulls

84711

In [376]:
# Back-fill what the forward fill could not reach (leading gaps), again within
# each patient_id group. GroupBy.bfill() replaces the deprecated
# groupby(...).fillna(method='bfill').
clinical_data_sorted['upd23b_clinical_state_on_medication'] = (
    clinical_data_sorted.groupby('patient_id')['upd23b_clinical_state_on_medication'].bfill()
)

# Number of medication-state values still missing after the backward fill.
remaining_nulls_after_bfill = clinical_data_sorted['upd23b_clinical_state_on_medication'].isnull().sum()
remaining_nulls_after_bfill


63683

In [317]:
# For every patient that still has a missing medication state, report whether
# all of that patient's rows are missing or only some of them.
state_missing = clinical_data_sorted['upd23b_clinical_state_on_medication'].isnull()

# Single grouped pass over the frame: True when every row of the patient is
# null. This replaces re-filtering the whole frame several times per patient.
all_rows_missing = state_missing.groupby(clinical_data_sorted['patient_id']).all()

patients_with_nulls = clinical_data_sorted.loc[state_missing, 'patient_id'].unique()
for patient in patients_with_nulls:
    if all_rows_missing.loc[patient]:
        print("Patient {} has null values for all records".format(patient))
    else:
        print("Patient {} has null values for some records".format(patient))

# Total number of rows whose medication state is still missing.
state_missing.sum()

Patient 942 has null values for all records
Patient 5036 has null values for all records
Patient 6420 has null values for all records
Patient 7151 has null values for all records
Patient 13360 has null values for all records
Patient 17201 has null values for all records
Patient 20581 has null values for all records
Patient 20707 has null values for all records
Patient 20792 has null values for all records
Patient 23175 has null values for all records
Patient 24818 has null values for all records
Patient 27987 has null values for all records
Patient 31270 has null values for all records
Patient 33108 has null values for all records
Patient 47171 has null values for all records
Patient 51708 has null values for all records
Patient 52119 has null values for all records
Patient 57468 has null values for all records
Patient 58674 has null values for all records
Patient 60788 has null values for all records


63683

In [377]:
# Any medication state that is still missing at this point is set to 'Off'.
clinical_data_sorted.loc[clinical_data_sorted['upd23b_clinical_state_on_medication'].isnull(), 'upd23b_clinical_state_on_medication'] = 'Off'

# Run interpolate_updrs once per patient.
# NOTE(review): interpolate_updrs is not defined in the visible cells -
# presumably it comes from utilspro.py; confirm what it returns.
clinical_data_interpolated = clinical_data_sorted.groupby('patient_id').apply(interpolate_updrs)

# Per-column count of UPDRS values still missing after interpolation
# (stored for inspection; this cell does not display it).
remaining_nulls_updrs_after_interpolation = clinical_data_interpolated[['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']].isnull().sum()

# Back-fill updrs_4 within each patient_id group. GroupBy.bfill() replaces the
# deprecated groupby(...).fillna(method='bfill').
clinical_data_interpolated['updrs_4'] = clinical_data_interpolated.groupby('patient_id')['updrs_4'].bfill()

In [378]:
# Per-column count of missing values after interpolation and the updrs_4 back-fill.
clinical_data_interpolated.isna().sum()

visit_id                                    0
visit_month                                 0
patient_id                                  0
UniProt                                     0
Peptide                                     0
PeptideAbundance                            0
updrs_1                                     0
updrs_2                                     0
updrs_3                                     0
updrs_4                                244966
upd23b_clinical_state_on_medication         0
NPX                                         0
dtype: int64

In [379]:
# Forward-fill the remaining updrs_4 gaps within each patient_id group.
# GroupBy.ffill() replaces the deprecated groupby(...).fillna(method='ffill').
clinical_data_interpolated['updrs_4'] = clinical_data_interpolated.groupby('patient_id')['updrs_4'].ffill()

# Whatever is still missing (no value was available from either fill
# direction) gets the median of the updrs_4 column.
updrs_4_median = clinical_data_interpolated['updrs_4'].median()
clinical_data_interpolated['updrs_4'] = clinical_data_interpolated['updrs_4'].fillna(updrs_4_median)

# Per-column count of values that are still missing.
remaining_nulls = clinical_data_interpolated.isnull().sum()
remaining_nulls

visit_id                               0
visit_month                            0
patient_id                             0
UniProt                                0
Peptide                                0
PeptideAbundance                       0
updrs_1                                0
updrs_2                                0
updrs_3                                0
updrs_4                                0
upd23b_clinical_state_on_medication    0
NPX                                    0
dtype: int64

In [382]:
# Order rows by patient and visit month before deriving per-patient features.
clinical_data_sorted_2 = clinical_data_interpolated.sort_values(by=['patient_id', 'visit_month'])

# 'time_since_diagnosis' = visit month minus the patient's earliest visit month
# present in this frame.
first_visit_month = clinical_data_sorted_2.groupby('patient_id')['visit_month'].transform('min')
clinical_data_sorted_2['time_since_diagnosis'] = clinical_data_sorted_2['visit_month'] - first_visit_month

# Encode the 'UniProt' column with the frequency_encode helper.
# NOTE(review): the helper is not defined in the visible cells; the later
# feature lists use a 'UniProt_encoded' column, presumably created here - confirm.
clinical_data_sorted_2 = frequency_encode(clinical_data_sorted_2, 'UniProt')

# Numeric medication flag: On -> 1, Off -> 0.
medication_codes = {'On': 1, 'Off': 0}
clinical_data_sorted_2['medication_numeric'] = clinical_data_sorted_2['upd23b_clinical_state_on_medication'].map(medication_codes)

# Column summary of the engineered frame.
optimized_info_df_v5(clinical_data_sorted_2)

Unnamed: 0,Column,Dtype,Null Count,Unique Count,% Missing,1st Mode,2nd Mode,3rd Mode
0,visit_id,object,0,1058,0.0,16574_12,35477_48,35477_60
1,visit_month,int64,0,15,0.0,0,12,24
2,patient_id,int64,0,248,0.0,23391,62723,26210
3,UniProt,object,0,227,0.0,P02787,P02768,P01024
4,Peptide,object,0,968,0.0,AYQGVAAPFPK,YKAAFTEC(UniMod_4)C(UniMod_4)QAADK,TLLSNLEEAK
5,PeptideAbundance,float64,0,710694,0.0,109591.0,127056.0,144004.0
6,updrs_1,float64,0,30,0.0,1.0,2.0,5.0
7,updrs_2,float64,0,29,0.0,0.0,2.0,3.0
8,updrs_3,float64,0,64,0.0,0.0,1.0,2.0
9,updrs_4,float64,0,17,0.0,0.0,4.0,3.0


In [389]:
# Run utilspro.py a second time, reloading its definitions into the notebook namespace.
# NOTE(review): presumably needed because the script was edited after the first %run - confirm,
# and consider a single %run at the top so the notebook works under Restart & Run All.
%run utilspro.py


In [391]:
# Feature selection: hand the candidate features to iterative_modeling and
# print the metrics of every feature subset it reports.
# NOTE(review): iterative_modeling is not defined in the visible cells;
# presumably it fits models on growing subsets with updrs_3 as target - confirm.
all_features = ['updrs_1', 'updrs_2', 'updrs_4', 'time_since_diagnosis', 'medication_numeric', 'UniProt_encoded' ,'NPX', 'PeptideAbundance']
results = iterative_modeling(clinical_data_sorted_2, all_features)

# One report block per evaluated feature subset.
for run in results:
    print(
        f"Features: {run['features']}",
        f"Mean Squared Error: {run['mse']}",
        f"Mean Absolute Error: {run['mae']}",
        f"R-squared: {run['r2']}",
        "-------------------------",
        sep="\n",
    )

Features: ['medication_numeric', 'updrs_1']
Mean Squared Error: 178.75597666583218
Mean Absolute Error: 10.967519273658393
R-squared: 0.20668183021479847
-------------------------
Features: ['medication_numeric', 'updrs_1', 'updrs_2']
Mean Squared Error: 91.30949819071914
Mean Absolute Error: 7.596853945179897
R-squared: 0.5947688835932927
-------------------------
Features: ['medication_numeric', 'updrs_1', 'updrs_2', 'updrs_4']
Mean Squared Error: 91.27749383911883
Mean Absolute Error: 7.601382807390646
R-squared: 0.5949109187526775
-------------------------
Features: ['medication_numeric', 'updrs_1', 'updrs_2', 'updrs_4', 'time_since_diagnosis']
Mean Squared Error: 91.05072653281412
Mean Absolute Error: 7.6187067096844645
R-squared: 0.5959173109738511
-------------------------
Features: ['medication_numeric', 'updrs_1', 'updrs_2', 'updrs_4', 'time_since_diagnosis', 'UniProt_encoded']
Mean Squared Error: 91.05074634796644
Mean Absolute Error: 7.6187083294821925
R-squared: 0.595917223

## 3. Encoding, Feature Scaling, and Splitting the Data

In [393]:
# Model inputs and the prediction target.
selected_features = ['updrs_1', 'updrs_2', 'updrs_4', 'time_since_diagnosis', 'medication_numeric', 'UniProt_encoded', 'NPX']
target = 'updrs_3'

X, y = clinical_data_sorted_2[selected_features], clinical_data_sorted_2[target]

# Random 80/20 row-level split with a fixed seed.
# NOTE(review): the frame holds one row per peptide per visit, so rows from the
# same visit (same UPDRS values) can land in both train and test. That looks
# like target leakage - consider splitting by patient_id or visit_id instead.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features: statistics are learned on the training set only
# and then applied to both sets.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)
X_train_scaled.shape, X_test_scaled.shape

((746099, 7), (186525, 7))

The features have been encoded, split 80/20 into training and test sets, and standardized (the scaler is fit on the training set only and then applied to both sets). The training set contains 746099 samples, while the test set contains 186525 samples. Each sample has 7 features: 'updrs_1', 'updrs_2', 'updrs_4', 'time_since_diagnosis', 'medication_numeric', 'UniProt_encoded', 'NPX'.