In [8]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
# Load the Parkinson's dataset
data = pd.read_csv('D:\data science\parkinsons.csv')

In [10]:
# Get a glimpse of the data
# View the first few rows
print(data.head())

   subject#  age  sex  test_time  total_UPDRS  Jitter(%)  Jitter(Abs)  \
0         1   72    0     5.6431       34.398    0.00662     0.000034   
1         1   72    0     5.6431       34.398    0.00662     0.000034   
2         1   72    0    12.6660       34.894    0.00300     0.000017   
3         1   72    0    19.6810       35.389    0.00481     0.000025   
4         1   72    0    25.6470       35.810    0.00528     0.000027   

   Jitter:RAP  Jitter:PPQ5  Jitter:DDP  ...  Shimmer(dB)  Shimmer:APQ3  \
0     0.00401      0.00317     0.01204  ...        0.230       0.01438   
1     0.00401      0.00317     0.01204  ...        0.230       0.01438   
2         NaN      0.00150     0.00395  ...        0.179       0.00994   
3     0.00205      0.00208     0.00616  ...        0.181       0.00734   
4     0.00191          NaN     0.00573  ...        0.327       0.01106   

   Shimmer:APQ5  Shimmer:APQ11  Shimmer:DDA       NHR     HNR     RPDE  \
0       0.01309        0.01662      0.0431

In [11]:
# Check data types and missing values
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5883 entries, 0 to 5882
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   subject#       5883 non-null   int64  
 1   age            5883 non-null   int64  
 2   sex            5883 non-null   int64  
 3   test_time      5883 non-null   float64
 4   total_UPDRS    5883 non-null   float64
 5   Jitter(%)      5880 non-null   float64
 6   Jitter(Abs)    5881 non-null   float64
 7   Jitter:RAP     5880 non-null   float64
 8   Jitter:PPQ5    5875 non-null   float64
 9   Jitter:DDP     5876 non-null   float64
 10  Shimmer        5865 non-null   float64
 11  Shimmer(dB)    5877 non-null   float64
 12  Shimmer:APQ3   5877 non-null   float64
 13  Shimmer:APQ5   5871 non-null   float64
 14  Shimmer:APQ11  5878 non-null   float64
 15  Shimmer:DDA    5877 non-null   float64
 16  NHR            5877 non-null   float64
 17  HNR            5882 non-null   float64
 18  RPDE    

In [12]:
# Get summary statistics
print(data.describe())

          subject#          age          sex    test_time  total_UPDRS  \
count  5883.000000  5883.000000  5883.000000  5883.000000  5883.000000   
mean     21.498555    64.799932     0.317865    92.857197    29.019748   
std      12.371906     8.822473     0.465686    53.431137    10.697081   
min       1.000000    36.000000     0.000000    -4.262500     7.000000   
25%      10.000000    58.000000     0.000000    46.847500    21.371000   
50%      22.000000    65.000000     0.000000    91.524000    27.576000   
75%      33.000000    72.000000     1.000000   138.430000    36.399000   
max      42.000000    85.000000     1.000000   215.490000    54.992000   

         Jitter(%)  Jitter(Abs)   Jitter:RAP  Jitter:PPQ5   Jitter:DDP  ...  \
count  5880.000000  5881.000000  5880.000000  5875.000000  5876.000000  ...   
mean      0.006153     0.000044     0.002986     0.003275     0.008940  ...   
std       0.005622     0.000036     0.003122     0.003731     0.009376  ...   
min       0.00083

In [13]:
# ---- Handle missing values ----
# Identify missing values
missing_values = data.isnull().sum()
print(missing_values)

subject#          0
age               0
sex               0
test_time         0
total_UPDRS       0
Jitter(%)         3
Jitter(Abs)       2
Jitter:RAP        3
Jitter:PPQ5       8
Jitter:DDP        7
Shimmer          18
Shimmer(dB)       6
Shimmer:APQ3      6
Shimmer:APQ5     12
Shimmer:APQ11     5
Shimmer:DDA       6
NHR               6
HNR               1
RPDE              4
DFA               2
PPE               2
dtype: int64


In [14]:
# Impute missing values in numerical features
numerical_features = data.select_dtypes(include=['int64', 'float64'])
for column in numerical_features:
    median = data[column].median()
    data[column].fillna(median, inplace=True)

In [15]:
# Impute missing values in categorical features
categorical_features = data.select_dtypes(include=['object'])
for column in categorical_features:
    mode = data[column].mode()[0]
    data[column].fillna(mode, inplace=True)

In [16]:
# ---- Remove irrelevant attributes ----
# Drop patient IDs and timestamps (adjust column names as needed)
data.drop(['patient_id', 'timestamp'], axis=1, inplace=True)

KeyError: "['patient_id', 'timestamp'] not found in axis"

In [17]:
# ---- Create new features ----
# Derive day of the week from timestamps (assuming a timestamp column exists)
if 'timestamp' in data.columns:
    data['day_of_week'] = pd.to_datetime(data['timestamp']).dt.dayofweek

In [18]:
# List of irrelevant attributes to be removed
irrelevant_columns = ['patient_id', 'timestamp']  # Replace with actual irrelevant columns

In [19]:
# Remove the irrelevant attributes from the dataset
data = data.drop(columns=irrelevant_columns)

KeyError: "['patient_id', 'timestamp'] not found in axis"

In [22]:
# Step 1: Remove irrelevant attributes
irrelevant_columns = ['subject#', 'test_time']  # Replace with actual irrelevant columns
data = data.drop(columns=irrelevant_columns)

In [24]:
# Step 2: Impute missing values
# Impute missing values in numerical features using mean or median
numerical_columns = ['age', 'total_UPDRS']  # Replace with actual numerical columns
imputer = SimpleImputer(strategy='median')  # Use median for imputation
data[numerical_columns] = imputer.fit_transform(data[numerical_columns])

In [25]:
# Impute missing values in categorical features using mode
categorical_columns = ['sex']  # Replace with actual categorical columns
imputer = SimpleImputer(strategy='most_frequent')  # Use mode for imputation
data[categorical_columns] = imputer.fit_transform(data[categorical_columns])

In [28]:
# Example: Combining features
data['age_gender'] = data['age'] * (data['sex'] == 'Male') 
# Example combination of age and gender

In [30]:
# Split the data into features and target variable
X = data.drop(columns=['age_gender']) 
# Replace 'target_column' with the actual target column
y = data['age_gender']

In [31]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
# Step 2: Build Baseline Model
baseline_model = RandomForestRegressor(random_state=42)
baseline_model.fit(X_train, y_train)

In [33]:
# Step 3: Evaluate Baseline Model
baseline_predictions = baseline_model.predict(X_test)
baseline_mse = mean_squared_error(y_test, baseline_predictions)
print("Baseline Mean Squared Error:", baseline_mse)

Baseline Mean Squared Error: 0.0
