In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from loguru import logger
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

In [2]:
# Register a file sink for loguru; the log file is rotated once it reaches 500 MB.
logger.add(
    "../logs/diabetes_rai_analysis.log",
    rotation="500 MB",
)

1

In [3]:
# Fetch the scikit-learn diabetes dataset and pull out the feature matrix and
# the target vector.
diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target

In [4]:
# Wrap the raw arrays in pandas objects: a DataFrame with one named column per
# feature, and a Series called 'target' for the response.
feature_names = diabetes.feature_names
X_df = pd.DataFrame(data=X, columns=feature_names)
y_df = pd.Series(data=y, name="target")

In [5]:
# Hold out 20% of the full dataset as the test set; the remaining 80% is split
# again afterwards. A fixed random_state keeps the split reproducible.
X_temp, X_test, y_temp, y_test = train_test_split(
    X_df,
    y_df,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Split the remaining 80% into train and validation sets. test_size=0.25 keeps
# 75% of the remainder for training and 25% for validation, i.e.
# 0.75 x 0.8 = 0.6 and 0.25 x 0.8 = 0.2 of the full dataset (60/20/20 overall).
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.25,
    random_state=42,
)

In [7]:
# Save the data as Parquet files
# Make sure the output directory exists before writing, so the cell also works
# on a fresh checkout where ../data/interim has not been created yet.
interim_dir = Path("../data/interim")
interim_dir.mkdir(parents=True, exist_ok=True)

# Map each output file stem to the frame stored under it. The targets are
# Series, so they are converted to single-column frames before being turned
# into Arrow tables.
splits = {
    "X_train": X_train,
    "X_val": X_val,
    "X_test": X_test,
    "y_train": y_train.to_frame(),
    "y_val": y_val.to_frame(),
    "y_test": y_test.to_frame(),
}
for stem, frame in splits.items():
    pq.write_table(pa.Table.from_pandas(frame), str(interim_dir / f"{stem}.parquet"))

logger.info("Data preparation completed and saved to interim directory as Parquet files")

# Log information about the dataset
logger.info(f"Features: {feature_names}")
logger.info(f"Total number of samples: {X_df.shape[0]}")
logger.info(f"Number of training samples: {X_train.shape[0]}")
logger.info(f"Number of validation samples: {X_val.shape[0]}")
logger.info(f"Number of test samples: {X_test.shape[0]}")
logger.info(f"Number of features: {X_df.shape[1]}")
logger.info("Target variable: Quantitative measure of disease progression one year after baseline")

logger.success("Data setup complete. Ready for analysis.")


[32m2024-07-19 12:07:56.368[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mData preparation completed and saved to interim directory as Parquet files[0m
[32m2024-07-19 12:07:56.373[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mFeatures: ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'][0m
[32m2024-07-19 12:07:56.377[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m13[0m - [1mTotal number of samples: 442[0m
[32m2024-07-19 12:07:56.380[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m14[0m - [1mNumber of training samples: 264[0m
[32m2024-07-19 12:07:56.385[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m15[0m - [1mNumber of validation samples: 89[0m
[32m2024-07-19 12:07:56.389[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [1mNumber of test samples: 89[0m
[32m2024-07-19 12:07:56.393[0m | [1mINFO    [0m | [36m

In [8]:
# Print a summary of the feature frame: column names, non-null counts and dtypes.
X_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      442 non-null    float64
 4   s1      442 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
dtypes: float64(10)
memory usage: 34.7 KB


In [9]:
# Preview the first five rows of the feature frame.
X_df.head(5)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641


In [10]:
# Preview the first five target values.
y_df.head(5)

0    151.0
1     75.0
2    141.0
3    206.0
4    135.0
Name: target, dtype: float64