In [55]:
from pathlib import Path

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from loguru import logger
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

In [56]:
# Render floats with four decimal places in every pandas display that follows.
pd.options.display.float_format = '{:.4f}'.format

In [57]:
# Configure loguru: mirror every log record to a rotating file.
# logger.add() registers a *new* sink on each call, so re-running this cell
# would otherwise make every record appear in the file once per run. Remove the
# sink registered by the previous run first; the default stderr sink is left
# untouched.
try:
    logger.remove(log_sink_id)
except (NameError, ValueError):
    # NameError: first run on a fresh kernel, nothing to remove yet.
    # ValueError: the previous sink was already removed elsewhere.
    pass
# Keeping the returned handler id also stops the cell from echoing it as output.
log_sink_id = logger.add("../logs/diabetes_rai_analysis.log", rotation="500 MB")

4

In [58]:
# Fetch the scikit-learn diabetes dataset and pull out the feature matrix (X)
# and the target vector (y).
diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target

In [59]:
# Wrap the raw arrays in labelled pandas objects: one column per feature name
# for X, and a Series called 'target' for y.
feature_names = diabetes.feature_names
X_df = pd.DataFrame(data=X, columns=feature_names)
y_df = pd.Series(data=y, name='target')

In [60]:
# Summary statistics of the target before it is remapped to 0/1 in the next cell.
y_df.describe()

count   442.0000
mean    152.1335
std      77.0930
min      25.0000
25%      87.0000
50%     140.5000
75%     211.5000
max     346.0000
Name: target, dtype: float64

In [61]:
# Binarize the target for classification: 1 if the raw target value is at
# least TARGET_THRESHOLD, otherwise 0.
#
# The labels are derived from the raw array `y`, not from `y_df`, because this
# cell overwrites `y_df`. Thresholding `y_df` itself would silently turn every
# label into 0 if the cell were run a second time (0/1 values are never >= 100).
TARGET_THRESHOLD = 100
newy_df = np.where(y >= TARGET_THRESHOLD, 1, 0)

# Keep the result as a pandas Series under the name the later cells expect.
y_df = pd.Series(newy_df, name='target')

In [62]:
# Hold out 20% of all rows as the test set; the other 80% is kept in
# X_temp / y_temp for the train/validation split.
X_temp, X_test, y_temp, y_test = train_test_split(
    X_df,
    y_df,
    test_size=0.2,
    random_state=42,
)

In [63]:
# Carve the validation set out of the remaining 80%: taking 25% of it equals
# 20% of the full dataset (0.25 x 0.8 = 0.2), which leaves 60% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.25,
    random_state=42,
)

In [64]:
# Persist the six train/validation/test splits as Parquet files, then log a
# summary of the prepared dataset.
INTERIM_DIR = '../data/interim'

# Make sure the output folder exists so the writes do not fail on a fresh
# checkout where it is missing.
Path(INTERIM_DIR).mkdir(parents=True, exist_ok=True)

# The targets are Series; to_frame() turns each into a one-column table so it
# can be converted with pa.Table.from_pandas like the feature frames.
splits = {
    'X_train': X_train,
    'X_val': X_val,
    'X_test': X_test,
    'y_train': y_train.to_frame(),
    'y_val': y_val.to_frame(),
    'y_test': y_test.to_frame(),
}
for split_name, split_frame in splits.items():
    pq.write_table(
        pa.Table.from_pandas(split_frame),
        f'{INTERIM_DIR}/{split_name}.parquet',
    )

logger.info("Data preparation completed and saved to interim directory as Parquet files")

# Log information about the dataset
logger.info(f"Features: {feature_names}")
logger.info(f"Total number of samples: {X_df.shape[0]}")
logger.info(f"Number of training samples: {X_train.shape[0]}")
logger.info(f"Number of validation samples: {X_val.shape[0]}")
logger.info(f"Number of test samples: {X_test.shape[0]}")
logger.info(f"Number of features: {X_df.shape[1]}")
# The target saved above is the 0/1 remapping (value >= 100 -> 1), not the raw
# quantitative measure, so the log states that explicitly.
logger.info("Target variable: Binary indicator (1 if the quantitative measure of disease progression one year after baseline is >= 100, else 0)")

logger.success("Data setup complete. Ready for analysis.")


[32m2024-08-08 17:10:22.481[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mData preparation completed and saved to interim directory as Parquet files[0m
[32m2024-08-08 17:10:22.488[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mFeatures: ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'][0m
[32m2024-08-08 17:10:22.496[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m13[0m - [1mTotal number of samples: 442[0m
[32m2024-08-08 17:10:22.502[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m14[0m - [1mNumber of training samples: 264[0m
[32m2024-08-08 17:10:22.507[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m15[0m - [1mNumber of validation samples: 89[0m
[32m2024-08-08 17:10:22.519[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [1mNumber of test samples: 89[0m
[32m2024-08-08 17:10:22.537[0m | [1mINFO    [0m | [36m

In [65]:
# Column dtypes and non-null counts for the feature frame.
X_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      442 non-null    float64
 4   s1      442 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
dtypes: float64(10)
memory usage: 34.7 KB


In [66]:
# Per-feature summary statistics (count, mean, std, min, quartiles, max).
X_df.describe()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0
std,0.0476,0.0476,0.0476,0.0476,0.0476,0.0476,0.0476,0.0476,0.0476,0.0476
min,-0.1072,-0.0446,-0.0903,-0.1124,-0.1268,-0.1156,-0.1023,-0.0764,-0.1261,-0.1378
25%,-0.0373,-0.0446,-0.0342,-0.0367,-0.0342,-0.0304,-0.0351,-0.0395,-0.0332,-0.0332
50%,0.0054,-0.0446,-0.0073,-0.0057,-0.0043,-0.0038,-0.0066,-0.0026,-0.0019,-0.0011
75%,0.0381,0.0507,0.0312,0.0356,0.0284,0.0298,0.0293,0.0343,0.0324,0.0279
max,0.1107,0.0507,0.1706,0.132,0.1539,0.1988,0.1812,0.1852,0.1336,0.1356


In [67]:
# Peek at the first rows of the feature frame.
X_df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.0381,0.0507,0.0617,0.0219,-0.0442,-0.0348,-0.0434,-0.0026,0.0199,-0.0176
1,-0.0019,-0.0446,-0.0515,-0.0263,-0.0084,-0.0192,0.0744,-0.0395,-0.0683,-0.0922
2,0.0853,0.0507,0.0445,-0.0057,-0.0456,-0.0342,-0.0324,-0.0026,0.0029,-0.0259
3,-0.0891,-0.0446,-0.0116,-0.0367,0.0122,0.025,-0.036,0.0343,0.0227,-0.0094
4,0.0054,-0.0446,-0.0364,0.0219,0.0039,0.0156,0.0081,-0.0026,-0.032,-0.0466


In [68]:
# First labels of the target after the 0/1 remapping.
y_df.head()

0    1
1    0
2    1
3    1
4    1
Name: target, dtype: int64

In [69]:
# Select the rows whose 'sex' value is exactly 0. The describe() output above
# suggests this column only takes scaled values near -0.0446 and 0.0507, so the
# selection is expected to come back empty.
X_df.loc[X_df['sex'].eq(0)]

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
