In [1]:
import pandas as pd
df=pd.read_csv("sample_diabetes_mellitus_data.csv",index_col=0)



In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split

class DataLoader:
    def __init__(self, file_path):
        self.file_path = file_path

    def load_data(self):
        """
        加载数据，返回DataFrame。
        """
        data = pd.read_csv(self.file_path, index_col=0)
        return data

    def split_data(self, data):
        """
        将数据分为训练集和测试集，默认使用 80% 训练集和 20% 测试集。
        """
        train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
        return train_data, test_data


In [3]:
class DropNaPreprocessor:
    def __init__(self, columns):
        self.columns = columns

    def process(self, data):
        return data.dropna(subset=self.columns)


In [4]:
class FillNaPreprocessor:
    def __init__(self, columns):
        self.columns = columns

    def process(self, data):
        for column in self.columns:
            data[column].fillna(data[column].mean(), inplace=True)
        return data

In [5]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

class DataTransformer:
    def __init__(self, data):
        """
        初始化 DataTransformer 类，自动区分分类和数值特征
        :param data: pandas DataFrame, 要处理的数据集
        """
        self.data = data
        
        self.categorical_features = self.data.select_dtypes(include=['object']).columns.tolist()
        self.numerical_features = self.data.select_dtypes(include=['number']).columns.tolist()

        self.preprocessor = ColumnTransformer(
            transformers=[
                ('cat', OneHotEncoder(), self.categorical_features),  # 对分类特征进行独热编码
                ('num', 'passthrough', self.numerical_features)       # 保持数值特征不变
            ]
        )

    def transform(self):

        transformed_data = self.preprocessor.fit_transform(self.data)
        
        cat_feature_names = self.preprocessor.named_transformers_['cat'].get_feature_names_out(self.categorical_features)
    

        all_feature_names = list(cat_feature_names) + self.numerical_features

        df_transformed = pd.DataFrame(transformed_data, columns=all_feature_names)

        return df_transformed



In [6]:
from sklearn.ensemble import RandomForestClassifier

class Model:
    def __init__(self, model=RandomForestClassifier, model_params=None):
        self.model_class = model
        self.model_params = model_params if model_params else {}
        self.model = self.model_class(**self.model_params)
    
    def train(self, X_train, y_train):
        """
        Train the model with feature matrix X_train and target vector y_train.
        """
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        """
        Predict probabilities using the trained model with feature matrix X_test.
        """
        return self.model.predict_proba(X_test)


In [15]:
data_loader = DataLoader('sample_diabetes_mellitus_data.csv')
data=data_loader.load_data()

In [17]:
drop_na_preprocessor = DropNaPreprocessor(columns=['age', 'gender', 'ethnicity'])
data = drop_na_preprocessor.process(data)

In [18]:
fill_na_preprocessor = FillNaPreprocessor(columns=['height', 'weight'])
data = fill_na_preprocessor.process(data)

In [19]:
dataTransformer_train=DataTransformer(data)
data=dataTransformer_train.transform()

In [20]:
data=data.dropna()

In [22]:
train_data,test_data=data_loader.split_data(data)
X_train=train_data.iloc[:,:-1]
y_train=train_data.iloc[:,-1]

In [23]:
model=Model( model=RandomForestClassifier, model_params=None)
model.train(X_train,y_train)

In [27]:
X_test=test_data.iloc[:,:-1]
y_test=test_data.iloc[:,-1]

In [32]:
y_test

8417    0.0
7784    0.0
9151    0.0
8482    0.0
7714    0.0
8810    0.0
7745    1.0
9259    0.0
8391    0.0
9246    0.0
8899    1.0
8553    0.0
7859    0.0
8986    0.0
8393    0.0
7974    0.0
7836    0.0
9108    0.0
8200    0.0
7806    0.0
9109    0.0
8442    0.0
8574    0.0
9351    0.0
7759    0.0
9241    0.0
8813    0.0
8366    1.0
8148    0.0
7846    0.0
8464    0.0
8928    0.0
9352    0.0
7802    0.0
8103    0.0
8031    0.0
8621    0.0
9105    0.0
8399    0.0
8132    0.0
8179    0.0
7933    0.0
7792    0.0
8273    0.0
8933    0.0
8792    0.0
7895    0.0
8494    0.0
Name: diabetes_mellitus, dtype: float64

In [35]:
from sklearn.metrics import roc_auc_score
y_pre_pro= model.predict(X_test)[:, 1]
roc_auc=roc_auc_score(y_test,y_pre_pro)
roc_auc

0.6