In [None]:
import numpy as np
import pandas as pd
from matplotlib.lines import lineStyles
from sympy.physics.quantum.gate import normalized
from sympy.printing.pytorch import torch
from sklearn.model_selection import train_test_split

In [None]:
# Read the training CSV into a DataFrame and preview its first rows.
train_df = pd.read_csv('./data/input/train.csv')
train_df.head()

In [None]:
# Read the test CSV into a DataFrame and preview its first rows.
test_df = pd.read_csv('./data/input/test.csv')
test_df.head()

In [None]:
# Stack the train rows on top of the test rows so that every feature transform
# below is applied to both sets in one pass.
# ignore_index=True gives the combined frame a fresh, unique 0..N-1 index.
# Without it each half keeps its own row labels, so labels are duplicated in
# the combined frame and any later label-based alignment becomes ambiguous.
full_df = pd.concat([train_df, test_df], ignore_index=True)
full_df.head()

In [None]:
# Remove the 'id' column from the combined frame, then preview the result.
full_df = full_df.drop(columns=['id'])
full_df.head()

## 对性别特征项的处理

性别特征项有 'Male'、'Female'、'Other' 三个取值。

In [None]:
# Bar chart of how often each gender value occurs, most frequent first.
gender_counts = full_df['gender'].value_counts().sort_values(ascending=False)
gender_counts.plot.bar()

In [None]:
# Share of each gender value, as a percentage of all rows in full_df.
total_rows = full_df.shape[0]
{
    label: (full_df['gender'] == label).sum() / total_rows * 100
    for label in ('Male', 'Female', 'Other')
}

In [None]:
# Print a per-column summary of the combined frame (dtypes and non-null counts).
full_df.info()

In [None]:
# One-hot encode 'gender'. drop_first=True omits the first category level, so
# k categories produce k-1 indicator columns prefixed with 'gender_'.
gender_onehot = pd.get_dummies(
    full_df['gender'],
    prefix='gender',
    drop_first=True,
)
# Append the indicator columns next to the existing ones.
full_df = pd.concat([full_df, gender_onehot], axis='columns')
full_df.head()

## 对种族特征项的处理

种族属于无序的类别型特征，这里同样使用独热编码（one-hot）进行处理。

In [None]:
# One-hot encode 'ethnicity' into 'ethn_*' indicator columns, omitting the
# first category level (drop_first=True).
ethn_onehot = pd.get_dummies(
    full_df['ethnicity'],
    prefix='ethn',
    drop_first=True,
)
# Append the indicator columns next to the existing ones.
full_df = pd.concat([full_df, ethn_onehot], axis='columns')
full_df.head()

In [None]:
# One-hot encode 'education_level' into 'edu_*' indicator columns, omitting
# the first category level (drop_first=True).
edu_onehot = pd.get_dummies(
    full_df['education_level'],
    prefix='edu',
    drop_first=True,
)
# Append the indicator columns next to the existing ones.
full_df = pd.concat([full_df, edu_onehot], axis='columns')
full_df.head()

In [None]:
# One-hot encode 'income_level' into 'income_*' indicator columns, omitting
# the first category level (drop_first=True).
income_onehot = pd.get_dummies(
    full_df['income_level'],
    prefix='income',
    drop_first=True,
)
# Append the indicator columns next to the existing ones.
full_df = pd.concat([full_df, income_onehot], axis='columns')
full_df.head()

In [None]:
# One-hot encode 'smoking_status' into 'smoke_*' indicator columns, omitting
# the first category level (drop_first=True).
smoke_onehot = pd.get_dummies(
    full_df['smoking_status'],
    prefix='smoke',
    drop_first=True,
)
# Append the indicator columns next to the existing ones.
full_df = pd.concat([full_df, smoke_onehot], axis='columns')
full_df.head()

In [None]:
# One-hot encode 'employment_status' into 'employ_*' indicator columns,
# omitting the first category level (drop_first=True).
employ_onehot = pd.get_dummies(
    full_df['employment_status'],
    prefix='employ',
    drop_first=True,
)
# Append the indicator columns next to the existing ones.
full_df = pd.concat([full_df, employ_onehot], axis='columns')
full_df.head()

In [None]:
# The raw categorical columns have been one-hot encoded in the cells above,
# so the original string-valued versions are removed here.
encoded_source_columns = [
    'gender',
    'ethnicity',
    'education_level',
    'income_level',
    'smoking_status',
    'employment_status',
]
full_df = full_df.drop(columns=encoded_source_columns)
full_df.head()

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Numeric columns that are standardised with StandardScaler.
normalized_column = [
    'age',
    'alcohol_consumption_per_week',
    'physical_activity_minutes_per_week',
    'diet_score',
    'sleep_hours_per_day',
    'screen_time_hours_per_day',
    'bmi',
    'waist_to_hip_ratio',
    'systolic_bp',
    'diastolic_bp',
    'heart_rate',
    'cholesterol_total',
    'hdl_cholesterol',
    'ldl_cholesterol',
    'triglycerides',
]

scaler = StandardScaler()
# Fit on the selected columns of the combined frame, then transform the same
# columns; the result is a NumPy array.
scaler.fit(full_df[normalized_column])
full_normalized = scaler.transform(full_df[normalized_column])
full_normalized

In [None]:
# Scaled values come back from the scaler as a plain NumPy array.
scaled_values = scaler.transform(full_df[normalized_column])

# Wrap them in a DataFrame that reuses the original column names and row
# index, so the values line up with full_df when they are assigned back.
full_normalized = pd.DataFrame(
    data=scaled_values,
    index=full_df.index,
    columns=normalized_column,
)
full_normalized

In [None]:
# Overwrite the listed numeric columns of full_df with their scaled values.
# The assignment aligns on the row index, which full_normalized shares with full_df.
full_df[normalized_column] = full_normalized
full_df

In [None]:
# Show the dtype of every column before the frame is converted to tensors.
full_df.dtypes

In [None]:
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader, TensorDataset

target = 'diagnosed_diabetes'
epochs = 1000  # NOTE(review): not read by any visible cell; the training loop uses num_epochs.

# The first n_train rows of full_df are the training rows, the rest the test rows.
n_train = train_df.shape[0]

# The feature matrix must not contain the label. Building X from every column
# of full_df (as this cell did before) puts `target` itself among the model
# inputs for the training rows and, if test.csv has no label column, an
# all-NaN column among the test inputs.
feature_df = full_df.drop(columns=[target])
train_features = feature_df.iloc[:n_train].to_numpy(dtype=np.float64)
train_labels = full_df[target].iloc[:n_train]

# 80/20 train/validation split with a fixed seed for reproducibility.
train_x_split, test_x_split, train_y_split, test_y_split = train_test_split(
    train_features, train_labels, test_size=0.2, random_state=42
)

# Labels are converted to NumPy before building tensors: after the split the
# Series carry a shuffled index, and handing such a Series straight to
# torch.tensor goes through label-based element access rather than a plain
# array conversion.
train_x_tensor = torch.tensor(train_x_split, dtype=torch.float64)
train_y_tensor = torch.tensor(train_y_split.to_numpy(dtype=np.float64), dtype=torch.float64).unsqueeze(1)
valid_x_tensor = torch.tensor(test_x_split, dtype=torch.float64)
valid_y_tensor = torch.tensor(test_y_split.to_numpy(dtype=np.float64), dtype=torch.float64).unsqueeze(1)

# Test-set inputs use exactly the same feature columns as the training inputs.
test_x_tensor = torch.tensor(feature_df.iloc[n_train:].to_numpy(dtype=np.float64), dtype=torch.float64)

train_dataset = TensorDataset(train_x_tensor, train_y_tensor)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [None]:
class LogistcRegression(torch.nn.Module):
    """A single linear layer followed by an element-wise sigmoid.

    Args:
        input_dim: number of input features passed to the linear layer.
        output_dim: number of outputs produced by the linear layer.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()

        # Parameters are created in float64.
        self.linear = torch.nn.Linear(
            in_features=input_dim,
            out_features=output_dim,
            dtype=torch.float64,
        )

    def forward(self, x):
        """Return sigmoid(linear(x))."""
        logits = self.linear(x)
        return logits.sigmoid()


In [None]:
# Number of full passes over train_dataloader.
num_epochs = 1

# Input width follows the training feature matrix; the model has one output unit.
model = LogistcRegression(input_dim=train_x_tensor.shape[1], output_dim=1)

# Binary cross-entropy applied to the model's sigmoid outputs.
criterion = torch.nn.BCELoss()

optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)

In [None]:
# Loss history, one entry per mini-batch: the training loss on that batch and
# the loss on the whole validation set measured just before the weight update.
train_losses = []
valid_losses = []
for epoch in range(num_epochs):
    # The label batch is called batch_y rather than `target` so that the loop
    # does not overwrite the notebook-level `target` column name with a tensor.
    for batch_x, batch_y in train_dataloader:
        y_pred = model(batch_x)
        loss = criterion(y_pred, batch_y)

        # Validation is for monitoring only. no_grad() avoids building an
        # autograd graph over the full validation set on every batch.
        with torch.no_grad():
            valid_pred = model(valid_x_tensor)
            valid_loss = criterion(valid_pred, valid_y_tensor)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())
        valid_losses.append(valid_loss.item())

    # Report every 100 epochs and always after the final epoch; with only the
    # modulo test nothing is ever printed when num_epochs is below 100.
    if (epoch + 1) % 100 == 0 or epoch + 1 == num_epochs:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
        print(f"Validation Loss: {valid_loss.item():.4f}")



In [None]:
# Training loss recorded for the last mini-batch that was processed.
print(train_losses[-1])

In [None]:
import matplotlib.pyplot as plt

# Draw both loss histories on one full-width axes. The earlier
# plt.subplot(1, 2, 1) call reserved a two-panel grid and used only the left
# half, and the curves had no legend or axis labels to tell them apart.
fig, ax = plt.subplots()

# Each list holds one value per training mini-batch, so the x axis counts batches.
ax.plot(train_losses, linestyle='-', color='blue', label='train')
ax.plot(valid_losses, linestyle='-', color='red', label='validation')
ax.set_xlabel('Batch')
ax.set_ylabel('BCE loss')
ax.set_title('Training vs. validation loss')
ax.legend()
plt.show()
