In [2]:
# encoding = 'utf-8'

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score, f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression  # 导入逻辑回归模型
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [3]:
df = pd.read_csv('no_missing_data.csv', header=0, encoding='utf-8')

df

FileNotFoundError: [Errno 2] No such file or directory: 'no_missing_data.csv'

In [5]:
df.shape

(4833, 52)

In [6]:
# 要交换的两行的索引
index1, index2 = 14, 0

# 执行交换
df.loc[index1], df.loc[index2] = df.loc[index2].copy(), df.loc[index1].copy()

# 删除不需要的列
columns_to_drop = ['StudentNumber','useraccount_id', 'SchoolName', 'AlgScaleScore']
df = df.drop(columns=columns_to_drop, axis=1)

# 定义变量类型
binary_variables =  ['RetakerFlag','FRL_Status', 'Sex', 'Race_Indicator', 'Hispanic_Indicator']
nominal_variables = ['SchoolNumber']
continuous_variables = ['TotalNumberofAbsences', 'FSAMath_2018_AchievementLevel','FSAMath_2018_ScaleScore','sum_session', 'total_question_time_taken', 'total_number_of_questions', 'video_id', 'Unnamed: 0_x', 'video_completed', 'video_pause', 'video_play', 'video_seek', 'avg_user_gave_correct_answer' ,'tys_finish', 'tys_previous',  'tys_review_incorrect_question', 'tys_review_solution_video','wall_page_load']

# 创建预处理器和模型
continuous_preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', continuous_preprocessor, continuous_variables),
        # ('cat', OneHotEncoder(handle_unknown='ignore'), nominal_variables)
    ])

# 定义逻辑回归模型
logistic_model = LogisticRegression(max_iter=1000)  # 增加迭代次数以确保收敛

# 创建完整的pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', logistic_model)  # 使用逻辑回归作为分类器
])

# X = df.drop('risk', axis=1)
columns_to_keep =  binary_variables +  nominal_variables + continuous_variables # 
# print(columns_to_keep)
# 删除不在合并列表中的列
X = df[columns_to_keep]

y = df['risk']

In [7]:
X

Unnamed: 0,RetakerFlag,FRL_Status,Sex,Race_Indicator,Hispanic_Indicator,SchoolNumber,TotalNumberofAbsences,FSAMath_2018_AchievementLevel,FSAMath_2018_ScaleScore,sum_session,...,video_completed,video_pause,video_play,video_seek,avg_user_gave_correct_answer,tys_finish,tys_previous,tys_review_incorrect_question,tys_review_solution_video,wall_page_load
0,0,1,1,1.0,1,251,5,2.0,328.0,20.0,...,20.0,298.0,272.0,608.0,0.100000,69.0,14.0,92.0,1.0,54.0
1,0,1,0,1.0,0,1681,2,3.0,349.0,10.0,...,2.0,60.0,58.0,120.0,0.100000,0.0,0.0,0.0,0.0,3.0
2,0,0,1,1.0,1,3971,5,4.0,359.0,10.0,...,32.0,352.0,294.0,3878.0,0.700000,0.0,0.0,0.0,0.0,16.0
3,0,1,0,1.0,0,171,7,1.0,307.0,20.0,...,9.0,176.0,173.0,72.0,0.100000,3.0,7.0,5.0,0.0,43.0
4,0,1,0,1.0,0,1451,8,3.0,346.0,40.0,...,45.0,331.0,378.0,335.0,0.350000,11.0,0.0,9.0,20.0,40.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4828,0,0,0,1.0,1,3731,9,3.0,351.0,80.0,...,4.0,32.0,34.0,4.0,0.771429,4.0,1.0,17.0,0.0,11.0
4829,0,1,1,0.0,0,3101,6,5.0,380.0,60.0,...,0.0,26.0,20.0,4.0,0.600000,2.0,3.0,0.0,0.0,15.0
4830,0,1,1,0.0,0,251,4,5.0,360.0,30.0,...,18.0,946.0,802.0,1130.0,0.600000,76.0,62.0,149.0,0.0,151.0
4831,0,1,1,1.0,1,403,1,3.0,345.0,40.0,...,8.0,32.0,72.0,72.0,0.600000,0.0,0.0,0.0,0.0,4.0


In [8]:
y

0       1
1       1
2       1
3       1
4       1
       ..
4828    1
4829    1
4830    1
4831    1
4832    0
Name: risk, Length: 4833, dtype: int64

In [10]:
df[columns_to_keep + ['risk']]

Unnamed: 0,RetakerFlag,FRL_Status,Sex,Race_Indicator,Hispanic_Indicator,SchoolNumber,TotalNumberofAbsences,FSAMath_2018_AchievementLevel,FSAMath_2018_ScaleScore,sum_session,...,video_pause,video_play,video_seek,avg_user_gave_correct_answer,tys_finish,tys_previous,tys_review_incorrect_question,tys_review_solution_video,wall_page_load,risk
0,0,1,1,1.0,1,251,5,2.0,328.0,20.0,...,298.0,272.0,608.0,0.100000,69.0,14.0,92.0,1.0,54.0,1
1,0,1,0,1.0,0,1681,2,3.0,349.0,10.0,...,60.0,58.0,120.0,0.100000,0.0,0.0,0.0,0.0,3.0,1
2,0,0,1,1.0,1,3971,5,4.0,359.0,10.0,...,352.0,294.0,3878.0,0.700000,0.0,0.0,0.0,0.0,16.0,1
3,0,1,0,1.0,0,171,7,1.0,307.0,20.0,...,176.0,173.0,72.0,0.100000,3.0,7.0,5.0,0.0,43.0,1
4,0,1,0,1.0,0,1451,8,3.0,346.0,40.0,...,331.0,378.0,335.0,0.350000,11.0,0.0,9.0,20.0,40.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4828,0,0,0,1.0,1,3731,9,3.0,351.0,80.0,...,32.0,34.0,4.0,0.771429,4.0,1.0,17.0,0.0,11.0,1
4829,0,1,1,0.0,0,3101,6,5.0,380.0,60.0,...,26.0,20.0,4.0,0.600000,2.0,3.0,0.0,0.0,15.0,1
4830,0,1,1,0.0,0,251,4,5.0,360.0,30.0,...,946.0,802.0,1130.0,0.600000,76.0,62.0,149.0,0.0,151.0,1
4831,0,1,1,1.0,1,403,1,3.0,345.0,40.0,...,32.0,72.0,72.0,0.600000,0.0,0.0,0.0,0.0,4.0,1


### 一、原始lr模型

In [33]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# 设置随机种子以确保结果可复现
torch.manual_seed(0)

<torch._C.Generator at 0x7f9370f5eed0>

In [34]:
data = df[columns_to_keep + ['risk']]
data.shape

(4833, 25)

In [35]:
data.head()

Unnamed: 0,RetakerFlag,FRL_Status,Sex,Race_Indicator,Hispanic_Indicator,SchoolNumber,TotalNumberofAbsences,FSAMath_2018_AchievementLevel,FSAMath_2018_ScaleScore,sum_session,...,video_pause,video_play,video_seek,avg_user_gave_correct_answer,tys_finish,tys_previous,tys_review_incorrect_question,tys_review_solution_video,wall_page_load,risk
0,0,1,1,1.0,1,251,5,2.0,328.0,20.0,...,298.0,272.0,608.0,0.1,69.0,14.0,92.0,1.0,54.0,1
1,0,1,0,1.0,0,1681,2,3.0,349.0,10.0,...,60.0,58.0,120.0,0.1,0.0,0.0,0.0,0.0,3.0,1
2,0,0,1,1.0,1,3971,5,4.0,359.0,10.0,...,352.0,294.0,3878.0,0.7,0.0,0.0,0.0,0.0,16.0,1
3,0,1,0,1.0,0,171,7,1.0,307.0,20.0,...,176.0,173.0,72.0,0.1,3.0,7.0,5.0,0.0,43.0,1
4,0,1,0,1.0,0,1451,8,3.0,346.0,40.0,...,331.0,378.0,335.0,0.35,11.0,0.0,9.0,20.0,40.0,1


In [39]:
X = data.drop('risk',1)
y = data[['risk']]

X = torch.tensor(np.array(X)).float()
y = torch.tensor(np.array(y).flatten()).float()

In [None]:
# 划分数据集
train_size = int(num_samples * 0.8)
test_size = num_samples - train_size
train_dataset = TensorDataset(X[:train_size], y[:train_size])
test_dataset = TensorDataset(X[train_size:], y[train_size:])
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# 定义逻辑回归模型
class LogisticRegressionModel(nn.Module):
    def __init__(self, input_dim):
        super(LogisticRegressionModel, self).__init__()
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        outputs = torch.sigmoid(self.linear(x))
        return outputs

model = LogisticRegressionModel(num_features)

# 损失函数和优化器
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 训练模型
num_epochs = 10
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels.view(-1, 1))
        loss.backward()
        optimizer.step()

# 评估模型
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in test_loader:
        outputs = model(inputs)
        predicted = (outputs.data > 0.5).float()
        total += labels.size(0)
        correct += (predicted.view(-1) == labels).sum().item()

accuracy = 100 * correct / total
print(f'Accuracy: {accuracy}%')

In [41]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# 评估模型
model.eval()
y_true = []
y_pred = []

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        predicted = (outputs.data > 0.5).float().view(-1)
        y_true.extend(labels.tolist())
        y_pred.extend(predicted.tolist())

# 计算评价指标
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# 生成混淆矩阵
conf_matrix = confusion_matrix(y_true, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Confusion Matrix:\n{conf_matrix}')

Accuracy: 0.6804550155118925
Precision: 0.6804550155118925
Recall: 1.0
F1 Score: 0.8098461538461539
Confusion Matrix:
[[  0 309]
 [  0 658]]


### 二、改进lr模型-添加loss

In [49]:
# ['FRL_Status', 'Sex', 'Race_Indicator', 'Hispanic_Indicator'] 这几个特征在的位置为1,2,3,4,所以索引列表为[1,2,3,4]
sensitive_feature_indices = [1,2,3,4]

In [46]:
# 构建逻辑回归模型
class LogisticRegressionModel(nn.Module):
    def __init__(self, input_dim):
        super(LogisticRegressionModel, self).__init__()
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        outputs = torch.sigmoid(self.linear(x))
        return outputs

model = LogisticRegressionModel(num_features)

# 自定义损失函数
def custom_loss(outputs, labels, inputs, lambda_fairness=0.01):
    # 基本损失：二元交叉熵
    criterion = nn.BCELoss()
    loss = criterion(outputs, labels.view(-1, 1))

    # 添加公平性损失
    fairness_loss = 0
    for idx in sensitive_feature_indices:
        fairness_loss += calculate_group_fairness_metric(outputs, labels, inputs[:, idx])
    total_loss = loss + lambda_fairness * fairness_loss

    return total_loss

def calculate_group_fairness_metric(outputs, labels, sensitive_feature):
    epsilon = 1e-6  # 小的常数以防止除以零

    # 将数据分为两个群体
    group_0 = (sensitive_feature == 0)
    group_1 = (sensitive_feature == 1)

    # 计算每个群体的 FPR 和 FNR
    fpr_0 = torch.mean(outputs[group_0 & (labels == 0)]) if group_0.any() and (labels[group_0] == 0).any() else torch.tensor(0.0)
    fnr_0 = torch.mean(1 - outputs[group_0 & (labels == 1)]) if group_0.any() and (labels[group_0] == 1).any() else torch.tensor(0.0)
    fpr_1 = torch.mean(outputs[group_1 & (labels == 0)]) if group_1.any() and (labels[group_1] == 0).any() else torch.tensor(0.0)
    fnr_1 = torch.mean(1 - outputs[group_1 & (labels == 1)]) if group_1.any() and (labels[group_1] == 1).any() else torch.tensor(0.0)

    # 计算公平性指标
    # return torch.abs(fpr_0 - fpr_1) + torch.abs(fnr_0 - fnr_1) + epsilon
    return torch.abs(fnr_0 * fpr_1/(fpr_0 * fnr_1 + epsilon)-1)


# 优化器
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 训练模型
num_epochs = 10
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = custom_loss(outputs, labels, inputs)
        loss.backward()
        optimizer.step()

In [47]:
# 评估模型
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in test_loader:
        outputs = model(inputs)
        predicted = (outputs.data > 0.5).float()
        total += labels.size(0)
        correct += (predicted.view(-1) == labels).sum().item()

accuracy = 100 * correct / total
print(f'Accuracy: {accuracy}%')

Accuracy: 67.94208893485005%


In [48]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# 评估模型
model.eval()
y_true = []
y_pred = []

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        predicted = (outputs.data > 0.5).float().view(-1)
        y_true.extend(labels.tolist())
        y_pred.extend(predicted.tolist())

# 计算评价指标
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# 生成混淆矩阵
conf_matrix = confusion_matrix(y_true, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Confusion Matrix:\n{conf_matrix}')

Accuracy: 0.6794208893485005
Precision: 0.6801242236024845
Recall: 0.9984802431610942
F1 Score: 0.8091133004926109
Confusion Matrix:
[[  0 309]
 [  1 657]]
