In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')


# Read student_data.csv into df_student, then report its shape, its first
# rows and the number of missing values per column.
# NOTE(review): the path is absolute and machine-specific — consider making
# the data directory configurable.
student_data = r"C:\Users\MK\Desktop\task\task\student_data.csv"
df_student = pd.read_csv(student_data)
print(
    df_student.shape,
    df_student.head(),
    df_student.isna().sum(),
    sep="\n",
)

(8824678, 14)
   sessionId  timestamp      x      y       z   ppgValue  hr  hrIbi  hrStatus  \
0          1          8  653.0  195.0  4324.0  1917829.0 NaN    NaN       NaN   
1          1         50  659.0  198.0  4096.0  1917495.0 NaN    NaN       NaN   
2          1         89  645.0  149.0  4054.0  1917365.0 NaN    NaN       NaN   
3          1        128  752.0  213.0  4090.0  1918146.0 NaN    NaN       NaN   
4          1        169  652.0  232.0  4088.0  1919319.0 NaN    NaN       NaN   

   ibiStatus  notification  engagement affect context  
0        NaN           NaN         NaN    NaN     NaN  
1        NaN           NaN         NaN    NaN     NaN  
2        NaN           NaN         NaN    NaN     NaN  
3        NaN           NaN         NaN    NaN     NaN  
4        NaN           NaN         NaN    NaN     NaN  
sessionId             0
timestamp             0
x                338892
y                338892
z                338892
ppgValue         338892
hr              848

In [2]:
# Count how often each hrStatus value occurs in df_student.
# value_counts() leaves NaN rows out of the tally by default.
hr_status_counts = df_student['hrStatus'].value_counts()

# Report the number of rows whose hrStatus is 0 and 1.
# .get(key, 0) prints a count of 0 instead of raising KeyError when a status
# value never occurs in the data.
print("hrStatus 列中 0 的数量:", hr_status_counts.get(0, 0))
print("hrStatus 列中 1 的数量:", hr_status_counts.get(1, 0))


hrStatus 列中 0 的数量: 2321
hrStatus 列中 1 的数量: 195585


In [3]:
# Read student_skeleton.csv into df_student_skeleton, then report its shape,
# its first rows and the number of missing values per column.
# NOTE(review): the path is absolute and machine-specific — consider making
# the data directory configurable.
student_skeleton = r"C:\Users\MK\Desktop\task\task\student_skeleton.csv"
df_student_skeleton = pd.read_csv(student_skeleton)
print(
    df_student_skeleton.shape,
    df_student_skeleton.head(),
    df_student_skeleton.isna().sum(),
    sep="\n",
)

(193, 4)
   sessionId  timestamp  affect  context
0          1    1652042    True    False
1          1    1658301   False     True
2          1    5914412    True    False
3          1    5921368   False     True
4          2     629724    True    False
sessionId    0
timestamp    0
affect       0
context      0
dtype: int64


In [4]:
# Read session_info.csv into df_session_info, then report its shape, its
# first rows and the number of missing values per column.
# NOTE(review): the path is absolute and machine-specific — consider making
# the data directory configurable.
session_info = r"C:\Users\MK\Desktop\task\task\session_info.csv"
df_session_info = pd.read_csv(session_info)
print(
    df_session_info.shape,
    df_session_info.head(),
    df_session_info.isna().sum(),
    sep="\n",
)

(58, 6)
   id  duration watchId    age  gender  fairNumber
0   1   8021351    0Q3V  18-25  female           1
1   2   3752526    0Q3V  26-30    male           1
2   3   3989497    0QED  18-25    male           1
3   4   4979198    0QED  18-25    male           1
4   5   4836199    0QVM  18-25    male           1
id            0
duration      0
watchId       0
age           0
gender        0
fairNumber    0
dtype: int64


In [5]:
# Select the 'id' column of df_session_info as a Series.
id_column = df_session_info.loc[:, 'id']

# Print every value of the id column.
print(id_column)


0      1
1      2
2      3
3      4
4      5
5      6
6      7
7      8
8      9
9     10
10    11
11    12
12    13
13    14
14    15
15    16
16    17
17    18
18    19
19    20
20    21
21    22
22    23
23    24
24    25
25    26
26    27
27    28
28    29
29    30
30    31
31    32
32    33
33    34
34    35
35    36
36    37
37    38
38    39
39    40
40    41
41    42
42    43
43    44
44    45
45    46
46    47
47    48
48    49
49    50
50    51
51    52
52    53
53    54
54    55
55    56
56    57
57    58
Name: id, dtype: int64


In [6]:
# Keep a reference to df_student's column labels; the bare name on the last
# line makes the notebook display the resulting Index.
col_names = df_student.columns
col_names

Index(['sessionId', 'timestamp', 'x', 'y', 'z', 'ppgValue', 'hr', 'hrIbi',
       'hrStatus', 'ibiStatus', 'notification', 'engagement', 'affect',
       'context'],
      dtype='object')

In [7]:
# Tally the distinct values of the two label columns of df_student
# (value_counts() skips NaN rows by default).
affect_info = df_student["affect"].value_counts()
context_info = df_student["context"].value_counts()

# Print each tally under its own heading, one item per line.
print(
    "affect 列的信息:",
    affect_info,
    "context 列的信息:",
    context_info,
    sep="\n",
)

affect 列的信息:
HAPPY      114
RELAXED    112
SAD          9
ANGRY        5
Name: affect, dtype: int64
context 列的信息:
CONVERSATION    85
OTHER           66
WALKING         48
VIEW_BOOTH      34
Name: context, dtype: int64


In [8]:
# One-hot encode the affect and context columns; every indicator column is
# named "<prefix>_<value>".
affect_dummies = pd.get_dummies(df_student['affect'], prefix='affect')
context_dummies = pd.get_dummies(df_student['context'], prefix='context')

# Assemble the encoded frame: df_student without its original affect/context
# columns, followed by the affect indicators and then the context indicators.
df_encoded = pd.concat(
    [
        df_student.drop(columns=['affect', 'context']),
        affect_dummies,
        context_dummies,
    ],
    axis=1,
)

# Preview the encoded data.
print(df_encoded.head())

   sessionId  timestamp      x      y       z   ppgValue  hr  hrIbi  hrStatus  \
0          1          8  653.0  195.0  4324.0  1917829.0 NaN    NaN       NaN   
1          1         50  659.0  198.0  4096.0  1917495.0 NaN    NaN       NaN   
2          1         89  645.0  149.0  4054.0  1917365.0 NaN    NaN       NaN   
3          1        128  752.0  213.0  4090.0  1918146.0 NaN    NaN       NaN   
4          1        169  652.0  232.0  4088.0  1919319.0 NaN    NaN       NaN   

   ibiStatus  notification  engagement  affect_ANGRY  affect_HAPPY  \
0        NaN           NaN         NaN             0             0   
1        NaN           NaN         NaN             0             0   
2        NaN           NaN         NaN             0             0   
3        NaN           NaN         NaN             0             0   
4        NaN           NaN         NaN             0             0   

   affect_RELAXED  affect_SAD  context_CONVERSATION  context_OTHER  \
0               0     

In [9]:
# Fill missing values in the x, y, z and ppgValue columns of df_student using
# pandas' default interpolation, writing the result back into the same columns.
# NOTE(review): interpolation runs down the whole frame and is not grouped by
# sessionId — confirm that filling across session boundaries is acceptable.
sensor_columns = ['x', 'y', 'z', 'ppgValue']
df_student[sensor_columns] = df_student[sensor_columns].interpolate()
111