In [85]:
import pandas as pd

# Load the dataset from a CSV file in the working directory into `df`,
# the frame that the following cells inspect and clean.
csv_path = 'stress_detection.csv'
df = pd.read_csv(csv_path)

# Display the first 50 rows as a quick look at the raw data.
df.head(50)

Unnamed: 0,participant_id,day,PSS_score,Openness,Conscientiousness,Extraversion,Agreeableness,Neuroticism,sleep_time,wake_time,sleep_duration,PSQI_score,call_duration,num_calls,num_sms,screen_on_time,skin_conductance,accelerometer,mobility_radius,mobility_distance
0,1,1,34,2.322732,4.332193,1.185878,1.570213,3.782094,7.726792,5.19066,6.572069,1,3.924527,12,32,10.703714,3.11573,0.161717,1.145179,2.196851
1,1,2,37,1.761436,3.25412,3.907281,4.072512,1.997145,7.312674,6.170717,8.030168,4,58.318004,3,41,11.012939,0.959144,0.985587,1.021133,0.737825
2,1,3,30,3.025887,1.855002,2.0459,2.317493,3.619225,6.99206,5.318825,7.10242,1,4.941043,4,48,4.877372,3.311629,1.877445,0.478179,0.911673
3,1,4,16,1.94837,4.966676,3.345225,1.607756,3.583524,8.886914,8.061075,8.123294,3,0.295373,11,38,3.462956,0.625721,0.494921,0.630549,3.911418
4,1,5,32,3.343484,2.065936,3.137843,2.118061,2.567347,7.811705,7.312145,7.785143,3,22.300571,17,17,4.861046,0.622609,1.3426,0.25409,1.605132
5,1,6,28,3.31914,4.141191,2.934202,1.416814,3.901544,8.79954,8.602325,8.030797,4,55.45985,4,14,7.883468,4.901522,0.539268,1.461828,3.927181
6,1,7,39,2.893623,1.909251,3.731296,1.914267,3.165248,8.251176,8.029626,7.944314,3,28.972359,4,3,9.096195,4.949273,1.486665,1.017602,2.740476
7,1,8,19,1.437202,1.816345,4.170692,3.716547,1.645627,8.433565,6.427281,8.558427,3,47.342943,3,37,1.395497,3.012052,1.853035,0.546391,4.715504
8,1,9,22,3.279301,3.968729,1.764378,4.443119,1.326407,8.307685,8.034998,6.134476,1,36.755438,3,16,5.377928,3.147191,1.357896,0.475016,1.546705
9,1,10,15,4.604192,4.025498,1.137726,3.67764,4.682931,8.097748,7.456953,7.248696,4,3.391413,7,31,7.723512,1.241725,2.356403,0.794373,2.206041


In [87]:
# Data understanding
# Print the dtype pandas inferred for every column of df.
print(df.dtypes)

participant_id         int64
day                    int64
PSS_score              int64
Openness             float64
Conscientiousness    float64
Extraversion         float64
Agreeableness        float64
Neuroticism          float64
sleep_time           float64
wake_time            float64
sleep_duration       float64
PSQI_score             int64
call_duration        float64
num_calls              int64
num_sms                int64
screen_on_time       float64
skin_conductance     float64
accelerometer        float64
mobility_radius      float64
mobility_distance    float64
dtype: object


In [None]:
#Data cleaning procedure
#1. Analyzing columns independently

In [89]:
# a. Identifying missing values
# Count the null entries in each column; any non-zero count would have to be
# handled (imputed or dropped) before further analysis.
df.isna().sum()  # no missing values are detected

participant_id       0
day                  0
PSS_score            0
Openness             0
Conscientiousness    0
Extraversion         0
Agreeableness        0
Neuroticism          0
sleep_time           0
wake_time            0
sleep_duration       0
PSQI_score           0
call_duration        0
num_calls            0
num_sms              0
screen_on_time       0
skin_conductance     0
accelerometer        0
mobility_radius      0
mobility_distance    0
dtype: int64

In [91]:
# b. Fix inconsistent or logical errors
# The sleep_time column as loaded seems to hold non-logical values, so it is
# overwritten with the time at which a person went to sleep, derived as
# wake_time - sleep_duration.
# A negative difference means the person fell asleep before midnight; the
# modulo wraps it back onto the 24-hour clock (e.g. -1.38 -> 22.62).
# Both inputs are presumably expressed in hours — TODO confirm against the data dictionary.
# The vectorized `% 24` replaces a row-wise apply/lambda: it gives the same
# result for differences in [-24, 24) and also wraps values outside that span.
# The cell only reads wake_time and sleep_duration, so re-running it is safe.
df['sleep_time'] = (df['wake_time'] - df['sleep_duration']) % 24

In [61]:
# Preview the first 20 rows, presumably to inspect the recomputed sleep_time column.
# NOTE(review): this cell's execution count (61) is lower than the preceding cell's (91),
# and its saved output shows skin_conductance/accelerometer values that differ from the
# other previews in this notebook — the output looks stale. Re-run with
# "Restart Kernel -> Run All" to refresh it.
df.head(20)

Unnamed: 0,participant_id,day,PSS_score,Openness,Conscientiousness,Extraversion,Agreeableness,Neuroticism,sleep_time,wake_time,sleep_duration,PSQI_score,call_duration,num_calls,num_sms,screen_on_time,skin_conductance,accelerometer,mobility_radius,mobility_distance
0,1,1,34,2.322732,4.332193,1.185878,1.570213,3.782094,22.618591,5.19066,6.572069,1,3.924527,12,32,10.703714,0.581241,0.025395,1.145179,2.196851
1,1,2,37,1.761436,3.25412,3.907281,4.072512,1.997145,22.140549,6.170717,8.030168,4,58.318004,3,41,11.012939,0.101734,0.368795,1.021133,0.737825
2,1,3,30,3.025887,1.855002,2.0459,2.317493,3.619225,22.216405,5.318825,7.10242,1,4.941043,4,48,4.877372,0.624798,0.740533,0.478179,0.911673
3,1,4,16,1.94837,4.966676,3.345225,1.607756,3.583524,23.937781,8.061075,8.123294,3,0.295373,11,38,3.462956,0.027599,0.164279,0.630549,3.911418
4,1,5,32,3.343484,2.065936,3.137843,2.118061,2.567347,23.527002,7.312145,7.785143,3,22.300571,17,17,4.861046,0.026907,0.517603,0.25409,1.605132
5,1,6,28,3.31914,4.141191,2.934202,1.416814,3.901544,0.571527,8.602325,8.030797,4,55.45985,4,14,7.883468,0.978303,0.182763,1.461828,3.927181
6,1,7,39,2.893623,1.909251,3.731296,1.914267,3.165248,0.085312,8.029626,7.944314,3,28.972359,4,3,9.096195,0.98892,0.577651,1.017602,2.740476
7,1,8,19,1.437202,1.816345,4.170692,3.716547,1.645627,21.868854,6.427281,8.558427,3,47.342943,3,37,1.395497,0.558188,0.730359,0.546391,4.715504
8,1,9,22,3.279301,3.968729,1.764378,4.443119,1.326407,1.900522,8.034998,6.134476,1,36.755438,3,16,5.377928,0.588236,0.523978,0.475016,1.546705
9,1,10,15,4.604192,4.025498,1.137726,3.67764,4.682931,0.208257,7.456953,7.248696,4,3.391413,7,31,7.723512,0.164564,0.940169,0.794373,2.206041


In [93]:
# c. Check the range of every column and look for values that are out of range
# Walk the frame one column at a time and report its smallest and largest value.
for column, values in df.items():
    min_value, max_value = values.min(), values.max()
    print(f"Column: {column}")
    print(f"Range: {min_value} to {max_value}\n")


Column: participant_id
Range: 1 to 100

Column: day
Range: 1 to 30

Column: PSS_score
Range: 10 to 39

Column: Openness
Range: 1.005003488 to 4.997404661

Column: Conscientiousness
Range: 1.000982489 to 4.9991371

Column: Extraversion
Range: 1.000584289 to 4.997641772

Column: Agreeableness
Range: 1.002205904 to 4.99988084

Column: Neuroticism
Range: 1.000172846 to 4.996408108

Column: sleep_time
Range: 0.0016942770000003549 to 23.998365478

Column: wake_time
Range: 5.00192651 to 8.998370732

Column: sleep_duration
Range: 6.000560973 to 8.999061409

Column: PSQI_score
Range: 1 to 4

Column: call_duration
Range: 0.002886141 to 59.98307358

Column: num_calls
Range: 0 to 19

Column: num_sms
Range: 0 to 49

Column: screen_on_time
Range: 1.006874242 to 11.99787122

Column: skin_conductance
Range: 0.501594763 to 4.999103926

Column: accelerometer
Range: 0.100790508 to 2.499946388

Column: mobility_radius
Range: 0.100041356 to 1.499889808

Column: mobility_distance
Range: 0.501621966 to 4.999

In [77]:
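# d. Normalize data ranges for variables that have different ranges or units:
# skin_conductance, accelerometer, mobility_radius, mobility_distance
# NOTE: this step is currently disabled (the code below is commented out), so these columns keep their raw ranges.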
#from sklearn.preprocessing import MinMaxScaler
#scaler = MinMaxScaler()
#df[['skin_conductance', 'accelerometer', 'mobility_radius', 'mobility_distance']] = scaler.fit_transform(df[['skin_conductance', 'accelerometer', 'mobility_radius', 'mobility_distance']])

In [97]:
# e. Check for duplicates
# Mark every row that repeats an earlier row across all columns, then display
# only the marked rows (an empty frame means there is nothing to drop).
duplicate_rows = df.duplicated()
df.loc[duplicate_rows]
# no duplicates were found

Unnamed: 0,participant_id,day,PSS_score,Openness,Conscientiousness,Extraversion,Agreeableness,Neuroticism,sleep_time,wake_time,sleep_duration,PSQI_score,call_duration,num_calls,num_sms,screen_on_time,skin_conductance,accelerometer,mobility_radius,mobility_distance


In [99]:
# Search for outliers in each attribute with the 1.5 * IQR rule:
# a value is flagged when it lies below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.
outliers_dict = {}  # column name -> DataFrame holding the rows flagged for that column

for column in df.columns:
    # Fetch both quartiles with a single quantile call.
    quartiles = df[column].quantile([0.25, 0.75])
    Q1 = quartiles.iloc[0]
    Q3 = quartiles.iloc[1]
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # A row is an outlier for this column when its value falls outside either fence.
    too_low = df[column] < lower_bound
    too_high = df[column] > upper_bound
    outliers = df.loc[too_low | too_high]

    # Keep the flagged rows so they can be looked up by column later.
    outliers_dict[column] = outliers

    # Report the fences, the number of flagged rows, and the rows themselves.
    print(f"Column: {column}")
    print(f"Lower Bound: {lower_bound}, Upper Bound: {upper_bound}")
    print(f"Number of Outliers: {len(outliers)}")
    print(outliers, '\n')

# no outliers were found

Column: participant_id
Lower Bound: -48.5, Upper Bound: 149.5
Number of Outliers: 0
Empty DataFrame
Columns: [participant_id, day, PSS_score, Openness, Conscientiousness, Extraversion, Agreeableness, Neuroticism, sleep_time, wake_time, sleep_duration, PSQI_score, call_duration, num_calls, num_sms, screen_on_time, skin_conductance, accelerometer, mobility_radius, mobility_distance]
Index: [] 

Column: day
Lower Bound: -14.5, Upper Bound: 45.5
Number of Outliers: 0
Empty DataFrame
Columns: [participant_id, day, PSS_score, Openness, Conscientiousness, Extraversion, Agreeableness, Neuroticism, sleep_time, wake_time, sleep_duration, PSQI_score, call_duration, num_calls, num_sms, screen_on_time, skin_conductance, accelerometer, mobility_radius, mobility_distance]
Index: [] 

Column: PSS_score
Lower Bound: -5.5, Upper Bound: 54.5
Number of Outliers: 0
Empty DataFrame
Columns: [participant_id, day, PSS_score, Openness, Conscientiousness, Extraversion, Agreeableness, Neuroticism, sleep_time, wa

In [101]:
# Preview the first five rows (head's default) of df after the preceding cleaning steps.
df.head()

Unnamed: 0,participant_id,day,PSS_score,Openness,Conscientiousness,Extraversion,Agreeableness,Neuroticism,sleep_time,wake_time,sleep_duration,PSQI_score,call_duration,num_calls,num_sms,screen_on_time,skin_conductance,accelerometer,mobility_radius,mobility_distance
0,1,1,34,2.322732,4.332193,1.185878,1.570213,3.782094,22.618591,5.19066,6.572069,1,3.924527,12,32,10.703714,3.11573,0.161717,1.145179,2.196851
1,1,2,37,1.761436,3.25412,3.907281,4.072512,1.997145,22.140549,6.170717,8.030168,4,58.318004,3,41,11.012939,0.959144,0.985587,1.021133,0.737825
2,1,3,30,3.025887,1.855002,2.0459,2.317493,3.619225,22.216405,5.318825,7.10242,1,4.941043,4,48,4.877372,3.311629,1.877445,0.478179,0.911673
3,1,4,16,1.94837,4.966676,3.345225,1.607756,3.583524,23.937781,8.061075,8.123294,3,0.295373,11,38,3.462956,0.625721,0.494921,0.630549,3.911418
4,1,5,32,3.343484,2.065936,3.137843,2.118061,2.567347,23.527002,7.312145,7.785143,3,22.300571,17,17,4.861046,0.622609,1.3426,0.25409,1.605132
