In [134]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

#**Data Collection**

In [135]:
# Membaca dataset
file_path = 'Sleep_health_and_lifestyle_dataset.csv'
data = pd.read_csv(file_path)

In [136]:
# Menampilkan nama kolom dan beberapa baris pertama
print("Nama Kolom:")
print(data.columns)

Nama Kolom:
Index(['Person ID', 'Gender', 'Age', 'Occupation', 'Sleep Duration',
       'Quality of Sleep', 'Physical Activity Level', 'Stress Level',
       'BMI Category', 'Blood Pressure', 'Heart Rate', 'Daily Steps',
       'Sleep Disorder'],
      dtype='object')


In [137]:
# Menampilkan beberapa baris pertama
print("Preview Dataset:")
print(data.head())

Preview Dataset:
   Person ID Gender  Age            Occupation  Sleep Duration  \
0          1   Male   27     Software Engineer             6.1   
1          2   Male   28                Doctor             6.2   
2          3   Male   28                Doctor             6.2   
3          4   Male   28  Sales Representative             5.9   
4          5   Male   28  Sales Representative             5.9   

   Quality of Sleep  Physical Activity Level  Stress Level BMI Category  \
0                 6                       42             6   Overweight   
1                 6                       60             8       Normal   
2                 6                       60             8       Normal   
3                 4                       30             8        Obese   
4                 4                       30             8        Obese   

  Blood Pressure  Heart Rate  Daily Steps Sleep Disorder  
0         126/83          77         4200            NaN  
1         125/80 

In [138]:
# Menampilkan informasi tentang kolom dataset
print("\nInformasi Dataset:")
print(data.info())


Informasi Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                374 non-null    int64  
 1   Gender                   374 non-null    object 
 2   Age                      374 non-null    int64  
 3   Occupation               374 non-null    object 
 4   Sleep Duration           374 non-null    float64
 5   Quality of Sleep         374 non-null    int64  
 6   Physical Activity Level  374 non-null    int64  
 7   Stress Level             374 non-null    int64  
 8   BMI Category             374 non-null    object 
 9   Blood Pressure           374 non-null    object 
 10  Heart Rate               374 non-null    int64  
 11  Daily Steps              374 non-null    int64  
 12  Sleep Disorder           155 non-null    object 
dtypes: float64(1), int64(7), object(5)
memory usage: 38.1+ KB
No

In [139]:
# Menampilkan statistik deskriptif
print("\nStatistik Deskriptif:")
print(data.describe())


Statistik Deskriptif:
        Person ID         Age  Sleep Duration  Quality of Sleep  \
count  374.000000  374.000000      374.000000        374.000000   
mean   187.500000   42.184492        7.132086          7.312834   
std    108.108742    8.673133        0.795657          1.196956   
min      1.000000   27.000000        5.800000          4.000000   
25%     94.250000   35.250000        6.400000          6.000000   
50%    187.500000   43.000000        7.200000          7.000000   
75%    280.750000   50.000000        7.800000          8.000000   
max    374.000000   59.000000        8.500000          9.000000   

       Physical Activity Level  Stress Level  Heart Rate   Daily Steps  
count               374.000000    374.000000  374.000000    374.000000  
mean                 59.171123      5.385027   70.165775   6816.844920  
std                  20.830804      1.774526    4.135676   1617.915679  
min                  30.000000      3.000000   65.000000   3000.000000  
25%     

In [140]:
# Memeriksa ukuran dataset
print(f"\nUkuran Dataset: {data.shape[0]} baris, {data.shape[1]} kolom")


Ukuran Dataset: 374 baris, 13 kolom


In [141]:
# Mengecek nilai unik untuk kolom kategori
for col in ['Gender', 'Occupation']:
    if col in data.columns:
        print(f"\nNilai Unik di Kolom '{col}':")
        print(data[col].unique())


Nilai Unik di Kolom 'Gender':
['Male' 'Female']

Nilai Unik di Kolom 'Occupation':
['Software Engineer' 'Doctor' 'Sales Representative' 'Teacher' 'Nurse'
 'Engineer' 'Accountant' 'Scientist' 'Lawyer' 'Salesperson' 'Manager']


#**Data Pre-Processing**

***Cek Nilai Kosong***

In [142]:
missing_values = data.isnull().sum()
print("\nJumlah Nilai Kosong per Kolom Sebelum Preprocessing:")
print(missing_values)


Jumlah Nilai Kosong per Kolom Sebelum Preprocessing:
Person ID                    0
Gender                       0
Age                          0
Occupation                   0
Sleep Duration               0
Quality of Sleep             0
Physical Activity Level      0
Stress Level                 0
BMI Category                 0
Blood Pressure               0
Heart Rate                   0
Daily Steps                  0
Sleep Disorder             219
dtype: int64


***Cek dan Tangani Duplikat (Duplicate Rows).***

In [143]:
# semua kolom
duplicate_rows_all = data.duplicated().sum()
print(f"Jumlah Baris Duplikat (Semua Kolom): {duplicate_rows_all}")

Jumlah Baris Duplikat (Semua Kolom): 0


In [144]:
# kolom tertentu
duplicate_rows_subset = data[['Age', 'Gender', 'Stress Level', 'Sleep Duration', 'Quality of Sleep', 'Occupation']].duplicated().sum()
print(f"Jumlah Baris Duplikat (Berdasarkan Kolom Tertentu): {duplicate_rows_subset}")

Jumlah Baris Duplikat (Berdasarkan Kolom Tertentu): 265


In [145]:
# Analisis baris duplikat
duplicates = data[data.duplicated(subset=['Age', 'Gender', 'Stress Level', 'Sleep Duration', 'Quality of Sleep', 'Occupation'], keep=False)]
print(f"\nContoh Baris Duplikat:\n{duplicates.head()}")


Contoh Baris Duplikat:
   Person ID Gender  Age            Occupation  Sleep Duration  \
1          2   Male   28                Doctor             6.2   
2          3   Male   28                Doctor             6.2   
3          4   Male   28  Sales Representative             5.9   
4          5   Male   28  Sales Representative             5.9   
7          8   Male   29                Doctor             7.8   

   Quality of Sleep  Physical Activity Level  Stress Level BMI Category  \
1                 6                       60             8       Normal   
2                 6                       60             8       Normal   
3                 4                       30             8        Obese   
4                 4                       30             8        Obese   
7                 7                       75             6       Normal   

  Blood Pressure  Heart Rate  Daily Steps Sleep Disorder  
1         125/80          75        10000            NaN  
2         

In [146]:
print(f"Jumlah baris setelah memeriksa duplikat: {len(data)}\n")

Jumlah baris setelah memeriksa duplikat: 374



***Penanganan Kelas Target (Insomnia).***

In [147]:
# Memeriksa nama kolom dalam dataset
print("Kolom dalam dataset:")
print(data.columns)

Kolom dalam dataset:
Index(['Person ID', 'Gender', 'Age', 'Occupation', 'Sleep Duration',
       'Quality of Sleep', 'Physical Activity Level', 'Stress Level',
       'BMI Category', 'Blood Pressure', 'Heart Rate', 'Daily Steps',
       'Sleep Disorder'],
      dtype='object')


In [148]:
# Menangani jika kolom Insomnia tidak ditemukan
if 'Insomnia' not in data.columns:
    print("\nKolom 'Insomnia' tidak ditemukan. Harap periksa nama kolom atau gunakan nama yang benar.")
else:
    print("\nDistribusi Kelas Target Sebelum Preprocessing:")
    print(data['Insomnia'].value_counts())


Kolom 'Insomnia' tidak ditemukan. Harap periksa nama kolom atau gunakan nama yang benar.


In [149]:
# Menambahkan Kolom Insomnia
# Misalnya: Sleep Quality < 3 atau Sleep Duration < 5 dianggap insomnia
data['Insomnia'] = ((data['Quality of Sleep'] < 3) | (data['Sleep Duration'] < 5) |
                     (data['Stress Level'] > 6)).astype(int)

In [150]:
print("\nPreview Dataset dengan Kolom Insomnia:")
print(data[['Quality of Sleep', 'Sleep Duration', 'Stress Level', 'Insomnia']].head())


Preview Dataset dengan Kolom Insomnia:
   Quality of Sleep  Sleep Duration  Stress Level  Insomnia
0                 6             6.1             6         0
1                 6             6.2             8         1
2                 6             6.2             8         1
3                 4             5.9             8         1
4                 4             5.9             8         1


In [151]:
# Cek distribusi kelas target setelah pembaruan kriteria
print("\nDistribusi Kelas Target Setelah Preprocessing:")
print(data['Insomnia'].value_counts())


Distribusi Kelas Target Setelah Preprocessing:
Insomnia
0    254
1    120
Name: count, dtype: int64


***Pilih Fitur dan Target.***

In [152]:
# Pilih Fitur dan Target
selected_features = ['Age', 'Gender', 'Stress Level', 'Sleep Duration', 'Quality of Sleep', 'Occupation']
target_column = 'Insomnia'

In [153]:
X = data[selected_features]
y = data[target_column]

***Encoding Kolom Kategori (Label Encoding).***

In [154]:
le_gender = LabelEncoder()
le_occupation = LabelEncoder()

X['Gender'] = le_gender.fit_transform(X['Gender'])
X['Occupation'] = le_occupation.fit_transform(X['Occupation'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Gender'] = le_gender.fit_transform(X['Gender'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Occupation'] = le_occupation.fit_transform(X['Occupation'])


In [155]:
print("\nFitur Setelah Encoding:")
print(X.head())


Fitur Setelah Encoding:
   Age  Gender  Stress Level  Sleep Duration  Quality of Sleep  Occupation
0   27       1             6             6.1                 6           9
1   28       1             8             6.2                 6           1
2   28       1             8             6.2                 6           1
3   28       1             8             5.9                 4           6
4   28       1             8             5.9                 4           6


***Normalisasi Fitur Numerik (Standardization).***

In [156]:
# Inisialisasi StandardScaler
scaler = StandardScaler()

# Normalisasi fitur numerik
X[['Age', 'Stress Level', 'Sleep Duration', 'Quality of Sleep']] = scaler.fit_transform(X[['Age', 'Stress Level', 'Sleep Duration', 'Quality of Sleep']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[['Age', 'Stress Level', 'Sleep Duration', 'Quality of Sleep']] = scaler.fit_transform(X[['Age', 'Stress Level', 'Sleep Duration', 'Quality of Sleep']])


In [157]:
print("\nFitur Setelah Normalisasi:")
print(X.head())


Fitur Setelah Normalisasi:
        Age  Gender  Stress Level  Sleep Duration  Quality of Sleep  \
0 -1.753096       1      0.347021       -1.298887         -1.098280   
1 -1.637643       1      1.475592       -1.173036         -1.098280   
2 -1.637643       1      1.475592       -1.173036         -1.098280   
3 -1.637643       1      1.475592       -1.550588         -2.771424   
4 -1.637643       1      1.475592       -1.550588         -2.771424   

   Occupation  
0           9  
1           1  
2           1  
3           6  
4           6  


***Penyeimbangan Kelas dengan SMOTE.***

In [158]:
# Mengecek distribusi kelas target
print("Distribusi Kelas Target:")
print(data['Insomnia'].value_counts())

Distribusi Kelas Target:
Insomnia
0    254
1    120
Name: count, dtype: int64


In [159]:
if data['Insomnia'].nunique() > 1:
    smote = SMOTE(sampling_strategy='minority', random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    print("Distribusi Kelas Target Setelah SMOTE:")
    print(f"Jumlah data pada kelas 0 (Tidak Insomnia): {sum(y_resampled == 0)}")
    print(f"Jumlah data pada kelas 1 (Insomnia): {sum(y_resampled == 1)}")
else:
    print("Data hanya memiliki satu kelas, tidak dapat melakukan SMOTE.")

Distribusi Kelas Target Setelah SMOTE:
Jumlah data pada kelas 0 (Tidak Insomnia): 254
Jumlah data pada kelas 1 (Insomnia): 254


***Verifikasi Hasil Preprocessing.***

In [163]:
# 1. Cek Nilai Kosong (Missing Values)
print("Jumlah Nilai Kosong per Kolom Setelah Preprocessing:")
print(data[selected_features].isnull().sum())

# 2. Cek Duplikat (Duplicate Rows)
print("\nJumlah Baris Duplikat Setelah Preprocessing:")
print(data.duplicated().sum())

# 3. Distribusi Kelas Target Setelah SMOTE
print("\nDistribusi Kelas Target Setelah SMOTE:")
print(y_resampled.value_counts())

# 4. Cek Kembali Fitur dan Target
print("\nBentuk Fitur dan Target:")
print("Fitur (X) Shape:", X_resampled.shape)
print("Target (y) Shape:", y_resampled.shape)

Jumlah Nilai Kosong per Kolom Setelah Preprocessing:
Age                 0
Gender              0
Stress Level        0
Sleep Duration      0
Quality of Sleep    0
Occupation          0
dtype: int64

Jumlah Baris Duplikat Setelah Preprocessing:
0

Distribusi Kelas Target Setelah SMOTE:
Insomnia
0    254
1    254
Name: count, dtype: int64

Bentuk Fitur dan Target:
Fitur (X) Shape: (508, 6)
Target (y) Shape: (508,)


#**Building the Model**

#**Training the Model**

#**Evaluate Model**

#**Model Deployment**