In [3]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# ========== FUNCTIONS (Requirement 1) ==========

def categorize_sleep_quality(quality):
    """Function to categorize sleep quality using conditional statements"""
    if quality >= 8:
        return 'Excellent'
    elif quality >= 6:
        return 'Good'
    elif quality >= 4:
        return 'Fair'
    else:
        return 'Poor'

def calculate_health_score(row):
    """Function to calculate overall health score"""
    score = 0
    score += row['Quality of Sleep'] * 10
    score += (row['Physical Activity Level'] / 10) * 5
    score -= row['Stress Level'] * 5
    score += (10 - abs(7.5 - row['Sleep Duration'])) * 3
    return score

def split_blood_pressure(bp_string):
    """Function to split blood pressure into systolic and diastolic"""
    try:
        systolic, diastolic = bp_string.split('/')
        return int(systolic), int(diastolic)
    except:
        return None, None

def analyze_occupation_health(df, occupation):
    """Function to analyze health metrics by occupation"""
    occ_data = df[df['Occupation'] == occupation]
    return {
        'count': len(occ_data),
        'avg_sleep_duration': occ_data['Sleep Duration'].mean(),
        'avg_stress': occ_data['Stress Level'].mean(),
        'avg_quality': occ_data['Quality of Sleep'].mean()
    }



In [4]:
# ========== DATA LOADING ==========

df = pd.read_csv('Sleep_health_and_lifestyle_dataset.csv')
print("Dataset loaded successfully!")
print(f"Total records: {len(df)}")
print(f"\nColumns: {list(df.columns)}")
print("\n" + "="*80)




Dataset loaded successfully!
Total records: 374

Columns: ['Person ID', 'Gender', 'Age', 'Occupation', 'Sleep Duration', 'Quality of Sleep', 'Physical Activity Level', 'Stress Level', 'BMI Category', 'Blood Pressure', 'Heart Rate', 'Daily Steps', 'Sleep Disorder']



In [5]:
# ========== CONDITIONAL STATEMENTS (Requirement 2) ==========

print("\n1. CONDITIONAL STATEMENTS ANALYSIS")
print("="*80)

# Apply sleep quality categorization using conditional logic
df['Sleep_Category'] = df['Quality of Sleep'].apply(categorize_sleep_quality)

# Count each category using conditional logic
excellent_count = len(df[df['Sleep_Category'] == 'Excellent'])
good_count = len(df[df['Sleep_Category'] == 'Good'])
fair_count = len(df[df['Sleep_Category'] == 'Fair'])
poor_count = len(df[df['Sleep_Category'] == 'Poor'])

print(f"\nSleep Quality Distribution:")
print(f"Excellent (≥8): {excellent_count} people")
print(f"Good (6-7): {good_count} people")
print(f"Fair (4-5): {fair_count} people")
print(f"Poor (<4): {poor_count} people")

# Conditional analysis for risk assessment
df['Risk_Level'] = 'Low'
for idx, row in df.iterrows():
    if row['Sleep Duration'] < 6 and row['Stress Level'] >= 7:
        df.at[idx, 'Risk_Level'] = 'High'
    elif row['Sleep Duration'] < 7 or row['Stress Level'] >= 6:
        df.at[idx, 'Risk_Level'] = 'Medium'

print(f"\nRisk Level Distribution:")
print(df['Risk_Level'].value_counts())


1. CONDITIONAL STATEMENTS ANALYSIS

Sleep Quality Distribution:
Excellent (≥8): 180 people
Good (6-7): 182 people
Fair (4-5): 12 people
Poor (<4): 0 people

Risk Level Distribution:
Risk_Level
Medium    188
Low       180
High        6
Name: count, dtype: int64


In [6]:

# ========== LOOPING STATEMENTS (Requirement 3) ==========

print("\n2. LOOPING STATEMENTS ANALYSIS")
print("="*80)

# Loop through unique occupations
print("\nHealth Metrics by Occupation:")
occupations = df['Occupation'].unique()
occupation_stats = []

for occupation in occupations:
    stats = analyze_occupation_health(df, occupation)
    occupation_stats.append({
        'Occupation': occupation,
        'Count': stats['count'],
        'Avg Sleep Duration': round(stats['avg_sleep_duration'], 2),
        'Avg Stress Level': round(stats['avg_stress'], 2),
        'Avg Sleep Quality': round(stats['avg_quality'], 2)
    })

occupation_df = pd.DataFrame(occupation_stats)
print(occupation_df.to_string(index=False))

# Loop to calculate cumulative statistics
print("\n\nAge Group Analysis (Using Loops):")
age_groups = [(27, 35), (36, 45), (46, 59)]
for age_min, age_max in age_groups:
    group = df[(df['Age'] >= age_min) & (df['Age'] <= age_max)]
    print(f"\nAge {age_min}-{age_max}: {len(group)} people")
    print(f"  - Avg Sleep Duration: {group['Sleep Duration'].mean():.2f} hours")
    print(f"  - Sleep Disorders: {group[group['Sleep Disorder'] != 'None'].shape[0]} cases")




2. LOOPING STATEMENTS ANALYSIS

Health Metrics by Occupation:
          Occupation  Count  Avg Sleep Duration  Avg Stress Level  Avg Sleep Quality
   Software Engineer      4                6.75              6.00               6.50
              Doctor     71                6.97              6.73               6.65
Sales Representative      2                5.90              8.00               4.00
             Teacher     40                6.69              4.53               6.98
               Nurse     73                7.06              5.55               7.37
            Engineer     63                7.99              3.89               8.41
          Accountant     37                7.11              4.59               7.89
           Scientist      4                6.00              7.00               5.00
              Lawyer     47                7.41              5.06               7.89
         Salesperson     32                6.40              7.00               6.00
  

In [7]:
# ========== NUMPY OPERATIONS (Requirement 4) ==========

print("\n3. NUMPY OPERATIONS")
print("="*80)

# Convert columns to NumPy arrays for operations
sleep_duration = df['Sleep Duration'].values
quality_of_sleep = df['Quality of Sleep'].values
physical_activity = df['Physical Activity Level'].values
stress_level = df['Stress Level'].values
heart_rate = df['Heart Rate'].values
daily_steps = df['Daily Steps'].values

# NumPy statistical operations
print("\nNumPy Statistical Analysis:")
print(f"Sleep Duration - Mean: {np.mean(sleep_duration):.2f}, Std: {np.std(sleep_duration):.2f}")
print(f"Sleep Quality - Mean: {np.mean(quality_of_sleep):.2f}, Std: {np.std(quality_of_sleep):.2f}")
print(f"Physical Activity - Mean: {np.mean(physical_activity):.2f}, Std: {np.std(physical_activity):.2f}")
print(f"Stress Level - Mean: {np.mean(stress_level):.2f}, Std: {np.std(stress_level):.2f}")

# NumPy array operations
normalized_sleep = (sleep_duration - np.mean(sleep_duration)) / np.std(sleep_duration)
normalized_quality = (quality_of_sleep - np.mean(quality_of_sleep)) / np.std(quality_of_sleep)

# Correlation using NumPy
correlation_matrix = np.corrcoef([sleep_duration, quality_of_sleep, physical_activity, stress_level])
print("\nCorrelation Matrix (NumPy):")
print("Variables: [Sleep Duration, Sleep Quality, Physical Activity, Stress Level]")
print(correlation_matrix)

# NumPy percentile calculations
print(f"\nPercentile Analysis (NumPy):")
print(f"Sleep Duration - 25th: {np.percentile(sleep_duration, 25):.2f}, "
      f"50th: {np.percentile(sleep_duration, 50):.2f}, "
      f"75th: {np.percentile(sleep_duration, 75):.2f}")

# NumPy filtering
high_stress = np.where(stress_level >= 7)[0]
print(f"\nHigh Stress Cases (Stress ≥ 7): {len(high_stress)} people")
print(f"Their Average Sleep Duration: {np.mean(sleep_duration[high_stress]):.2f} hours")



3. NUMPY OPERATIONS

NumPy Statistical Analysis:
Sleep Duration - Mean: 7.13, Std: 0.79
Sleep Quality - Mean: 7.31, Std: 1.20
Physical Activity - Mean: 59.17, Std: 20.80
Stress Level - Mean: 5.39, Std: 1.77

Correlation Matrix (NumPy):
Variables: [Sleep Duration, Sleep Quality, Physical Activity, Stress Level]
[[ 1.          0.883213    0.21236031 -0.81102303]
 [ 0.883213    1.          0.19289645 -0.89875203]
 [ 0.21236031  0.19289645  1.         -0.03413446]
 [-0.81102303 -0.89875203 -0.03413446  1.        ]]

Percentile Analysis (NumPy):
Sleep Duration - 25th: 6.40, 50th: 7.20, 75th: 7.80

High Stress Cases (Stress ≥ 7): 120 people
Their Average Sleep Duration: 6.22 hours


In [8]:

# ========== PANDAS ANALYSIS (Requirement 5) ==========

print("\n4. PANDAS DATA ANALYSIS")
print("="*80)

# Split blood pressure column
df[['Systolic_BP', 'Diastolic_BP']] = df['Blood Pressure'].apply(
    lambda x: pd.Series(split_blood_pressure(x))
)

# Descriptive statistics using Pandas
print("\nDescriptive Statistics (Pandas):")
print(df[['Sleep Duration', 'Quality of Sleep', 'Physical Activity Level', 
          'Stress Level', 'Heart Rate', 'Daily Steps']].describe())

# Group by analysis
print("\n\nSleep Disorder Analysis (Pandas GroupBy):")
disorder_analysis = df.groupby('Sleep Disorder').agg({
    'Person ID': 'count',
    'Sleep Duration': 'mean',
    'Quality of Sleep': 'mean',
    'Stress Level': 'mean',
    'Physical Activity Level': 'mean',
    'Heart Rate': 'mean'
}).round(2)
disorder_analysis.columns = ['Count', 'Avg Sleep Duration', 'Avg Quality', 
                              'Avg Stress', 'Avg Physical Activity', 'Avg Heart Rate']
print(disorder_analysis)

# Gender-based analysis
print("\n\nGender-Based Analysis (Pandas):")
gender_analysis = df.groupby('Gender').agg({
    'Sleep Duration': ['mean', 'std'],
    'Quality of Sleep': ['mean', 'std'],
    'Stress Level': ['mean', 'std'],
    'Daily Steps': ['mean', 'std']
}).round(2)
print(gender_analysis)

# BMI Category analysis
print("\n\nBMI Category Impact (Pandas):")
bmi_analysis = df.groupby('BMI Category').agg({
    'Sleep Duration': 'mean',
    'Quality of Sleep': 'mean',
    'Physical Activity Level': 'mean',
    'Stress Level': 'mean',
    'Person ID': 'count'
}).round(2)
bmi_analysis.columns = ['Avg Sleep Duration', 'Avg Quality', 'Avg Physical Activity', 
                        'Avg Stress', 'Count']
print(bmi_analysis)

# Correlation analysis using Pandas
print("\n\nCorrelation Analysis (Pandas):")
correlation_df = df[['Sleep Duration', 'Quality of Sleep', 'Physical Activity Level',
                     'Stress Level', 'Heart Rate', 'Daily Steps', 'Systolic_BP', 
                     'Diastolic_BP']].corr()
print(correlation_df)

# Cross-tabulation analysis
print("\n\nCross-Tabulation: Gender vs Sleep Disorder (Pandas):")
crosstab = pd.crosstab(df['Gender'], df['Sleep Disorder'], margins=True)
print(crosstab)

# Calculate health scores using function
df['Health_Score'] = df.apply(calculate_health_score, axis=1)

print("\n\nTop 10 Healthiest Individuals (Based on Health Score):")
top_healthy = df.nlargest(10, 'Health_Score')[['Person ID', 'Age', 'Occupation', 
                                                 'Sleep Duration', 'Quality of Sleep',
                                                 'Stress Level', 'Health_Score']]
print(top_healthy.to_string(index=False))



4. PANDAS DATA ANALYSIS

Descriptive Statistics (Pandas):
       Sleep Duration  Quality of Sleep  Physical Activity Level  \
count      374.000000        374.000000               374.000000   
mean         7.132086          7.312834                59.171123   
std          0.795657          1.196956                20.830804   
min          5.800000          4.000000                30.000000   
25%          6.400000          6.000000                45.000000   
50%          7.200000          7.000000                60.000000   
75%          7.800000          8.000000                75.000000   
max          8.500000          9.000000                90.000000   

       Stress Level  Heart Rate   Daily Steps  
count    374.000000  374.000000    374.000000  
mean       5.385027   70.165775   6816.844920  
std        1.774526    4.135676   1617.915679  
min        3.000000   65.000000   3000.000000  
25%        4.000000   68.000000   5600.000000  
50%        5.000000   70.000000   7000.0

In [9]:

# ========== KEY FINDINGS SUMMARY ==========

print("\n5. KEY FINDINGS FOR RESEARCH PAPER")
print("="*80)

# Finding 1: Sleep disorder prevalence
total_with_disorder = df[df['Sleep Disorder'] != 'None'].shape[0]
disorder_percentage = (total_with_disorder / len(df)) * 100

print(f"\n1. Sleep Disorder Prevalence:")
print(f"   - {total_with_disorder} out of {len(df)} individuals ({disorder_percentage:.1f}%) have sleep disorders")
print(f"   - Breakdown: {df['Sleep Disorder'].value_counts().to_dict()}")

# Finding 2: Stress and sleep relationship
high_stress_group = df[df['Stress Level'] >= 7]
low_stress_group = df[df['Stress Level'] <= 4]
print(f"\n2. Stress Impact on Sleep:")
print(f"   - High stress (≥7): Avg sleep quality = {high_stress_group['Quality of Sleep'].mean():.2f}")
print(f"   - Low stress (≤4): Avg sleep quality = {low_stress_group['Quality of Sleep'].mean():.2f}")
print(f"   - Difference: {low_stress_group['Quality of Sleep'].mean() - high_stress_group['Quality of Sleep'].mean():.2f} points")

# Finding 3: Physical activity correlation
high_activity = df[df['Physical Activity Level'] >= 60]
low_activity = df[df['Physical Activity Level'] <= 40]
print(f"\n3. Physical Activity Benefits:")
print(f"   - High activity (≥60): Avg sleep duration = {high_activity['Sleep Duration'].mean():.2f} hours")
print(f"   - Low activity (≤40): Avg sleep duration = {low_activity['Sleep Duration'].mean():.2f} hours")

# Finding 4: Occupation-specific insights
high_risk_occupations = occupation_df.nlargest(3, 'Avg Stress Level')
print(f"\n4. High-Stress Occupations:")
print(high_risk_occupations[['Occupation', 'Avg Stress Level', 'Avg Sleep Quality']].to_string(index=False))

# Finding 5: BMI impact
print(f"\n5. BMI Category Impact on Sleep Disorders:")
bmi_disorder = pd.crosstab(df['BMI Category'], df['Sleep Disorder'] != 'None', normalize='index') * 100
print(f"   Percentage with sleep disorders by BMI category:")
for bmi_cat in bmi_disorder.index:
    print(f"   - {bmi_cat}: {bmi_disorder.loc[bmi_cat, True]:.1f}%")

# Save processed data
df.to_csv('sleep_health_analysis_results.csv', index=False)
print("\n\nAnalysis complete! Results saved to 'sleep_health_analysis_results.csv'")


5. KEY FINDINGS FOR RESEARCH PAPER

1. Sleep Disorder Prevalence:
   - 374 out of 374 individuals (100.0%) have sleep disorders
   - Breakdown: {'Sleep Apnea': 78, 'Insomnia': 77}

2. Stress Impact on Sleep:
   - High stress (≥7): Avg sleep quality = 5.92
   - Low stress (≤4): Avg sleep quality = 8.33
   - Difference: 2.41 points

3. Physical Activity Benefits:
   - High activity (≥60): Avg sleep duration = 7.37 hours
   - Low activity (≤40): Avg sleep duration = 7.07 hours

4. High-Stress Occupations:
          Occupation  Avg Stress Level  Avg Sleep Quality
Sales Representative               8.0                4.0
           Scientist               7.0                5.0
         Salesperson               7.0                6.0

5. BMI Category Impact on Sleep Disorders:
   Percentage with sleep disorders by BMI category:
   - Normal: 100.0%
   - Normal Weight: 100.0%
   - Obese: 100.0%
   - Overweight: 100.0%


Analysis complete! Results saved to 'sleep_health_analysis_results.csv'