In [None]:
#importing libraries
import seaborn as sns
import plotly.express as px
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
from scipy.stats import t
from scipy.stats import f_oneway
from scipy.stats import chi2_contingency
from scipy.stats import gaussian_kde
import plotly.graph_objects as go
import urllib.request


In [None]:

# Download the dataset from GitHub.
# urllib.request (already imported in the first cell) is used instead of the
# `!curl` shell escape: it needs no external binary, it is plain Python (so the
# cell also runs outside IPython), and it raises on an HTTP error status
# instead of silently saving the server's error page as the "CSV".
DATA_URL = 'https://raw.githubusercontent.com/amnafatimaa6-ops/data-visualization/main/human_behavior_dataset.csv'
DATA_PATH = 'human_behavior_dataset.csv'
urllib.request.urlretrieve(DATA_URL, DATA_PATH)

# Load it into a DataFrame and preview the first rows
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Quick inspection of the raw data before cleaning.
# Only the LAST bare expression of a cell is rendered by Jupyter, and
# `df.describe` without parentheses is merely a reference to the method, so the
# summary statistics and the column index are printed explicitly here.
print(df.describe())
print(df.columns)
print(df.dtypes)
print('number of duplicates',df.duplicated().sum())
print('number of null values', df.isnull().sum())
print(df['Primary_Activity'].unique())
print(df['Caffeine_Intake_Cups'].unique())


In [None]:
# Cleaning data
# 1) Drop exact duplicate rows.
df = df.drop_duplicates()
print("Duplicate rows after:", df.duplicated().sum())

# 2) Fill missing numeric values with the column mean.
# The result is assigned back to the column rather than calling
# `df[col].fillna(..., inplace=True)`: that form mutates a temporary Series
# (chained assignment), which emits a FutureWarning in pandas 2.x and leaves
# `df` unchanged once Copy-on-Write is active.
numeric_cols = ['Stress_Level',"Sleep_Hours", "Screen_Time_Hours", "Caffeine_Intake_Cups", "Mood_Score", "Daily_Steps"]
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].mean())

# 3) Normalise the activity labels to Title Case BEFORE computing the mode, so
# that case variants of the same label are counted together when choosing the
# fill value (missing entries stay missing through `.str.title()`).
df['Primary_Activity'] = df['Primary_Activity'].str.title()
categorical_cols = ["Primary_Activity"]
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Sanity checks: remaining nulls per column, remaining duplicates, final labels
print(df.isna().sum())
print(df.duplicated().sum())
print(df['Primary_Activity'].unique())

In [None]:
#HYPOTHESIS
# Z-TEST
'''
A Z-test is a statistical test used to determine if the mean of a sample is significantly different from a known population mean.
It’s used when the population standard deviation is known.
The sample size is relatively large (n > 30 is a common rule)
'''
# Claim: the average person takes more than 7000 steps per day.
#   H0: mean <= 7000      H1: mean > 7000   (one-tailed, upper tail)
sample_col = 'Daily_Steps'
sample_mean = df[sample_col].mean()
n = df[sample_col].shape[0]
# Population stats (known or assumed)
population_mean = 7000
population_std = 2000  # assumed known
alpha = 0.05  # significance level

# Z = (sample mean - hypothesised mean) / (sigma / sqrt(n))
z_stat = (sample_mean - population_mean) / (population_std / np.sqrt(n))
# Upper-tail p-value. norm.sf is the survival function (1 - cdf); it keeps
# precision for large Z where `1 - norm.cdf(z)` would round to exactly 0.
p_value = norm.sf(z_stat)

print("One-Tailed Z-Test (H1: mean > 7000)")
print("Z-value:", z_stat)
print("P-value:", p_value)

if p_value < alpha:
    print("Reject H₀: Average daily steps is significantly greater than 7000")
else:
    print("Fail to reject H₀: Average daily steps is NOT significantly greater than 7000")

# Bar plot: how many people are at/above vs. below 7000 steps
# Create the step category
df['Step_Category'] = np.where(df['Daily_Steps'] >= 7000, '>= 7000', '< 7000')
# Fix the bar order and tie each colour to its category. value_counts() sorts
# by frequency, so a positional colour list would paint whichever category is
# most common green, regardless of which side of 7000 it is.
step_colors = {'>= 7000': 'green', '< 7000': 'red'}
step_counts = df['Step_Category'].value_counts().reindex(list(step_colors), fill_value=0)
plt.figure(figsize=(6,4))
plt.bar(step_counts.index, step_counts.values, color=list(step_colors.values()))
plt.title('People Taking More or Less Than 7000 Steps')
plt.xlabel('Daily Steps')
plt.ylabel('Number of People')
plt.show()

In [None]:
# Z-test
# Claim: the average human stress level is 6.
#   H0: mean == 6      H1: mean != 6   (two-tailed)
col = 'Stress_Level'
sample_mean = df[col].mean()
n = df[col].shape[0]
# Assumed (known) population parameters
population_std = 2
population_mean = 6

# The statistic is stored in `z_stat`, the same name that is printed and used
# for the p-value below. Previously it was assigned to `z_stats`, so the cell
# silently reported the Z-value left over from the Daily_Steps test.
# A Z-test uses the known/assumed population sigma, so the standard error is
# built from `population_std` (which was declared here but never used).
z_stat = (sample_mean - population_mean) / (population_std / np.sqrt(n))
print("Z-value:", z_stat)
# Two-tailed p-value; norm.sf(x) == 1 - norm.cdf(x) without the precision loss
p_value = 2 * norm.sf(abs(z_stat))
print("P-value:", p_value)

alpha = 0.05
if p_value < alpha:
    print("Reject H₀: Stress level is significantly different from 6")
else:
    print("Fail to reject H₀: Stress level is NOT significantly different from 6")



# Count how many people are at each stress level
stress_counts = df['Stress_Level'].value_counts().sort_index()

# Gaussian kernel density estimate evaluated on 100 evenly spaced points
# spanning the observed range
stress_values = df['Stress_Level']
kde = gaussian_kde(stress_values)
x_vals = np.linspace(stress_values.min(), stress_values.max(), 100)
y_vals = kde(x_vals)

# Bars of counts per distinct stress level
fig = px.bar(x=stress_counts.index, y=stress_counts.values,
             labels={'x':'Stress Level', 'y':'Count'},
             title='Stress Level Distribution with Density Curve',
             color_discrete_sequence=['skyblue'])

# Overlay the density curve, multiplied by the number of rows to bring it onto
# the count axis.
# NOTE(review): this scaling matches the bars only if stress levels are spaced
# one unit apart - confirm against the data.
fig.add_scatter(x=x_vals, y=y_vals*len(df), mode='lines', line=dict(color='red', width=2), name='Density Curve')

fig.show()



In [None]:
#T-TEST
'''
Used when population standard deviation is unknown or sample size is small (n < 30).
Tests if the sample mean is significantly different from a known value (one-sample) or if two sample means differ (two-sample).
'''
# Question: do students sleep 7 hours per day on average? (one-sample, two-tailed)
sample_col = 'Sleep_Hours'
population_mean = 7   # hypothesised mean under H0
alpha = 0.05          # significance level

# Sample statistics of the tested column
sleep_hours = df[sample_col]
sam_mean = sleep_hours.mean()
sample_std = sleep_hours.std()
n = sleep_hours.shape[0]

# t = (sample mean - hypothesised mean) / standard error of the mean
standard_error = sample_std / (n ** 0.5)
t_stat = (sam_mean - population_mean) / standard_error
print("T-value:", t_stat)

# Two-tailed p-value from Student's t distribution with n-1 degrees of freedom
cdf_at_abs_t = t.cdf(abs(t_stat), df=n-1)
p_value = 2 * (1 - cdf_at_abs_t)
print("p-value:", p_value)

# Decision at the chosen significance level
verdict = (
    "Reject H₀: Mean sleep hours is significantly different from 7"
    if p_value < alpha
    else "Fail to reject H₀: Mean sleep hours is not significantly different from 7"
)
print(verdict)


# Violin plot of sleep hours per activity, split by whether the person sleeps
# at least 7 hours
# Derived column: which side of the 7-hour mark each row falls on
df['Sleep_Category'] = np.where(df['Sleep_Hours'] >= 7, '>= 7 hours', '< 7 hours')
sleep_palette = {'< 7 hours':'red', '>= 7 hours':'green'}
fig = px.violin(
    df,
    x='Primary_Activity',
    y='Sleep_Hours',
    color='Sleep_Category',
    box=True,
    points='all',
    hover_data=['Sleep_Hours'],
    title='Sleep Hours Distribution Across Activities: Above or Below 7 Hours',
    color_discrete_map=sleep_palette,
)

# Dashed horizontal reference line at y = 7 spanning all activity categories
activity_count = len(df['Primary_Activity'].unique())
reference_line = dict(
    type='line',
    x0=-0.5, x1=activity_count-0.5,
    y0=7, y1=7,
    line=dict(color='blue', dash='dash'),
    xref='x', yref='y',
)
fig.add_shape(**reference_line)

fig.show()

In [None]:
#ANOVA-TEST (one way)
'''
ANOVA is a test that compares the means of three or more groups to check if they are different from each other.
'''
# Question: does the type of primary activity affect how much people sleep?
# Every activity present in the data forms its own group. Previously only
# 'Gaming', 'Workout' and 'Reading' were hard-coded, so the remaining
# activities were left out of the test, and a label missing from the data
# would have produced an empty group.
sleep_by_activity = {
    activity: hours
    for activity, hours in df.groupby('Primary_Activity')['Sleep_Hours']
}
print("Groups compared:", list(sleep_by_activity))

f_stat, p_value = f_oneway(*sleep_by_activity.values())
print("F-statistic:", f_stat)
print("P-value:", p_value)
alpha = 0.05

if p_value < alpha:
    print("Reject H₀: At least one activity group has different mean sleep hours.")
else:
    print("Fail to reject H₀: No significant difference in sleep hours across activities.")


# Box plot of the sleep-hours distribution for each primary activity
fig = px.box(df, x='Primary_Activity', y='Sleep_Hours',
             color='Primary_Activity',
             title='Sleep Hours Across Different Activities',
             points="all")  # Shows all data points
fig.show()
# Min-max sleep hours per activity, as noted by the original author
# (NOTE(review): values are not computed here - re-check if the data changes):
#   gaming   --> 3-9
#   workout  --> 4-9
#   reading  --> 2-11
#   sleeping --> 1-9.7
#   travel   --> 3-9
#   studying --> 2-9.3

In [None]:
# Show the DataFrame in its current state; as the last bare expression of the
# cell it is rendered by Jupyter.
df

In [None]:
#CHI SQUARE TEST
'''
Chi-Square Test is a statistical test used to examine whether there is a significant association between categorical variables.
A contingency table is basically a way to organize and summarize categorical data to see how two (or more) variables relate to each other.
Chi-Square Test of Independence: Checks if two categorical variables are related or independent.
Chi-Square Goodness-of-Fit Test: Checks if the observed frequency of a single categorical variable matches an expected distribution.
'''
# Claim under test (H0): sleep category and step category are independent.
alpha = 0.05  # significance level

# Cross-tabulate the two derived categorical columns (created in earlier cells)
contingency_table = pd.crosstab(df['Sleep_Category'], df['Step_Category'])
print(contingency_table)

# Chi-square test of independence on the observed counts
chi2, p, dof, expected = chi2_contingency(contingency_table)

for label, value in (
    ("Chi2 statistic", chi2),
    ("P-value", p),
    ("Degrees of freedom", dof),
):
    print(f"{label}: {value}")
print("Expected frequencies:")
print(expected)

# Decision at the chosen significance level
if not p < alpha:
    print("Fail to reject H₀: Sleep category and Step category are independent")
else:
    print("Reject H₀: Sleep category and Step category are dependent")

# Heatmap of the observed counts, with the count written inside each cell
heatmap_labels = dict(x="Step Category", y="Sleep Category", color="Count")
fig = px.imshow(
    contingency_table.values,
    x=contingency_table.columns,
    y=contingency_table.index,
    text_auto=True,
    color_continuous_scale='Blues',
    labels=heatmap_labels,
    title="Heatmap of Sleep Category vs Step Category",
)
fig.show()

In [None]:
# Parse the Date column into datetime values
df['Date'] = pd.to_datetime(df['Date'])

# Reshape wide -> long: one row per (Date, Metric) pair, so that each metric
# can be drawn as its own line
tracked_metrics = ['Sleep_Hours', 'Screen_Time_Hours', 'Daily_Steps']
df_melt = pd.melt(
    df,
    id_vars='Date',
    value_vars=tracked_metrics,
    var_name='Metric',
    value_name='Value',
)

# One line per metric, with a marker on every data point
fig = px.line(
    df_melt,
    x='Date',
    y='Value',
    color='Metric',
    title='Sleep, Screen Time, and Daily Steps Over Time',
    labels={'Date':'Date', 'Value':'Value', 'Metric':'Metric'},
    markers=True,
)

fig.show()

In [None]:
# Five-number summary of each numeric column, shown as one box plot per column
numeric_columns = [
    'Stress_Level',
    'Sleep_Hours',
    'Screen_Time_Hours',
    'Caffeine_Intake_Cups',
    'Mood_Score',
    'Daily_Steps',
]

# Draw and show the figures one at a time; every raw point is plotted
# alongside its box
for col in numeric_columns:
    box_title = f'Box Plot of {col} (5-number summary)'
    fig = px.box(
        df,
        y=col,
        points="all",
        title=box_title,
        labels={col: col},
    )
    fig.show()