In [27]:
#data cleanup
import pandas as pd
df_raw = pd.read_csv("Data.csv")

#get basic info. tells us there are no null values
df_raw.info()

#take a sample so that we can see what columns are categories
#(a notebook only echoes the LAST expression of a cell, so the sample has to be
# displayed explicitly or it is silently thrown away)
display(df_raw.sample(n=5))

#change dtype from object to category, reusing the frame that is already loaded
#instead of parsing the CSV a second time. astype returns a new frame, so
#df_raw keeps its original dtypes.
#NOTE(review): education_level is also an object column in the info() output
#but is not converted here - confirm that this is intentional
digital_habits = df_raw.astype({'gender':'category', 'region':'category', 'income_level':'category', 'daily_role':'category'})

#double check that there truly are no missing values
print(digital_habits.isna().sum())
#yep, no missing values. moving to a new cell.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1607 entries, 0 to 1606
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        1607 non-null   int64  
 1   age                       1607 non-null   int64  
 2   gender                    1607 non-null   object 
 3   region                    1607 non-null   object 
 4   income_level              1607 non-null   object 
 5   education_level           1607 non-null   object 
 6   daily_role                1607 non-null   object 
 7   device_hours_per_day      1607 non-null   float64
 8   phone_unlocks             1607 non-null   int64  
 9   notifications_per_day     1607 non-null   int64  
 10  social_media_mins         1607 non-null   int64  
 11  study_mins                1607 non-null   int64  
 12  physical_activity_days    1607 non-null   float64
 13  sleep_hours               1607 non-null   float64
 14  sleep_qu

In [31]:
# Outlier detection and removal
# Libraries used by this cell, followed by a fresh load of the data file
import pandas as pd
import numpy as np
df = pd.read_csv('Data.csv')

# Outliers are only defined for numeric data, so collect the names of the
# columns whose dtype is numeric
numeric_only = df.select_dtypes(include=[np.number])
numeric_cols = numeric_only.columns.tolist()
print("\nNumeric columns found:", numeric_cols)

# create function to find and remove outliers
def remove_outliers_iqr(dataframe, columns, threshold=1.5):
    """Return a copy of ``dataframe`` without rows flagged as IQR outliers.

    A row is dropped when, for any column in ``columns``, its value lies
    outside ``[Q1 - threshold * IQR, Q3 + threshold * IQR]`` (bounds are
    inclusive). Quartiles are computed once per column on the unfiltered
    input, so the result does not depend on the order of ``columns`` and
    removing rows for one column never tightens the bounds of another.

    Columns whose IQR is 0 are skipped. With no spread between the
    quartiles both bounds collapse onto a single value, which would delete
    every row that differs from it (for a binary 0/1 flag, the entire
    minority class).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input data. It is not modified.
    columns : iterable of str
        Names of the numeric columns to check.
    threshold : float, default 1.5
        Multiplier applied to the IQR when building the bounds.

    Returns
    -------
    pandas.DataFrame
        New frame holding the rows that passed every check, with the
        original index labels preserved.

    Notes
    -----
    A missing value in a checked column compares as "not within bounds",
    so such rows are dropped as well.
    """
    # positional boolean mask; avoids index alignment on every update
    keep = np.ones(len(dataframe), dtype=bool)

    for col in columns:  # first find interquartile range for each column
        values = dataframe[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1

        # no spread between the quartiles: the rule is degenerate, skip it
        if iqr == 0:
            continue

        # create upper and lower limits
        lower_bound = q1 - threshold * iqr
        upper_bound = q3 + threshold * iqr

        # get rid of outliers (between() is inclusive on both ends)
        keep &= values.between(lower_bound, upper_bound).to_numpy()

    return dataframe.loc[keep].copy()

# remove outliers from data
df_cleaned = remove_outliers_iqr(df, numeric_cols, threshold=1.5)

print(f"Rows removed: {len(df) - len(df_cleaned)}")
print(f"Original shape: {df.shape}")
print(f"Cleaned shape: {df_cleaned.shape}")

# save data cleanup changes to a separate file. Writing back to 'Data.csv'
# would destroy the raw data, and every re-run of this cell would then start
# from rows that had already been filtered.
df_cleaned.to_csv('cleaned_data.csv', index=False)
print("\nCleaned data saved to 'cleaned_data.csv'")


Numeric columns found: ['id', 'age', 'device_hours_per_day', 'phone_unlocks', 'notifications_per_day', 'social_media_mins', 'study_mins', 'physical_activity_days', 'sleep_hours', 'sleep_quality', 'anxiety_score', 'depression_score', 'stress_level', 'happiness_score', 'focus_score', 'high_risk_flag', 'productivity_score', 'digital_dependence_score']
Rows removed: 0
Original shape: (1607, 24)
Cleaned shape: (1607, 24)

Cleaned data saved to 'cleaned_data.csv'


In [29]:
# Report the smallest and largest value of every numeric column in the cleaned data
print("Minimum & maximum values for each column:")
print("="*50)

# keep the column order of df_cleaned, restricted to the numeric columns
numeric_in_cleaned = [name for name in df_cleaned.columns if name in numeric_cols]

for name in numeric_in_cleaned:
    series = df_cleaned[name]
    lowest, highest = series.min(), series.max()
    print(f"\n{name}:")
    print(f"  Minimum: {lowest}")
    print(f"  Maximum: {highest}")

Minimum & maximum values for each column:

id:
  Minimum: 1
  Maximum: 3500

age:
  Minimum: 13
  Maximum: 50

device_hours_per_day:
  Minimum: 0.48
  Maximum: 12.44

phone_unlocks:
  Minimum: 9
  Maximum: 260

notifications_per_day:
  Minimum: 24
  Maximum: 692

social_media_mins:
  Minimum: 0
  Maximum: 323

study_mins:
  Minimum: 0
  Maximum: 340

physical_activity_days:
  Minimum: 0.0
  Maximum: 7.0

sleep_hours:
  Minimum: 4.563497685964527
  Maximum: 10.615569697445762

sleep_quality:
  Minimum: 1.0
  Maximum: 5.0

anxiety_score:
  Minimum: 0.0
  Maximum: 11.99985911213248

depression_score:
  Minimum: 0.0
  Maximum: 19.0

stress_level:
  Minimum: 1.0
  Maximum: 10.0

happiness_score:
  Minimum: 4.8
  Maximum: 10.0

focus_score:
  Minimum: 0.0
  Maximum: 100.0

high_risk_flag:
  Minimum: 0
  Maximum: 0

productivity_score:
  Minimum: 41.0
  Maximum: 89.0

digital_dependence_score:
  Minimum: 5.6
  Maximum: 55.4
