In [1]:
import pandas as pd
import random
from datetime import datetime, timedelta

# Function to generate random dates within a range
def generate_random_date(start_date, end_date):
    delta = end_date - start_date
    random_days = random.randint(0, delta.days)
    return start_date + timedelta(days=random_days)

# Update parameters for new simulated data
num_records = 500
gender_ratio = {'Female': 0.8, 'Male': 0.18, 'Non-binary': 0.02}
app_purpose_options = ["Period Tracking", "Cycle & Symptom Monitoring", "Sex Life Improvement", "Pregnancy Tracking", "Mood & Behavior Tracking", "Other"]
regularity_options = ["Very Regular", "Somewhat Regular", "Somewhat Irregular", "Very Irregular"]
cycle_len_distribution = {"Fewer than 21 days": 0.05, "21-35 days": 0.85, "More than 35 days": 0.1}
menstrual_pain_options = ["No pain", "Mild pain (manageable without medication)", "Moderate pain (requires over-the-counter pain medication)", "Severe pain (requires prescription pain medication)", "Very severe pain (disrupts daily activities)"]

# Generate the simulated data
simulated_data = []
today = datetime.today()

for i in range(num_records):
    gender = random.choices(list(gender_ratio.keys()), weights=list(gender_ratio.values()), k=1)[0]
    birth_year = random.randint(1960, 2020)
    age = today.year - birth_year
    if gender == "Male":
        app_purpose = random.choice(["Sex Life Improvement", "Mood & Behavior Tracking", "Other"])
    elif gender == "Non-binary":
        app_purpose = random.choice(["Cycle & Symptom Monitoring", "Mood & Behavior Tracking", "Other"])
    else:
        if age < 35:
            app_purpose = random.choice(["Period Tracking", "Cycle & Symptom Monitoring", "Pregnancy Tracking"])
        else:
            app_purpose = random.choice(["Period Tracking", "Cycle & Symptom Monitoring", "Mood & Behavior Tracking"])
    regularity = random.choices(regularity_options, weights=[0.4, 0.3, 0.2, 0.1], k=1)[0]
    cycle_len_est = random.choices(list(cycle_len_distribution.keys()), weights=list(cycle_len_distribution.values()), k=1)[0]
    period_len_est = random.randint(3, 7) if regularity == "Very Regular" else random.randint(3, 10)
    record_date = generate_random_date(today - timedelta(days=365), today).strftime("%Y-%m-%d")
    height = round(random.uniform(4.5, 6.5), 1)
    weight = round(random.uniform(90, 250), 1)
    birth_control = random.choice(["Implant", "Birth Control Ring", "Emergency Contraception (Plan B, Ella)", "Hormonal IUD", "Non-Hormonal IUD", "Birth Control Shot", "Birth Control Pills", "Patch", "Condoms", "Fertility Awareness Based Methods", "None", "Other"])
    menstrual_pain = random.choice(menstrual_pain_options)
    products = random.choice(["Sanitary Pads", "Tampons", "Menstrual Cups", "Menstrual Discs", "Period Underwear", "Menstrual Sponges", "Other", "None"])
    wear_device = random.choice(["Yes", "No"])
    sleep_levels = random.choice(["Fewer than 6 hours", "6-7 hours", "7-8 hours", "More than 8 hours"])
    activity_levels = random.choice(["Less than 30 min", "30 min to 1 hour", "More than 1 hour"])
    sedentary_levels = random.choice(["Fewer than 5 hours", "5 hours to 8 hours", "More than 8 hours"])
    simulated_data.append({
        "user_id": f"user_{i+1}",
        "record_date": record_date,
        "app_purpose": app_purpose,
        "birth_year": birth_year,
        "gender": gender,
        "height": height,
        "weight": weight,
        "birth_control": birth_control,
        "regularity": regularity,
        "last_period_date": generate_random_date(today - timedelta(days=90), today).strftime("%Y-%m-%d"),
        "period_len_est": period_len_est,
        "cycle_len_est": cycle_len_est,
        "menstrual_pain": menstrual_pain,
        "products": products,
        "wear_device": wear_device,
        "sleep_levels": sleep_levels,
        "activity_levels": activity_levels,
        "sedenary_levels": sedentary_levels
    })

# Create a DataFrame and export to CSV
simulated_df = pd.DataFrame(simulated_data)
simulated_df.to_csv("Onboarding_Data.csv", index=False)


In [2]:
# import pandas as pd
# import random
# from datetime import datetime, timedelta

# # Define parameters
# num_users = 50  # Number of unique users
# today = datetime.today()

# # Function to simulate consecutive period days and menstrual flow
# def determine_season(date):
#     month = date.month
#     day = date.day
#     if (month == 3 and day >= 21) or (3 < month < 6) or (month == 6 and day <= 20):
#         return "Spring"
#     elif (month == 6 and day >= 21) or (6 < month < 9) or (month == 9 and day <= 20):
#         return "Summer"
#     elif (month == 9 and day >= 21) or (9 < month < 12) or (month == 12 and day <= 20):
#         return "Fall"
#     else:
#         return "Winter"
    

# def generate_period_flow(days):
#     """
#     Simulates menstrual flow for a period lasting 'days' days, with variability across the days.
#     The flow starts heavier, becomes moderate, and lightens toward the end.
#     """
#     flow = []
#     for day in range(1, days + 1):
#         if day == 1:  # First day is typically heavy or very heavy
#             flow.append(random.choice(["Heavy", "Very heavy"]))
#         elif day <= min(3, days):  # Next 1-2 days are moderate to heavy
#             flow.append(random.choice(["Moderate", "Heavy"]))
#         elif day == days:  # Last day is usually light or very light
#             flow.append(random.choice(["Light", "Very light (spotting)"]))
#         else:  # Middle days vary between moderate and light
#             flow.append(random.choice(["Moderate", "Light"]))
#     return flow


# # Generate simulated data
# simulated_checkin_data = []
# for user_id in range(1, num_users + 1):
#     # Generate 1-2 periods per user in the last 60 days
#     num_periods = random.randint(1, 2)
#     period_days_recorded = set()

#     for _ in range(num_periods):
#         # Random start date for the period
#         period_start = today - timedelta(days=random.randint(0, 60))
#         num_period_days = random.randint(3, 7)  # Period lasts 3-7 days
#         flow_pattern = generate_period_flow(num_period_days)

#         # Record period days
#         for day in range(num_period_days):
#             record_date = period_start + timedelta(days=day)
#             period_days_recorded.add(record_date)

#             simulated_checkin_data.append({

#                 "user_id": f"user_{user_id}",
#                 "date": record_date.strftime("%Y-%m-%d"),
#                 "Are you on your period today?": "Yes",
#                 "Season": random.choice(["Spring", "Summer", "Fall", "Winter"]),
#                 "How would you describe your menstrual flow today?": flow_pattern[day],
#                 "How would you best describe your discharge today?": random.choice([
#                     "Dry or very little discharge", "Sticky or tacky discharge",
#                     "Creamy or lotion-like discharge", "Egg white-like (clear and stretchy) discharge",
#                     "Watery discharge", "Thick or clumpy discharge", "Blood-tinged discharge (spotting)", "Other (please specify)"
#                 ]),
#                 "What is your body temperature?": round(random.uniform(36.1, 37.8), 1),
#                 "Which of the following symptoms are you experiencing today?": random.choice([
#                     "None", "Cramps", "Bloating", "Headaches or Migraines", "Back Pain",
#                     "Fatigue", "Mood Swings", "Irritability", "Breast Tenderness", "Nausea",
#                     "Diarrhea or Constipation", "Acne", "Heavy Bleeding", "Light Bleeding",
#                     "Dizziness", "Sleep Disturbances", "Food Cravings", "Joint or Muscle Pain",
#                     "Anxiety or Depression", "Other (please specify)"
#                 ]),
#                 "How would you rate your menstrual pain today?": random.choice([
#                     "No pain", "Mild pain (manageable without medication)",
#                     "Moderate pain (required over-the-counter pain medication)",
#                     "Severe pain (required prescription pain medication)",
#                     "Very severe pain (disrupted daily activities)"
#                 ]),
#                 "Did you have sexual intercourse today?": random.choice(["Yes", "No"]),
#                 "What birth control did you use today?": random.choice([
#                     "Implant", "Birth Control Ring", "Emergency Contraception (Plan B, Ella)",
#                     "Hormonal IUD", "Non-Hormonal IUD", "Birth Control Shot", "Birth Control Pills",
#                     "Patch", "Condoms", "Fertility Awareness Based Methods", "None", "Other"
#                 ]),
#                 "How much sleep did you get last night?": random.choice(["Fewer than 6 hours", "6-7 hours", "7-8 hours", "More than 8 hours"]),
#                 "How well rested do you feel today?": random.choice(["Not well rested", "Somewhat rested", "Well rested", "Very well rested"]),
#                 "Did you exercise today？": random.choice(["Yes", "No"]),
#                 "What was the intensity of your exercise?": random.choice([
#                     "Light (e.g., walking, stretching)", "Moderate (e.g., brisk walking, dancing)",
#                     "Vigorous (e.g., running, high-intensity interval training)", "Mixed (a combination of light, moderate, and vigorous activities)", "None"
#                 ]),
#                 "Notes on Mood": random.choice(["Very happy today", "I did not sleep well so I feel drained"])
#             })

#     # Add random non-period records
#     non_period_days = random.randint(10, 20)
#     for _ in range(non_period_days):
#         random_date = today - timedelta(days=random.randint(0, 60))
#         if random_date not in period_days_recorded:
#             simulated_checkin_data.append({
#                 "user_id": f"user_{user_id}",
#                 "date": random_date.strftime("%Y-%m-%d"),
#                 "Are you on your period today?": "No",
#                 "Season": random.choice(["Spring", "Summer", "Fall", "Winter"]),
#                 "How would you describe your menstrual flow today?": "None",
#                 "How would you best describe your discharge today?": random.choice([
#                     "Dry or very little discharge", "Sticky or tacky discharge",
#                     "Creamy or lotion-like discharge", "Egg white-like (clear and stretchy) discharge",
#                     "Watery discharge", "Thick or clumpy discharge", "Other (please specify)"
#                 ]),
#                 "What is your body temperature?": round(random.uniform(36.1, 37.8), 1),
#                 "Which of the following symptoms are you experiencing today?": random.choice([
#                     "None", "Fatigue", "Mood Swings", "Anxiety or Depression", "Other (please specify)"
#                 ]),
#                 "How would you rate your menstrual pain today?": "No pain",
#                 "Did you have sexual intercourse today?": random.choice(["Yes", "No"]),
#                 "What birth control did you use today?": random.choice([
#                     "Implant", "Birth Control Ring", "Emergency Contraception (Plan B, Ella)",
#                     "Hormonal IUD", "Non-Hormonal IUD", "Birth Control Shot", "Birth Control Pills",
#                     "Patch", "Condoms", "Fertility Awareness Based Methods", "None", "Other"
#                 ]),
#                 "How much sleep did you get last night?": random.choice(["Fewer than 6 hours", "6-7 hours", "7-8 hours", "More than 8 hours"]),
#                 "How well rested do you feel today?": random.choice(["Not well rested", "Somewhat rested", "Well rested", "Very well rested"]),
#                 "Did you exercise today？": random.choice(["Yes", "No"]),
#                 "What was the intensity of your exercise?": random.choice([
#                     "Light (e.g., walking, stretching)", "Moderate (e.g., brisk walking, dancing)",
#                     "Vigorous (e.g., running, high-intensity interval training)", "Mixed (a combination of light, moderate, and vigorous activities)", "None"
#                 ]),
#                 "Notes on Mood": random.choice(["Feeling great today", "Had a calm and relaxed day"])
#             })

# # Create a DataFrame
# simulated_checkin_df = pd.DataFrame(simulated_checkin_data)

# # Save to a CSV file
# simulated_checkin_df.to_csv("Simulated_User_Check-In2.csv", index=False)


In [5]:
import pandas as pd
import random
from datetime import datetime, timedelta

# Define parameters
num_users = 50  # Number of unique users
today = datetime.today()

# Function to simulate consecutive period days and menstrual flow
def generate_period_flow(days):
    """
    Simulates menstrual flow for a period lasting 'days' days, with variability across the days.
    The flow starts heavier, becomes moderate, and lightens toward the end.
    """
    flow = []
    for day in range(1, days + 1):
        if day == 1:
            flow.append(random.choice(["Heavy", "Very heavy"]))
        elif day <= 3:
            flow.append(random.choice(["Moderate", "Heavy"]))
        elif day == days:
            flow.append(random.choice(["Light", "Very light (spotting)"]))
        else:
            flow.append(random.choice(["Moderate", "Light"]))
    return flow

# Function to determine the season based on the date
def determine_season(date):
    """
    Determines the season for a given date (northern hemisphere).
    """
    month = date.month
    day = date.day
    if (month == 3 and day >= 21) or (3 < month < 6) or (month == 6 and day <= 20):
        return "Spring"
    elif (month == 6 and day >= 21) or (6 < month < 9) or (month == 9 and day <= 20):
        return "Summer"
    elif (month == 9 and day >= 21) or (9 < month < 12) or (month == 12 and day <= 20):
        return "Fall"
    else:
        return "Winter"

# Generate simulated data
simulated_checkin_data = []
for user_id in range(1, num_users + 1):
    # Generate 1-2 periods per user in the last 60 days
    num_periods = random.randint(1, 2)
    period_days_recorded = set()

    for _ in range(num_periods):
        period_start = today - timedelta(days=random.randint(0, 60))
        num_period_days = random.randint(3, 7)
        flow_pattern = generate_period_flow(num_period_days)

        for day in range(num_period_days):
            record_date = period_start + timedelta(days=day)
            period_days_recorded.add(record_date)

            simulated_checkin_data.append({
                "user_id": f"user_{user_id}",
                "date": record_date.strftime("%Y-%m-%d"),
                "Are you on your period today?": "Yes",
                "Season": determine_season(record_date),
                "Menstrual Flow Level": flow_pattern[day],
                "Discharge": random.choice([
                    "Dry or very little discharge", "Sticky or tacky discharge",
                    "Creamy or lotion-like discharge", "Egg white-like (clear and stretchy) discharge",
                    "Watery discharge", "Thick or clumpy discharge", "Blood-tinged discharge (spotting)", "Other (please specify)"
                ]),
                "Body Temperature": round(random.uniform(36.1, 37.8), 1),
                "Symptoms": random.choice([
                    "None", "Cramps", "Bloating", "Headaches or Migraines", "Back Pain",
                    "Fatigue", "Mood Swings", "Irritability", "Breast Tenderness", "Nausea",
                    "Diarrhea or Constipation", "Acne", "Heavy Bleeding", "Light Bleeding",
                    "Dizziness", "Sleep Disturbances", "Food Cravings", "Joint or Muscle Pain",
                    "Anxiety or Depression", "Other (please specify)"
                ]),
                "Menstrual Pain Level": random.choice([
                    "No pain", "Mild pain (manageable without medication)",
                    "Moderate pain (required over-the-counter pain medication)",
                    "Severe pain (required prescription pain medication)",
                    "Very severe pain (disrupted daily activities)"
                ]),
                "Sexual Intercourse": random.choice(["Yes", "No"]),
                "Birth Control": random.choice([
                    "Implant", "Birth Control Ring", "Emergency Contraception (Plan B, Ella)",
                    "Hormonal IUD", "Non-Hormonal IUD", "Birth Control Shot", "Birth Control Pills",
                    "Patch", "Condoms", "Fertility Awareness Based Methods", "None", "Other"
                ]),
                "Sleep Duration": random.choice(["Fewer than 6 hours", "6-7 hours", "7-8 hours", "More than 8 hours"]),
                "Rested Feeling": random.choice(["Not well rested", "Somewhat rested", "Well rested", "Very well rested"]),
                "Exercise Today": random.choice(["Yes", "No"]),
                "Exercise Intensity": random.choice([
                    "Light (e.g., walking, stretching)", "Moderate (e.g., brisk walking, dancing)",
                    "Vigorous (e.g., running, high-intensity interval training)", "Mixed (a combination of light, moderate, and vigorous activities)", "None"
                ]),
                "Mood": random.choice([
                    "Irritable", "Happy", "Bored", "Motivated", "Frustrated", "Anxious",
                    "Energetic", "Neutral", "Content", "Lonely", "Stressed", "Excited", "Other"
                ]),
                "Notes on Mood": random.choice(["Very happy today", "I did not sleep well so I feel drained"])
            })

    # Add random non-period records
    non_period_days = random.randint(10, 20)
    for _ in range(non_period_days):
        random_date = today - timedelta(days=random.randint(0, 60))
        if random_date not in period_days_recorded:
            simulated_checkin_data.append({
                "user_id": f"user_{user_id}",
                "date": random_date.strftime("%Y-%m-%d"),
                "Are you on your period today?": "No",
                "Season": determine_season(random_date),
                "Menstrual Flow Level": "None",
                "Discharge": random.choice([
                    "Dry or very little discharge", "Sticky or tacky discharge",
                    "Creamy or lotion-like discharge", "Egg white-like (clear and stretchy) discharge",
                    "Watery discharge", "Thick or clumpy discharge", "Other (please specify)"
                ]),
                "Body Temperature": round(random.uniform(36.1, 37.8), 1),
                "Symptoms": random.choice([
                    "None", "Fatigue", "Mood Swings", "Anxiety or Depression", "Other (please specify)"
                ]),
                "Menstrual Pain Level": "No pain",
                "Sexual Intercourse": random.choice(["Yes", "No"]),
                "Birth Control": random.choice([
                    "Implant", "Birth Control Ring", "Emergency Contraception (Plan B, Ella)",
                    "Hormonal IUD", "Non-Hormonal IUD", "Birth Control Shot", "Birth Control Pills",
                    "Patch", "Condoms", "Fertility Awareness Based Methods", "None", "Other"
                ]),
                "Sleep Duration": random.choice(["Fewer than 6 hours", "6-7 hours", "7-8 hours", "More than 8 hours"]),
                "Rested Feeling": random.choice(["Not well rested", "Somewhat rested", "Well rested", "Very well rested"]),
                "Exercise Today": random.choice(["Yes", "No"]),
                "Exercise Intensity": random.choice([
                    "Light (e.g., walking, stretching)", "Moderate (e.g., brisk walking, dancing)",
                    "Vigorous (e.g., running, high-intensity interval training)", "Mixed (a combination of light, moderate, and vigorous activities)", "None"
                ]),
                "Mood": random.choice([
                    "Irritable", "Happy", "Bored", "Motivated", "Frustrated", "Anxious",
                    "Energetic", "Neutral", "Content", "Lonely", "Stressed", "Excited", "Other"
                ]),
                "Notes on Mood": random.choice(["Feeling great today", "Had a calm and relaxed day"])
            })

# Create a DataFrame
simulated_checkin_df = pd.DataFrame(simulated_checkin_data)

# Save to a CSV file
simulated_checkin_df.to_csv("Check-In-Data.csv", index=False)


In [6]:
df = pd.read_csv("Check-In-Data.csv")
df.head()

Unnamed: 0,user_id,date,Are you on your period today?,Season,Menstrual Flow Level,Discharge,Body Temperature,Symptoms,Menstrual Pain Level,Sexual Intercourse,Birth Control,Sleep Duration,Rested Feeling,Exercise Today,Exercise Intensity,Mood,Notes on Mood
0,user_1,2024-12-09,Yes,Fall,Heavy,Blood-tinged discharge (spotting),36.7,Sleep Disturbances,Moderate pain (required over-the-counter pain ...,No,Patch,6-7 hours,Somewhat rested,Yes,"Light (e.g., walking, stretching)",Content,I did not sleep well so I feel drained
1,user_1,2024-12-10,Yes,Fall,Heavy,Watery discharge,36.5,Headaches or Migraines,Mild pain (manageable without medication),Yes,Fertility Awareness Based Methods,More than 8 hours,Somewhat rested,No,"Mixed (a combination of light, moderate, and v...",Energetic,I did not sleep well so I feel drained
2,user_1,2024-12-11,Yes,Fall,Moderate,Thick or clumpy discharge,37.0,Joint or Muscle Pain,Moderate pain (required over-the-counter pain ...,No,Other,6-7 hours,Well rested,Yes,,Lonely,Very happy today
3,user_1,2024-12-12,Yes,Fall,Light,Dry or very little discharge,37.7,Mood Swings,Severe pain (required prescription pain medica...,No,Patch,More than 8 hours,Not well rested,No,"Light (e.g., walking, stretching)",Irritable,Very happy today
4,user_1,2024-11-25,Yes,Fall,Very heavy,Sticky or tacky discharge,37.4,Bloating,Severe pain (required prescription pain medica...,Yes,Birth Control Ring,7-8 hours,Very well rested,Yes,"Light (e.g., walking, stretching)",Neutral,Very happy today


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1087 entries, 0 to 1086
Data columns (total 17 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   user_id                        1087 non-null   object 
 1   date                           1087 non-null   object 
 2   Are you on your period today?  1087 non-null   object 
 3   Season                         1087 non-null   object 
 4   Menstrual Flow Level           423 non-null    object 
 5   Discharge                      1087 non-null   object 
 6   Body Temperature               1087 non-null   float64
 7   Symptoms                       939 non-null    object 
 8   Menstrual Pain Level           1087 non-null   object 
 9   Sexual Intercourse             1087 non-null   object 
 10  Birth Control                  1004 non-null   object 
 11  Sleep Duration                 1087 non-null   object 
 12  Rested Feeling                 1087 non-null   o

In [8]:
df.shape

(1087, 17)

In [13]:
df['user_id'].unique

<bound method Series.unique of 0        user_1
1        user_1
2        user_1
3        user_1
4        user_1
         ...   
1082    user_50
1083    user_50
1084    user_50
1085    user_50
1086    user_50
Name: user_id, Length: 1087, dtype: object>

In [14]:
df['user_id'].value_counts()

user_id
user_16    27
user_38    27
user_27    27
user_2     27
user_17    27
user_1     26
user_36    25
user_34    25
user_7     25
user_29    25
user_24    25
user_21    25
user_32    24
user_50    24
user_10    24
user_4     24
user_45    23
user_11    23
user_25    23
user_14    23
user_48    23
user_47    23
user_9     22
user_19    22
user_20    22
user_12    22
user_44    22
user_23    22
user_6     22
user_49    21
user_8     21
user_22    21
user_15    21
user_46    20
user_41    20
user_42    20
user_33    20
user_37    20
user_3     19
user_40    19
user_5     19
user_18    19
user_43    18
user_31    18
user_30    18
user_35    16
user_28    16
user_39    15
user_26    14
user_13    13
Name: count, dtype: int64