In [1]:
import pandas as pd
import random
from datetime import datetime, timedelta

def generate_menstrual_data(num_samples=1000, start_date='2020-01-01', seed=42):
    """Generate a synthetic menstrual-cycle dataset with health factors.

    Each row describes one independent, randomly drawn cycle: the start of
    the last period, the cycle and period lengths, the derived next-period
    and ovulation dates, and four randomly drawn lifestyle scores.

    Args:
        num_samples: Number of rows to generate. Zero (or a negative value)
            yields an empty DataFrame that still has every column.
        start_date: Earliest possible last-period start, as 'YYYY-MM-DD'.
            Each row's start date is this date plus 0-365 days.
        seed: Seed for the private random generator. The default (42)
            makes repeated calls return identical data; pass None to seed
            from system entropy instead.

    Returns:
        pandas.DataFrame with the columns last_period_start, cycle_length,
        period_length, next_period_start, ovulation_day, stress_level,
        exercise_frequency, diet_quality, sleep_quality and
        body_temperature. Dates are 'YYYY-MM-DD' strings.

    Raises:
        ValueError: If start_date does not match the '%Y-%m-%d' format.
    """
    # A private generator keeps the output reproducible without reseeding
    # the process-wide `random` state that callers may rely on.
    rng = random.Random(seed)

    # Cycle and period length bounds (in days, inclusive)
    min_cycle_length = 21
    max_cycle_length = 35
    min_period_length = 3
    max_period_length = 7

    # Health factor bounds (inclusive)
    min_stress = 1
    max_stress = 10
    min_exercise = 0
    max_exercise = 7
    min_diet = 1
    max_diet = 10
    min_sleep = 1
    max_sleep = 10

    normal_temperature = 36.5  # Normal body temperature in Celsius
    ovulation_temperature_rise = 0.3  # Offset applied on the ovulation day

    # Explicit column order so that an empty result keeps the full schema
    columns = [
        'last_period_start',
        'cycle_length',
        'period_length',
        'next_period_start',
        'ovulation_day',
        'stress_level',
        'exercise_frequency',
        'diet_quality',
        'sleep_quality',
        'body_temperature',
    ]

    # Parse the base date once; it does not change between samples
    base_date = datetime.strptime(start_date, '%Y-%m-%d')

    data = []
    for _ in range(num_samples):
        # NOTE: the order of the draws below determines the generated
        # values for a given seed; keep it stable.
        cycle_length = rng.randint(min_cycle_length, max_cycle_length)
        period_length = rng.randint(min_period_length, max_period_length)

        # Last period starts within one year (0-365 days) of the base date
        last_period_start = base_date + timedelta(days=rng.randint(0, 365))

        # Next period begins one full cycle after the last one
        next_period_start = last_period_start + timedelta(days=cycle_length)

        # Ovulation is approximated as the middle of the cycle
        ovulation_day = last_period_start + timedelta(days=cycle_length // 2)

        # Health factors
        stress_level = rng.randint(min_stress, max_stress)
        exercise_frequency = rng.randint(min_exercise, max_exercise)
        diet_quality = rng.randint(min_diet, max_diet)
        sleep_quality = rng.randint(min_sleep, max_sleep)

        # The temperature is the reading on the ovulation day itself, so the
        # rise always applies and this column is the same in every row.
        # NOTE(review): as generated, the column carries no signal; modelling
        # real variation would need a per-row observation day.
        body_temperature = normal_temperature + ovulation_temperature_rise

        data.append({
            'last_period_start': last_period_start.strftime('%Y-%m-%d'),
            'cycle_length': cycle_length,
            'period_length': period_length,
            'next_period_start': next_period_start.strftime('%Y-%m-%d'),
            'ovulation_day': ovulation_day.strftime('%Y-%m-%d'),
            'stress_level': stress_level,
            'exercise_frequency': exercise_frequency,
            'diet_quality': diet_quality,
            'sleep_quality': sleep_quality,
            'body_temperature': body_temperature,
        })

    # Convert the collected rows into a pandas DataFrame
    return pd.DataFrame(data, columns=columns)

# Build the 1000-row synthetic dataset, including the health-factor columns
menstrual_data = generate_menstrual_data(1000)

# Preview the leading rows (head() defaults to five) of the generated frame
print(menstrual_data.head(n=5))

# Persist the dataset as CSV, leaving out the index column, for later use
menstrual_data.to_csv(path_or_buf='my_data.csv', index=False)


  last_period_start  cycle_length  period_length next_period_start  \
0        2020-01-13            31              3        2020-02-13   
1        2020-12-12            32              3        2021-01-13   
2        2020-02-17            21              3        2020-03-09   
3        2020-04-11            21              7        2020-05-02   
4        2020-01-04            30              5        2020-02-03   

  ovulation_day  stress_level  exercise_frequency  diet_quality  \
0    2020-01-28             5                   3             4   
1    2020-12-28             9                   1            10   
2    2020-02-27             4                   3             9   
3    2020-04-21             9                   6             4   
4    2020-01-19             3                   6             6   

   sleep_quality  body_temperature  
0              3              36.8  
1              7              36.8  
2             10              36.8  
3              8            