In [5]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Catalogue of training topics mapped to an integer difficulty level.
# Levels present here range from 2 to 4; insertion order is kept as-is.
training_topics = {
    "Software Development": 3,
    "Data Science": 4,
    "Cloud Computing": 3,
    "Web Development": 2,
    "Machine Learning": 4,
    "Natural Language Processing": 4,
    "Artificial Intelligence": 4,
    "Database Management": 3,
    "DevOps": 3,
    "Mobile App Development": 3,
    "Business Intelligence": 3,
    "Wireless Communication": 2,
    "5G Technology": 4,
    "Soft Skill Training": 2,
    "Quality Assurance and Testing": 3,
}

# Date window for the generated sessions: 1 January 2023 to 13 April 2024.
start_date = datetime(year=2023, month=1, day=1)
end_date = datetime(year=2024, month=4, day=13)

# Number of synthetic training records to generate.
num_trainings = 1_000_000

# Function to generate training duration based on difficulty level
def generate_training_duration(difficulty_level):
    """Return a random whole-hour session length for a difficulty level.

    Levels 1, 2 and 3 draw from 1-2, 2-3 and 3-4 hours respectively; any
    other value is treated as "very difficult" and draws from 4-5 hours.
    Both bounds are inclusive, as with ``random.randint``.
    """
    hour_ranges = (
        (1, 1, 2),  # easy topics
        (2, 2, 3),  # moderate topics
        (3, 3, 4),  # difficult topics
    )
    for level, shortest, longest in hour_ranges:
        if difficulty_level == level:
            return random.randint(shortest, longest)
    # Everything else counts as a very difficult topic.
    return random.randint(4, 5)

# Generate random training data.
# Values that never change between iterations are computed once here rather
# than being rebuilt on every one of the num_trainings passes.
topic_names = list(training_topics)
trainer_names = ["John Doe", "Jane Smith", "Alice Johnson", "Bob Brown", "Emily Davis"]
audiences = ["Intern", "Employee"]
date_span_days = (end_date - start_date).days

data = []
for _ in range(num_trainings):
    topic = random.choice(topic_names)
    difficulty_level = training_topics[topic]
    duration = generate_training_duration(difficulty_level)
    # Only the time of day is meaningful: sessions start between 08:00 and
    # 12:59. The 1900-01-01 date part is the placeholder that
    # datetime.strptime(..., "%H:%M") fills in, so building the datetime
    # directly keeps the stored values the same without a format/parse
    # round trip per record.
    start_time = datetime(1900, 1, 1, random.randint(8, 12), random.randint(0, 59))
    end_time = start_time + timedelta(hours=duration)
    trainer_name = random.choice(trainer_names)
    intended_audience = random.choice(audiences)
    # Harder topics get a lower baseline score, with +/-5 points of noise.
    average_topic_score = 100 - (difficulty_level * 10) + random.randint(-5, 5)
    # randint is inclusive on both ends, so end_date itself can be drawn.
    training_date = start_date + timedelta(days=random.randint(0, date_span_days))
    data.append({
        "Training Topic": topic,
        "Training Date": training_date,
        "Training Start Time": start_time,
        "Training End Time": end_time,
        "Trainer Name": trainer_name,
        "Intended Audience": intended_audience,
        "Average Topic Score": average_topic_score
    })

# Create DataFrame (one row per generated training session)
df = pd.DataFrame(data)

# Display the first few rows of the DataFrame
print(df.head())

# Save DataFrame to a CSV file, without the integer row index
df.to_csv("training_data.csv", index=False)


                  Training Topic Training Date Training Start Time  \
0  Quality Assurance and Testing    2024-03-23 1900-01-01 12:14:00   
1                Cloud Computing    2023-04-24 1900-01-01 12:10:00   
2        Artificial Intelligence    2024-02-13 1900-01-01 09:30:00   
3           Software Development    2023-05-06 1900-01-01 12:57:00   
4               Machine Learning    2023-05-14 1900-01-01 10:51:00   

    Training End Time   Trainer Name Intended Audience  Average Topic Score  
0 1900-01-01 15:14:00       John Doe          Employee                   73  
1 1900-01-01 16:10:00  Alice Johnson            Intern                   66  
2 1900-01-01 14:30:00    Emily Davis            Intern                   65  
3 1900-01-01 16:57:00     Jane Smith          Employee                   70  
4 1900-01-01 14:51:00      Bob Brown          Employee                   55  
