In [5]:
import pandas as pd
import numpy as np
import random

# -----------------------------
# 1. Define categories
# -----------------------------
# Vocabularies from which each record's categorical fields are picked.
data_scientists = ["Alice", "Bob", "Charlie", "Diana", "Ethan", "Fatima", "George", "Hannah"]
tasks = [
    "Data_Cleaning",
    "Data_Wrangling",
    "Exploratory_Analysis",
    "Feature_Engineering",
    "Model_Building",
    "Model_Testing",
    "Model_Deployment",
    "Model_Monitoring"
]
project_types = ["NLP", "Vision", "Tabular", "Time Series"]
complexity_levels = ["Low", "Medium", "High"]
tools = ["Scikit-learn", "PyTorch", "TensorFlow", "XGBoost", "LightGBM"]

# -----------------------------
# 2. Set average times per task (base times in hours)
# -----------------------------
# The generation loop indexes this dict with values drawn from `tasks`,
# so it needs one entry per task name.
avg_time = {
    "Data_Cleaning": 4,
    "Data_Wrangling": 5,
    "Exploratory_Analysis": 6,
    "Feature_Engineering": 7,
    "Model_Building": 8,
    "Model_Testing": 6,
    "Model_Deployment": 4,
    "Model_Monitoring": 3
}

# Complexity scaling factors (multiplied into the base time)
complexity_factor = {"Low": 0.8, "Medium": 1.0, "High": 1.3}

# Project type scaling (vision & NLP often take longer)
project_factor = {"Tabular": 1.0, "NLP": 1.2, "Vision": 1.3, "Time Series": 1.1}

# -----------------------------
# 3. Generate the synthetic dataset
# -----------------------------
SEED = 42     # shared seed for both random sources
N_ROWS = 300  # number of synthetic records to generate

# Seed BOTH generators. The categorical fields below are drawn with the stdlib
# `random` module, while the numeric fields use NumPy's global generator. The
# two keep independent state, so seeding only NumPy leaves the categorical
# columns different on every run.
random.seed(SEED)
np.random.seed(SEED)
rows = []

for _ in range(N_ROWS):
    # Categorical fields: one pick each from the stdlib generator.
    name = random.choice(data_scientists)
    task = random.choice(tasks)
    project = random.choice(project_types)
    complexity = random.choice(complexity_levels)
    tool = random.choice(tools)

    # Numeric fields: drawn from NumPy's global generator.
    data_size = np.random.uniform(0.1, 10.0)  # GB
    deadline = np.random.randint(5, 30)  # days; upper bound is exclusive (5..29)

    # Expected time = base hours x complexity factor x project factor x size factor
    time = (
        avg_time[task]
        * complexity_factor[complexity]
        * project_factor[project]
        * (1 + data_size / 50)  # slightly scale with data size (at most +20% at 10 GB)
    )

    # Add Gaussian noise whose standard deviation is 20% of the expected time.
    # This is not a hard +/-20% band: individual draws can deviate by more.
    time += np.random.normal(0, 0.2 * time)
    # Floor at half an hour so the noise can never yield a zero or negative duration.
    time = round(max(0.5, time), 2)

    rows.append([name, task, project, complexity, tool, data_size, deadline, time])

# -----------------------------
# 4. Assemble the DataFrame
# -----------------------------
# Column labels, listed in the same order in which each row list was filled.
column_names = [
    "data_scientist", "task", "project_type", "complexity_level",
    "tool_used", "data_size_gb", "deadline_days", "time_hours",
]
df = pd.DataFrame(rows, columns=column_names)

# -----------------------------
# 5. Preview, then persist
# -----------------------------
# Print the first ten records as plain text, then write the whole table to a
# CSV file (relative path) without the index column.
print(df.head(n=10))
df.to_csv("data_science_productivity_dataset.csv", index=False)


  data_scientist                 task project_type complexity_level  \
0         Hannah  Feature_Engineering          NLP           Medium   
1          Diana       Data_Wrangling          NLP             High   
2         George       Data_Wrangling      Tabular             High   
3          Alice        Data_Cleaning       Vision             High   
4         Fatima     Model_Monitoring  Time Series             High   
5          Diana       Model_Building       Vision             High   
6         George       Model_Building          NLP             High   
7        Charlie        Data_Cleaning  Time Series           Medium   
8         Fatima       Data_Wrangling          NLP             High   
9         George     Model_Deployment  Time Series             High   

      tool_used  data_size_gb  deadline_days  time_hours  
0       PyTorch      3.807947             19       10.21  
1    TensorFlow      1.644585             23       10.51  
2    TensorFlow      1.089752            

In [6]:
# Load the generated CSV (relative path) back into a fresh frame, then print
# its column names, non-null counts and dtypes.
data = pd.read_csv(filepath_or_buffer="data_science_productivity_dataset.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   data_scientist    300 non-null    object 
 1   task              300 non-null    object 
 2   project_type      300 non-null    object 
 3   complexity_level  300 non-null    object 
 4   tool_used         300 non-null    object 
 5   data_size_gb      300 non-null    float64
 6   deadline_days     300 non-null    int64  
 7   time_hours        300 non-null    float64
dtypes: float64(2), int64(1), object(5)
memory usage: 18.9+ KB


In [None]:
# Display the first five rows of the frame loaded from the CSV.
data.head(n=5)

Unnamed: 0,data_scientist,task,project_type,complexity_level,tool_used,data_size_gb,deadline_days,time_hours
0,Hannah,Feature_Engineering,NLP,Medium,PyTorch,3.807947,19,10.21
1,Diana,Data_Wrangling,NLP,High,TensorFlow,1.644585,23,10.51
2,George,Data_Wrangling,Tabular,High,TensorFlow,1.089752,15,7.01
3,Alice,Data_Cleaning,Vision,High,XGBoost,7.109919,26,9.28
4,Fatima,Model_Monitoring,Time Series,High,Scikit-learn,0.658475,28,3.94


: 