In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression

# Seed for reproducibility
np.random.seed(0)

In [2]:
# Helper functions
def random_date(start, end):
    """Generate a random datetime between `start` and `end`"""
    return start + timedelta(
        seconds=random.randint(0, int((end - start).total_seconds())))

def generate_data(start_date, end_date, num_records):
    """Generate simulated data for the given date range and number of records"""
    data = []
    for _ in range(num_records):
        sales_date = random_date(start_date, end_date)
        customer_since = random_date(start_date, sales_date)
        lead_conversion_date = random_date(customer_since, sales_date)

        data.append({
            "Transaction ID": random.randint(1000, 9999),
            "Product ID": random.randint(100, 199),
            "Product Category": random.choice(["CRM software", "cloud storage"]),
            "Sales Date": sales_date,
            "Revenue": round(random.uniform(1000, 5000), 2),
            "Customer ID": random.randint(10000, 19999),
            "Customer Age": random.randint(20, 70),
            "Location": random.choice(["North America", "Europe", "Asia", "South America"]),
            "Industry": random.choice(["Retail", "Technology", "Finance", "Healthcare"]),
            "Customer Since": customer_since,
            "Lead ID": random.randint(20000, 29999),
            "Lead Source": random.choice(["email marketing", "social media"]),
            "Lead Conversion Date": lead_conversion_date,
            "Lead Score": random.randint(1, 10),
            "Sales Rep ID": random.randint(300, 399),
            "Number of Leads Handled": random.randint(10, 100),
            "Number of Successful Conversions": random.randint(5, 50),
            "Total Sales Value": round(random.uniform(5000, 20000), 2),
            "Support Ticket ID": random.randint(40000, 49999),
            "Issue Category": random.choice(["technical", "billing"]),
            "Resolution Time": random.randint(1, 48), # hours
            "Customer Satisfaction Score": random.randint(1, 5)
        })

    return data

In [3]:
# Generate simulated dataset
start_date = datetime(2020, 1, 1)
end_date = datetime(2023, 1, 1)
num_records = 5000

simulated_data = generate_data(start_date, end_date, num_records)

# Convert to DataFrame
simulated_df = pd.DataFrame(simulated_data)

# Display the first few rows of the DataFrame
simulated_df.head()

Unnamed: 0,Transaction ID,Product ID,Product Category,Sales Date,Revenue,Customer ID,Customer Age,Location,Industry,Customer Since,...,Lead Conversion Date,Lead Score,Sales Rep ID,Number of Leads Handled,Number of Successful Conversions,Total Sales Value,Support Ticket ID,Issue Category,Resolution Time,Customer Satisfaction Score
0,4669,147,cloud storage,2022-07-09 01:24:58,3132.68,11968,24,Europe,Finance,2021-07-26 07:26:59,...,2022-02-06 14:58:15,10,364,92,10,19431.41,49454,billing,43,2
1,1989,150,CRM software,2020-04-25 05:58:57,4683.95,12177,21,Asia,Healthcare,2020-02-28 23:18:19,...,2020-04-20 16:37:50,6,322,20,40,5298.87,45127,billing,39,5
2,4261,170,cloud storage,2021-08-17 22:44:01,2933.54,14723,22,Europe,Finance,2021-03-09 07:14:50,...,2021-08-17 06:07:36,1,372,82,31,18457.97,42201,technical,37,4
3,7076,157,cloud storage,2021-09-26 03:10:34,4407.97,15469,41,Europe,Retail,2020-12-13 19:49:18,...,2021-01-22 20:40:09,1,386,45,30,12767.32,47795,billing,18,1
4,8897,136,CRM software,2021-10-19 15:19:15,1142.87,13600,53,North America,Retail,2020-09-04 09:03:54,...,2021-08-04 13:33:42,8,395,95,39,6194.03,49932,technical,26,1


In [4]:
# Separating the data into the specified categories

# 1. Sales Data
sales_data_columns = ['Transaction ID', 'Product ID', 'Product Category', 'Sales Date', 'Revenue', 'Customer ID']
sales_data = simulated_df[sales_data_columns]

# 2. Customer Data
customer_data_columns = ['Customer ID', 'Customer Age', 'Location', 'Industry', 'Customer Since', 'Sales Rep ID']
customer_data = simulated_df[customer_data_columns].drop_duplicates()

# 3. Marketing and Lead Data
marketing_data_columns = ['Lead ID', 'Lead Source', 'Lead Conversion Date', 'Lead Score','Sales Rep ID']
marketing_data = simulated_df[marketing_data_columns].drop_duplicates()

# 4. Sales Team Performance Data
sales_team_data_columns = ['Sales Rep ID', 'Number of Leads Handled', 'Number of Successful Conversions', 'Total Sales Value']
sales_team_data = simulated_df[sales_team_data_columns].drop_duplicates()

# 5. Operational Data
operational_data_columns = ['Customer ID','Support Ticket ID', 'Issue Category', 'Resolution Time', 'Customer Satisfaction Score']
operational_data = simulated_df[operational_data_columns]


In [5]:
# Saving data to CSV files
sales_data.to_csv('sales_data.csv', index=False)
customer_data.to_csv('customer_data.csv', index=False)
marketing_data.to_csv('marketing_data.csv', index=False)
sales_team_data.to_csv('sales_team_data.csv', index=False)
operational_data.to_csv('operational_data.csv', index=False)
simulated_df.to_csv('rawdata.csv', index=False)

'sales_data.csv', 'customer_data.csv', 'marketing_data.csv', 'sales_team_data.csv', 'operational_data.csv'

('sales_data.csv',
 'customer_data.csv',
 'marketing_data.csv',
 'sales_team_data.csv',
 'operational_data.csv')