In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression

# Seed for reproducibility
np.random.seed(0)

In [5]:
# Helper functions
def random_date(start, end):
    """Generate a random datetime between `start` and `end`"""
    return start + timedelta(
        seconds=random.randint(0, int((end - start).total_seconds())))

def generate_data(start_date, end_date, num_records):
    """Generate simulated data for the given date range and number of records"""
    data = []
    for _ in range(num_records):
        sales_date = random_date(start_date, end_date)
        customer_since = random_date(start_date, sales_date)
        lead_conversion_date = random_date(customer_since, sales_date)

        data.append({
            "Transaction ID": random.randint(1000, 9999),
            "Product ID": random.randint(100, 199),
            "Product Category": random.choice(["CRM software", "cloud storage"]),
            "Sales Date": sales_date,
            "Revenue": round(random.uniform(1000, 5000), 2),
            "Customer ID": random.randint(10000, 19999),
            "Customer Age": random.randint(20, 70),
            "Location": random.choice(["North America", "Europe", "Asia", "South America"]),
            "Industry": random.choice(["Retail", "Technology", "Finance", "Healthcare"]),
            "Customer Since": customer_since,
            "Lead ID": random.randint(20000, 29999),
            "Lead Source": random.choice(["email marketing", "social media"]),
            "Lead Conversion Date": lead_conversion_date,
            "Lead Score": random.randint(1, 10),
            "Sales Rep ID": random.randint(300, 399),
            "Number of Leads Handled": random.randint(10, 100),
            "Number of Successful Conversions": random.randint(5, 50),
            "Total Sales Value": round(random.uniform(5000, 20000), 2),
            "Support Ticket ID": random.randint(40000, 49999),
            "Issue Category": random.choice(["technical", "billing"]),
            "Resolution Time": random.randint(1, 48), # hours
            "Customer Satisfaction Score": random.randint(1, 5)
        })

    return data

In [6]:
# Generate simulated dataset
start_date = datetime(2020, 1, 1)
end_date = datetime(2023, 1, 1)
num_records = 5000

simulated_data = generate_data(start_date, end_date, num_records)

# Convert to DataFrame
simulated_df = pd.DataFrame(simulated_data)

# Display the first few rows of the DataFrame
simulated_df.head()

Unnamed: 0,Transaction ID,Product ID,Product Category,Sales Date,Revenue,Customer ID,Customer Age,Location,Industry,Customer Since,...,Lead Conversion Date,Lead Score,Sales Rep ID,Number of Leads Handled,Number of Successful Conversions,Total Sales Value,Support Ticket ID,Issue Category,Resolution Time,Customer Satisfaction Score
0,4892,135,CRM software,2021-05-02 11:23:15,3431.01,17662,66,South America,Finance,2020-09-04 04:03:35,...,2020-11-07 09:56:47,8,375,32,34,7980.11,48763,technical,11,5
1,7096,144,cloud storage,2022-03-27 22:23:14,4700.31,18312,55,North America,Retail,2020-10-10 20:22:02,...,2021-09-19 20:02:00,8,368,48,47,5043.47,45895,billing,43,1
2,8983,171,CRM software,2022-05-10 09:50:47,4769.53,10061,38,Europe,Technology,2021-10-10 00:58:34,...,2022-04-01 02:24:17,10,320,85,10,6563.65,44390,billing,29,3
3,8195,102,CRM software,2022-12-29 21:59:39,1814.12,17539,37,Asia,Retail,2021-11-15 06:33:06,...,2022-10-25 21:32:05,8,374,95,50,14861.03,46553,billing,46,2
4,8966,173,cloud storage,2022-12-04 13:32:41,3573.53,17202,55,North America,Technology,2021-03-17 08:18:13,...,2021-06-07 23:23:34,7,377,100,6,8283.96,43623,technical,10,1


In [7]:
# Separating the data into the specified categories

# 1. Sales Data
sales_data_columns = ['Transaction ID', 'Product ID', 'Product Category', 'Sales Date', 'Revenue', 'Customer ID']
sales_data = simulated_df[sales_data_columns]

# 2. Customer Data
customer_data_columns = ['Customer ID', 'Customer Age', 'Location', 'Industry', 'Customer Since']
customer_data = simulated_df[customer_data_columns].drop_duplicates()

# 3. Marketing and Lead Data
marketing_data_columns = ['Lead ID', 'Lead Source', 'Lead Conversion Date', 'Lead Score']
marketing_data = simulated_df[marketing_data_columns].drop_duplicates()

# 4. Sales Team Performance Data
sales_team_data_columns = ['Sales Rep ID', 'Number of Leads Handled', 'Number of Successful Conversions', 'Total Sales Value']
sales_team_data = simulated_df[sales_team_data_columns].drop_duplicates()

# 5. Operational Data
operational_data_columns = ['Support Ticket ID', 'Issue Category', 'Resolution Time', 'Customer Satisfaction Score']
operational_data = simulated_df[operational_data_columns]


In [8]:
# Saving data to CSV files
sales_data.to_csv('sales_data.csv', index=False)
customer_data.to_csv('customer_data.csv', index=False)
marketing_data.to_csv('marketing_data.csv', index=False)
sales_team_data.to_csv('sales_team_data.csv', index=False)
operational_data.to_csv('operational_data.csv', index=False)


'sales_data.csv', 'customer_data.csv', 'marketing_data.csv', 'sales_team_data.csv', 'operational_data.csv'

('sales_data.csv',
 'customer_data.csv',
 'marketing_data.csv',
 'sales_team_data.csv',
 'operational_data.csv')