In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression

# Seed every RNG this notebook uses so the simulated data is reproducible.
# The data generators draw from the stdlib `random` module, which
# np.random.seed() does not affect, so both generators are seeded.
SEED = 0
np.random.seed(SEED)
random.seed(SEED)

In [8]:

# All five input CSVs sit in the same folder: spell the folder out once and
# derive each file path from it.
# NOTE(review): this is an absolute, machine-specific location - confirm it
# exists before running the notebook elsewhere.
DATA_DIR = 'c:/Users/Awhy/OneDrive/Desktop/PARA/Project/PORTFOLIO/Repo/Portfolio/Sales Intelligence/Data'

customer_csv_file_path = f'{DATA_DIR}/customer_data.csv'
marketings_csv_file_path = f'{DATA_DIR}/marketing_data.csv'
operational_csv_file_path = f'{DATA_DIR}/operational_data.csv'
sales_data_csv_file_path = f'{DATA_DIR}/sales_data.csv'
sales_team_data_csv_file_path = f'{DATA_DIR}/sales_team_data.csv'

In [9]:
# Read each source CSV into its own DataFrame, in the same order as before.
# Note: customer_data, sales_data and sales_team_data are rebound further down
# by the cell that splits the simulated dataset into tables.
customer_data, marketings, operational, sales_data, sales_team_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        customer_csv_file_path,
        marketings_csv_file_path,
        operational_csv_file_path,
        sales_data_csv_file_path,
        sales_team_data_csv_file_path,
    )
)

In [None]:
# Helper functions
def random_date(start, end):
    """Return a random datetime between `start` and `end`, both included.

    The gap between the two datetimes is truncated to whole seconds, and a
    random whole-second offset in [0, gap] is added to `start`. If the
    truncated gap is negative, `random.randint` raises ValueError.
    """
    span_seconds = int((end - start).total_seconds())
    offset = timedelta(seconds=random.randint(0, span_seconds))
    return start + offset

def generate_data(start_date, end_date, num_records):
    """Generate simulated records for the given date range.

    Each record is a flat dict that mixes sales, customer, lead, sales-rep and
    support-ticket fields, all drawn with the stdlib `random` module (seed
    `random`, not NumPy, to make the output reproducible).

    Args:
        start_date: Earliest datetime any generated date may take.
        end_date: Latest datetime a "Sales Date" may take.
        num_records: Number of records to generate.

    Returns:
        A list of `num_records` dicts, one per simulated transaction.
    """
    data = []
    for _ in range(num_records):
        # Dates are nested so that, within a record,
        # start_date <= Customer Since <= Lead Conversion Date <= Sales Date <= end_date.
        sales_date = random_date(start_date, end_date)
        customer_since = random_date(start_date, sales_date)
        lead_conversion_date = random_date(customer_since, sales_date)

        # A rep cannot convert more leads than they handled, so the conversion
        # draw is capped at the number of leads handled (which is always >= 10,
        # so the range 5..cap is never empty).
        leads_handled = random.randint(10, 100)
        successful_conversions = random.randint(5, min(50, leads_handled))

        data.append({
            # IDs are drawn independently for every record, so repeats are possible.
            "Transaction ID": random.randint(1000, 9999),
            "Product ID": random.randint(100, 199),
            "Product Category": random.choice(["CRM software", "cloud storage"]),
            "Sales Date": sales_date,
            "Revenue": round(random.uniform(1000, 5000), 2),
            "Customer ID": random.randint(10000, 19999),
            "Customer Age": random.randint(20, 70),
            "Location": random.choice(["North America", "Europe", "Asia", "South America"]),
            "Industry": random.choice(["Retail", "Technology", "Finance", "Healthcare"]),
            "Customer Since": customer_since,
            "Lead ID": random.randint(20000, 29999),
            "Lead Source": random.choice(["email marketing", "social media"]),
            "Lead Conversion Date": lead_conversion_date,
            "Lead Score": random.randint(1, 10),
            "Sales Rep ID": random.randint(300, 399),
            "Number of Leads Handled": leads_handled,
            "Number of Successful Conversions": successful_conversions,
            "Total Sales Value": round(random.uniform(5000, 20000), 2),
            "Support Ticket ID": random.randint(40000, 49999),
            "Issue Category": random.choice(["technical", "billing"]),
            "Resolution Time": random.randint(1, 48),  # hours
            "Customer Satisfaction Score": random.randint(1, 5)
        })

    return data

In [None]:
# Simulation settings: a three-year window and 5,000 records
start_date = datetime(year=2020, month=1, day=1)
end_date = datetime(year=2023, month=1, day=1)
num_records = 5_000

# Build the raw records, then load them into a DataFrame
simulated_data = generate_data(
    start_date=start_date,
    end_date=end_date,
    num_records=num_records,
)
simulated_df = pd.DataFrame(data=simulated_data)

# Preview the first rows
simulated_df.head()

In [None]:
# Split the flat simulated frame into five topic-specific tables.
# Customer, marketing and sales-team tables drop fully identical rows;
# sales and operational tables keep every row.

# 1. Sales Data
sales_data_columns = [
    'Transaction ID',
    'Product ID',
    'Product Category',
    'Sales Date',
    'Revenue',
    'Customer ID',
]
sales_data = simulated_df[sales_data_columns]

# 2. Customer Data
customer_data_columns = [
    'Customer ID',
    'Customer Age',
    'Location',
    'Industry',
    'Customer Since',
]
customer_subset = simulated_df[customer_data_columns]
customer_data = customer_subset.drop_duplicates()

# 3. Marketing and Lead Data
marketing_data_columns = [
    'Lead ID',
    'Lead Source',
    'Lead Conversion Date',
    'Lead Score',
]
marketing_subset = simulated_df[marketing_data_columns]
marketing_data = marketing_subset.drop_duplicates()

# 4. Sales Team Performance Data
sales_team_data_columns = [
    'Sales Rep ID',
    'Number of Leads Handled',
    'Number of Successful Conversions',
    'Total Sales Value',
]
sales_team_subset = simulated_df[sales_team_data_columns]
sales_team_data = sales_team_subset.drop_duplicates()

# 5. Operational Data
operational_data_columns = [
    'Support Ticket ID',
    'Issue Category',
    'Resolution Time',
    'Customer Satisfaction Score',
]
operational_data = simulated_df[operational_data_columns]


In [None]:
# Write each table to its own CSV, without the index column. The file names
# are relative, so they resolve against the current working directory.
csv_outputs = {
    'sales_data.csv': sales_data,
    'customer_data.csv': customer_data,
    'marketing_data.csv': marketing_data,
    'sales_team_data.csv': sales_team_data,
    'operational_data.csv': operational_data,
}
for csv_name, table in csv_outputs.items():
    table.to_csv(csv_name, index=False)

# Display the names of the files that were written
tuple(csv_outputs)