<a href="https://colab.research.google.com/github/Yed-hu/Projects/blob/main/Synthetic_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
!pip install faker



In [24]:
import pandas as pd
import uuid
import random
from faker import Faker
from datetime import datetime, timedelta

In [42]:
# Initialize Faker to generate fake data
fake = Faker()

# Constants for generating the data
num_customers = 750
num_shipments = 25000
customer_ids = [f"CUST{i}" for i in range(1,751)]
shipment_ids = [f"SHIP{i}" for i in range(1,25001)]
trabsaction_ids = [f"TRAN{i}" for i in range(num_shipments)]
feedback_ids = [f"FD{i}" for i in range(num_shipments)]
regions = ['Alappuzha', 'Ernakulam', 'Idukki', 'Kannur', 'Kasaragod', 'Kollam', 'Kottayam', 'Kozhikode', 'Malappuram', 'Palakkad', 'Pathanamthitta', 'Thrissur', 'Thiruvananthapuram', 'Wayanad']
countries = ['United Arab Emirates', 'Saudi Arabia', 'Kuwait', 'Qatar', 'Oman', 'Bahrain', 'Malaysia', 'Singapore', 'United States', 'United Kingdom', 'Canada', 'Australia', 'South Africa', 'Jordan', 'Italy', 'Germany', 'France', 'Netherlands', 'Sweden', 'Norway', 'Switzerland', 'Ireland', 'Denmark', 'Belgium', 'Austria', 'Luxembourg', 'Greece']
customer_types = ['Regular', 'VIP', 'New']
occupations = ['Student', 'Working Professional', 'Business', 'Retired']
shipment_types = ['Document', 'Gift', 'Electronic', 'Household Goods','Food']
delivery_statuses = ['Delivered', 'Delayed', 'Canceled']
payment_methods = ['Credit Card', 'Debit Card', 'Bank Transfer', 'COD']
currency_types = ['INR', 'USD', 'EUR']
provider_names = ['DHL', 'FedEx', 'India Post', 'UPS', 'BlueDart']
provider_types = ['Air Freight', 'Sea Freight', 'Ground']
service_types = ['Standard', 'Express', 'Economy']
promotion_codes = ['DISCOUNT10', 'FREESHIP', 'WELCOME20']



# Generate Customer Table
def generate_customers(num_customers):
  customers =[]
  for _ in range(num_customers):
    customer_id = random.choice(customer_ids)
    first_name = fake.first_name()
    last_name = fake.last_name()
    email = f"{first_name}{last_name}{random.randint(0, 1000)}@gmail.com".lower()
    phone = f"+91{random.randint(7000000000, 7999999999)}"
    dob = fake.date_of_birth(minimum_age=18, maximum_age=60)
    nationality = 'Indain'
    region = random.choice(regions)
    country_of_residence = random.choice(countries)
    occupation = random.choice(occupations)
    customer_type = random.choice(customer_types)
    account_creation_date = fake.date_this_decade()

    customers.append([customer_id, first_name, last_name, email, phone, dob, nationality, region, country_of_residence, occupation, customer_type, account_creation_date])

  return pd.DataFrame(customers, columns=['customer_id', 'first_name', 'last_name', 'email', 'phone', 'dob', 'nationality', 'region', 'country_of_residence', 'occupation', 'customer_type', 'account_creation_date'])


# Generate Shipment Table
def generate_shipments(num_shipments,customers):
  shipments = []
  for _ in range(num_shipments):
    customer = random.choice(customers)
    shipment_id = random.choice(shipment_ids)
    customer_id = customer['customer_id']
    origin_country = 'India'
    destination_country = customer['country_of_residence']
    origin_city = customer['region']
    destination_city = fake.city()
    weight = random.uniform(1,100) #kg
    volume = random.uniform(0.1,5) #cubic meter
    shipment_date = fake.date_between(start_date='-2y', end_date='today')
    delivery_date = shipment_date + timedelta(days=random.randint(3,10))
    delivery_status = 'In Transit' if delivery_date >= datetime.now().date() else random.choice(delivery_statuses)
    shipment_type =  random.choice(shipment_types)
    customs_clearance = False if delivery_status == 'Canceled' else True
    cargo_provider_name = random.choice(provider_names)
    cargo_type = random.choice(provider_types)
    service_type = random.choice(service_types)


    shipments.append([shipment_id, customer_id, origin_country, destination_country, origin_city, destination_city,
                          weight, volume, shipment_date, delivery_date, delivery_status, shipment_type, customs_clearance,cargo_provider_name, cargo_type, service_type])

  return pd.DataFrame(shipments, columns=['shipment_id', 'customer_id', 'origin_country', 'destination_country',
                                            'origin_city', 'destination_city', 'weight', 'volume', 'shipment_date',
                                            'delivery_date', 'delivery_status', 'shipment_type', 'customs_clearance','cargo_provider_name', 'cargo_type', 'service_type'])


# Generate Transaction Table
def generate_transactions(num_shipments, shipments):
  transactions = []
  for _ in range(num_shipments):
    transaction_id = random.choice(trabsaction_ids)
    shipment = random.choice(shipments)
    customer_id = shipment['customer_id']
    shipment_id = shipment['shipment_id']
    transaction_date = shipment['shipment_date']
    transaction_amount = random.uniform(1000, 100000)
    payment_method = random.choice(payment_methods)
    transaction_status = 'Paid' if shipment['delivery_status'] != 'Canceled' else 'Refunded'

    transactions.append([transaction_id, customer_id, shipment_id, transaction_date, transaction_amount,
                             payment_method, transaction_status])

  return pd.DataFrame(transactions, columns=['transaction_id', 'customer_id', 'shipment_id', 'transaction_date',
                                                'transaction_amount', 'payment_method', 'transaction_status'])




# Generate Delivery & Feedback Table
def generate_delivery_feedback(num_shipments, shipments):
  feedbacks = []
  for _ in range(num_shipments):
    feedback_id = random.choice(feedback_ids)
    shipment = random.choice(shipments)
    shipment_id = shipment['shipment_id']
    customer_id = shipment['customer_id']
    delivery_time = random.randint(1,14) # In days
    delivery_rating = random.randint(1, 5)
    issue_reported = random.choice([True, False])
    issue_type = random.choice(['Delay', 'Damage', 'Lost', 'Wrong Address']) if issue_reported else None
    complaint_status = random.choice(['Resolved', 'Pending', 'Unresolved']) if issue_reported else None

    feedbacks.append([feedback_id, shipment_id, customer_id, delivery_time, delivery_rating,
                          issue_reported, issue_type, complaint_status, ])

  return pd.DataFrame(feedbacks, columns=['feedback_id', 'shipment_id', 'customer_id', 'delivery_time',
                                             'delivery_rating', 'issue_reported', 'issue_type',
                                             'complaint_status'])



In [43]:
customers_df = generate_customers(num_customers)
# Sample output to check
customers_df.head()

Unnamed: 0,customer_id,first_name,last_name,email,phone,dob,nationality,region,country_of_residence,occupation,customer_type,account_creation_date
0,CUST575,Raymond,Alexander,raymondalexander735@gmail.com,917398009152,2002-07-24,Indain,Pathanamthitta,Luxembourg,Student,New,2023-11-08
1,CUST146,Barbara,Hughes,barbarahughes509@gmail.com,917164695765,1981-08-13,Indain,Ernakulam,Oman,Student,New,2023-07-29
2,CUST558,Daniel,Bennett,danielbennett81@gmail.com,917678239035,2005-05-20,Indain,Kannur,Bahrain,Student,Regular,2023-11-12
3,CUST9,Ashley,Soto,ashleysoto795@gmail.com,917298573657,1998-08-27,Indain,Kannur,Italy,Retired,Regular,2024-06-10
4,CUST38,Desiree,Wright,desireewright863@gmail.com,917759544222,1995-09-21,Indain,Kollam,Australia,Retired,Regular,2022-01-20


In [44]:
shipments_df = generate_shipments(num_shipments, customers_df.to_dict(orient='records'))
shipments_df.head()

Unnamed: 0,shipment_id,customer_id,origin_country,destination_country,origin_city,destination_city,weight,volume,shipment_date,delivery_date,delivery_status,shipment_type,customs_clearance,cargo_provider_name,cargo_type,service_type
0,SHIP10281,CUST130,India,Luxembourg,Wayanad,West Johnmouth,99.162915,1.235989,2023-06-11,2023-06-18,Delayed,Electronic,True,FedEx,Ground,Express
1,SHIP5558,CUST286,India,Luxembourg,Kannur,Erictown,43.690957,2.822511,2023-09-23,2023-10-03,Canceled,Household Goods,False,FedEx,Ground,Express
2,SHIP15198,CUST445,India,Oman,Ernakulam,Smithport,81.660173,2.223051,2023-11-25,2023-11-30,Canceled,Electronic,False,UPS,Air Freight,Express
3,SHIP22382,CUST279,India,United Arab Emirates,Kannur,Frazierhaven,62.809792,4.202233,2024-07-05,2024-07-08,Delivered,Document,True,DHL,Air Freight,Standard
4,SHIP10087,CUST435,India,Austria,Kozhikode,Espinozaborough,75.439379,2.501078,2023-12-31,2024-01-03,Delayed,Electronic,True,FedEx,Ground,Economy


In [45]:
transactions_df = generate_transactions(num_shipments, shipments_df.to_dict(orient='records'))
transactions_df.head()


Unnamed: 0,transaction_id,customer_id,shipment_id,transaction_date,transaction_amount,payment_method,transaction_status
0,TRAN20880,CUST493,SHIP2664,2024-06-28,39313.025676,COD,Paid
1,TRAN10662,CUST648,SHIP16478,2023-07-19,51857.299358,Bank Transfer,Paid
2,TRAN3599,CUST107,SHIP11599,2023-08-18,86822.259795,Credit Card,Refunded
3,TRAN12937,CUST517,SHIP394,2023-06-16,51862.525985,Debit Card,Refunded
4,TRAN2663,CUST666,SHIP13888,2024-01-12,97335.437687,Credit Card,Paid


In [46]:
delivery_feedback_df = generate_delivery_feedback(num_shipments, shipments_df.to_dict(orient='records'))
delivery_feedback_df.head()


Unnamed: 0,feedback_id,shipment_id,customer_id,delivery_time,delivery_rating,issue_reported,issue_type,complaint_status
0,FD12574,SHIP13069,CUST506,4,2,False,,
1,FD5927,SHIP15765,CUST324,7,3,True,Delay,Resolved
2,FD22125,SHIP4316,CUST338,12,4,True,Damage,Resolved
3,FD10635,SHIP17773,CUST626,9,3,True,Damage,Pending
4,FD24653,SHIP3906,CUST20,4,4,False,,
