# dependencies

In [1]:
!pip install faker



In [2]:
!pip install schedule



# imports

In [1]:
import csv
import pandas as pd
from faker import Faker
import random
from datetime import datetime, timedelta
import schedule
import time
import signal
import sys

1. `csv`: This module provides functionality for reading and writing CSV (Comma-Separated Values) files. Although it is imported, it is not used anywhere in this notebook; the CSV output is written through pandas instead.

2. `pandas as pd`: Pandas is a powerful data manipulation library. It provides data structures and functions to efficiently manipulate and analyze structured data, such as CSV files. It is commonly imported as `pd` for convenience.

3. `Faker`: Faker is a Python library that generates fake data for various purposes, such as testing and populating databases. It can generate fake names, addresses, phone numbers, and more. In the code, it is used to generate fake customer names, unique IDs (UUIDs), and city names for the location column.

4. `random`: The random module provides functions for generating random numbers and making random choices. In the code, it is used to decide whether a row belongs to a returning or a new customer, to pick a returning customer, to draw normally distributed transaction amounts, and to generate random offsets for access times.

5. `datetime, timedelta`: The `datetime` module provides classes for working with dates and times, and the `timedelta` class represents a duration or difference between two dates or times. In the code, they are used to generate random access times based on the current time.

6. `schedule`: The schedule library provides a simple and intuitive way to schedule tasks to run at specific times. It allows you to schedule functions to run periodically or at specific intervals. In the code, it is used to schedule the `generate_and_update_data` function to run daily at 10:00 AM.

7. `time`: The `time` module provides various time-related functions. In the code, it is used to pause between iterations of the `while` loop (six hours per pause) so that the loop does not spin and waste CPU.

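8. `signal`: The `signal` module provides mechanisms to handle signals raised by the operating system. In the code, a handler function (`stop_script`) with the signature of a signal handler is defined so the script can be stopped gracefully; note that the cells shown here never register it with `signal.signal(...)`, so it has no effect until that call is added.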

9. `sys`: The `sys` module provides access to some variables used or maintained by the interpreter and functions that interact with the interpreter. In the code, it is used to exit the script using `sys.exit()`.

These imports bring in the necessary functionality to generate fake data, work with CSV files, schedule tasks, handle signals, and perform other necessary operations in the code.

# End Imports

In [2]:
# Seed Python's built-in `random` module so its draws are repeatable across runs.
# NOTE(review): only the stdlib `random` generator is seeded here; values that come
# from Faker and from the current time are presumably unaffected — confirm whether
# Faker should be seeded as well.
random.seed(42)

# Automated Schedule

In [5]:
# Shared Faker instance used by generate_fake_data() for names, UUIDs and cities
fake = Faker()

# Modules needed by the data generator (`random` is already imported above;
# numpy supplies np.clip for bounding the amounts)
import random
import numpy as np

# Bounds for the transaction amount, stored as (upper, lower): the generator
# clips with amount_range[1] as the minimum and amount_range[0] as the maximum
amount_range = (5000, -3000)
amount_mean = -1000  # Mean of the normal distribution the amounts are drawn from

# Maps each customer name to the UUID assigned the first time the name is seen,
# so a returning customer keeps the same ID across calls
name_ids = {}

# Generate fake data
def generate_fake_data(num_rows, existing_customers=None):
    """Generate ``num_rows`` fake bank-transaction rows.

    Each row is a list ``[name, unique_id, amount, transaction, access_time,
    location]``:

    * ``name`` is either a returning customer (picked with probability 0.7
      when the pool is non-empty) or a fresh Faker name.
    * ``unique_id`` is the UUID stored for that name in the module-level
      ``name_ids`` dict; a new one is created and stored on first sight.
    * ``amount`` is drawn from a normal distribution (mean ``amount_mean``,
      standard deviation 1000), clipped to
      ``[amount_range[1], amount_range[0]]`` and truncated to ``int``.
    * ``transaction`` is ``"withdraw"`` for negative amounts, else ``"deposit"``.
    * ``access_time`` is the current time minus a random offset of up to
      365 days, 23 hours and 59 minutes.
    * ``location`` is a Faker city name.

    Args:
        num_rows: Number of rows to generate.
        existing_customers: Optional list of known customer names. New names
            are appended to this list in place, so a caller can keep its own
            pool across calls. When omitted, the pool starts from the names
            already registered in ``name_ids``.

    Returns:
        A list of ``num_rows`` rows as described above.
    """
    if existing_customers is None:
        # A literal ``[]`` default would be one hidden list shared by every
        # call. Build the pool explicitly from the names that already own an
        # ID instead, which keeps returning customers across calls without
        # relying on mutable-default state.
        existing_customers = list(name_ids)

    data = []
    current_time = datetime.now()

    for _ in range(num_rows):
        # Determine if it's a new customer or an existing customer
        if existing_customers and random.random() < 0.7:
            # Select a random existing customer
            name = random.choice(existing_customers)
        else:
            # Generate a fake name for a new customer and remember it
            name = fake.name()
            existing_customers.append(name)

        # Reuse the ID already assigned to this name, or assign a new one
        unique_id = name_ids.get(name)
        if unique_id is None:
            unique_id = fake.uuid4()
            name_ids[name] = unique_id

        # amount_range is (upper, lower), hence index 1 is the minimum
        amount = int(np.clip(random.normalvariate(amount_mean, 1000), amount_range[1], amount_range[0]))

        transaction = "withdraw" if amount < 0 else "deposit"

        access_time = current_time - timedelta(days=random.randint(0, 365), hours=random.randint(0, 23),
                                               minutes=random.randint(0, 59))
        location = fake.city()

        data.append([name, unique_id, amount, transaction, access_time, location])

    return data


In [6]:
# Number of rows generated on each run
num_rows = 15

# Path of the CSV file the generated rows are appended to
csv_file = 'data.csv'

# Maximum number of days (scheduled runs) before the script stops
max_days = 3

# Counts completed runs; incremented by generate_and_update_data()
days_count = 0

In [7]:
# Function to generate and update data

def generate_and_update_data():
    """Generate one batch of fake rows and append it to the CSV file.

    Builds ``num_rows`` rows with ``generate_fake_data``, appends them to
    ``csv_file`` (writing the header row only when the file does not exist
    yet), increments the global ``days_count`` and prints a confirmation.
    """
    global days_count

    column_names = ["Name", "ID", 'amount', "Transaction", "Access Time", "Location"]
    frame = pd.DataFrame(generate_fake_data(num_rows), columns=column_names)

    # Only a brand-new file needs the header row
    write_header = not csv_file_exists()
    frame.to_csv(csv_file, mode='a', index=False, header=write_header)

    days_count = days_count + 1
    print("Data updated successfully.")


In [8]:
#The global days_count statement allows accessing and modifying the days_count variable defined outside the function.

# data = generate_fake_data(num_rows) calls the generate_fake_data function to generate the fake data. The generated data is stored in the data variable.

# df = pd.DataFrame(data, columns=["Name", "ID", 'amount', "Transaction", "Access Time", "Location"]) creates a pandas DataFrame using the generated data. The column names are specified as "Name", "ID", "amount", "Transaction", "Access Time", and "Location".

# df.to_csv(csv_file, mode='a', index=False, header=not csv_file_exists()) appends the new data to the existing CSV file specified by csv_file. The mode='a' argument ensures the file is opened in append mode. The index=False argument avoids writing the row index to the CSV file. The header=not csv_file_exists() argument controls whether the header row is written to the file. If the CSV file already exists (csv_file_exists() returns True), the header row is not written.

# days_count += 1 increments the days_count variable to keep track of the number of days the script has run.






In [9]:
# Check if the CSV file exists
def csv_file_exists():
    """Return True if ``csv_file`` can be opened for reading, False if it is missing.

    Only ``FileNotFoundError`` is treated as "does not exist"; any other error
    raised while opening the file propagates to the caller.
    """
    try:
        handle = open(csv_file, 'r')
    except FileNotFoundError:
        return False
    # The file was opened only to probe for existence; release it right away
    handle.close()
    return True

In [10]:
# try: starts a try-except block to catch any potential exceptions that may occur.

# with open(csv_file, 'r') as file: opens the CSV file specified by csv_file in read mode. The with statement ensures that the file is properly closed after use.

# return True is executed if the CSV file is successfully opened without raising any exceptions. This indicates that the file exists.

# except FileNotFoundError: catches the FileNotFoundError exception, which is raised if the CSV file does not exist.

# return False is executed if the FileNotFoundError exception is raised. This indicates that the file does not exist.

In [11]:
# Signal handler to stop the script
def stop_script(signal, frame):
    """Print a shutdown message and terminate the interpreter with exit status 0.

    The two parameters follow the signature of a signal handler and are not
    used by the body.
    """
    print("Stopping the script...")
    raise SystemExit(0)

In [13]:
# initial_data = generate_fake_data(num_rows) generates the initial fake data using the generate_fake_data function. The data is stored in the initial_data variable.

# df_initial = pd.DataFrame(initial_data, columns=["Name", "ID", 'amount', "Transaction", "Access Time", "Location"]) creates a DataFrame (df_initial) from the initial data. The column names are specified as "Name", "ID", "amount", "Transaction", "Access Time", and "Location".

# df_initial.to_csv(csv_file, mode='w', index=False) writes the initial data to the CSV file specified by csv_file. The mode='w' argument ensures that the file is opened in write mode. The index=False argument avoids writing the row index to the CSV file.



In [14]:
# Register generate_and_update_data to run once a day at 10:00; the job is
# actually executed by a later schedule.run_pending() call
schedule.every().day.at("10:00").do(generate_and_update_data)

Every 1 day at 10:00:00 do generate_and_update_data() (last run: [never], next run: 2023-05-20 10:00:00)

In [15]:
# schedule.every().day.at("10:00").do(generate_and_update_data) schedules the generate_and_update_data function to run daily at 10:00. This is achieved using the schedule.every().day.at("10:00") method chain, followed by .do(generate_and_update_data) to specify the function to be executed.

In [16]:
# Poll the scheduler until generate_and_update_data has run max_days times.
# Pending jobs are only checked once per wake-up, so with a 6-hour sleep the
# job can start up to about 6 hours after its scheduled time.
while days_count < max_days:
    schedule.run_pending()
    time.sleep(360 * 60)  # 360 minutes = 6 hours between checks

KeyboardInterrupt: 

In [17]:
# The while loop keeps running until days_count reaches max_days. Within the loop, schedule.run_pending() checks whether any scheduled task is due and runs it. time.sleep(360 * 60) then pauses execution for 6 hours before the next check, which avoids unnecessary CPU usage but also means a due task may start up to 6 hours late.


In [None]:
# Reached once the polling loop has ended, i.e. when days_count >= max_days:
# announce the stop and exit with status 0
print("Maximum number of days reached. Stopping the script...")
sys.exit(0)

# For Loop

In [18]:
# Rapidly create a larger data set with a plain for loop (no scheduling)

In [2]:
from datetime import datetime, timedelta
import random
import numpy as np
from faker import Faker

# Shared Faker instance used by generate_fake_data() for names, UUIDs and cities
fake = Faker()

# Bounds for the transaction amount, stored as (upper, lower): the generator
# clips with amount_range[1] as the minimum and amount_range[0] as the maximum
amount_range = (5000, -3000)
amount_mean = -1000  # Mean of the normal distribution the amounts are drawn from

# Maps each customer name to the UUID assigned the first time the name is seen,
# so a returning customer keeps the same ID across calls
name_ids = {}

# Generate fake data
def generate_fake_data(num_rows, existing_customers=None, repeating_customer_prob=0.7):
    """Generate ``num_rows`` fake bank-transaction rows.

    Each row is a list ``[name, unique_id, amount, transaction, access_time,
    location]``. The ID for a name is looked up in (and, for new names,
    stored into) the module-level ``name_ids`` dict, so the same name always
    maps to the same UUID.

    Args:
        num_rows: Number of rows to generate.
        existing_customers: Optional list of known customer names. New names
            are appended to this list in place. When omitted, the pool starts
            from the names already registered in ``name_ids``.
        repeating_customer_prob: Probability that a row reuses a name from
            the pool (when the pool is non-empty) instead of creating a new
            customer.

    Returns:
        A list of ``num_rows`` rows. ``amount`` is an ``int`` drawn from a
        normal distribution (mean ``amount_mean``, standard deviation 1000)
        clipped to ``[amount_range[1], amount_range[0]]``; ``transaction`` is
        ``"withdraw"`` for negative amounts and ``"deposit"`` otherwise;
        ``access_time`` lies up to 365 days, 23 hours and 59 minutes before
        the time of the call; ``location`` is a Faker city name.
    """
    if existing_customers is None:
        # Avoid a mutable ``[]`` default, which is a single hidden list shared
        # by all calls. Deriving the pool from name_ids keeps returning
        # customers across calls while making that state explicit.
        existing_customers = list(name_ids)

    data = []
    current_time = datetime.now()

    for _ in range(num_rows):
        # Determine if it's a new customer or an existing customer
        if existing_customers and random.random() < repeating_customer_prob:
            # Select a random existing customer
            name = random.choice(existing_customers)
        else:
            # Generate a fake name for a new customer and remember it
            name = fake.name()
            existing_customers.append(name)

        # Reuse the ID already assigned to this name, or assign a new one
        unique_id = name_ids.get(name)
        if unique_id is None:
            unique_id = fake.uuid4()
            name_ids[name] = unique_id

        # amount_range is (upper, lower), hence index 1 is the minimum
        amount = int(np.clip(random.normalvariate(amount_mean, 1000), amount_range[1], amount_range[0]))

        transaction = "withdraw" if amount < 0 else "deposit"

        access_time = current_time - timedelta(days=random.randint(0, 365), hours=random.randint(0, 23),
                                               minutes=random.randint(0, 59))
        location = fake.city()

        data.append([name, unique_id, amount, transaction, access_time, location])

    return data


In [13]:
# Number of rows requested from each generate_fake_data() call
num_rows = 100

In [14]:
# Generate and append initial data
# initial_data = generate_fake_data(num_rows)
# df_initial = pd.DataFrame(initial_data, columns=["Name", "ID", 'amount',"Transaction", "Access Time", "Location"])
# df_initial.to_csv(csv_file, mode='w', index=False)

In [15]:

# Generate 100 batches of num_rows fake rows each and stack them with a single
# concat. Concatenating inside the loop would re-copy the accumulated frame on
# every iteration and start from an empty DataFrame for no benefit.
# Each batch keeps its own 0..num_rows-1 index and the integer column labels 0-5.
batches = [pd.DataFrame(generate_fake_data(num_rows)) for _ in range(100)]
data = pd.concat(batches)



In [16]:
# Replace the positional column labels 0-5 with descriptive names, in place
data.rename(
    columns=dict(enumerate(['name', 'id', 'amount', 'transaction', 'access_time', 'town'])),
    inplace=True,
)

In [17]:
# Write the combined data set to data.csv.
# NOTE(review): the DataFrame index is written as an extra first column here,
# unlike the index=False used when generate_and_update_data appends to the same
# file name — confirm this is intended.
data.to_csv('data.csv')