In [1]:
import csv
import random
from datetime import datetime, timedelta

# Function to generate random data for trade count and quote count
def generate_random_trade_count():
    """Return a random integer trade count between 0 and 50, both ends included."""
    lowest, highest = 0, 50
    return random.randint(lowest, highest)

def generate_random_quote_count():
    """Return a random integer quote count between 0 and 50, both ends included."""
    lowest, highest = 0, 50
    return random.randint(lowest, highest)

# Define headers and possible values
headers = ["timestamp", "content name", "environment", "lineID", "trade count", "quote count"]
environments = ["qa", "pt", "dr", "pr"]
content_names = ["onl", "fut", "otc", "wth"]

# Define lineID ranges for each content name
lineID_ranges = {
    "onl": list(range(101, 105)),
    "fut": list(range(105, 109)),
    "otc": list(range(109, 113)),
    "wth": list(range(113, 117))
}

# start_date supplies the first day and the daily opening time (6:30 AM);
# end_date supplies the last day and the daily closing time (1:30 PM).
# Both are naive datetimes, so .timestamp() interprets them in the machine's local time zone.
start_date = datetime(2024, 3, 1, 6, 30, 0)
end_date = datetime(2024, 3, 30, 13, 30, 0)

data = []

# Generate one timestamp per minute from 6:30 AM to 1:30 PM for each day in the period.
# The window is rebuilt day by day: walking a single epoch range from start_date to
# end_date would also emit every overnight minute between one day's 1:30 PM and the
# next day's 6:30 AM.
current_day = start_date.date()
last_day = end_date.date()
while current_day <= last_day:
    day_open = datetime.combine(current_day, start_date.time())
    day_close = datetime.combine(current_day, end_date.time())

    # Step in 60-second increments; "+ 1" keeps the closing minute itself in the range
    for timestamp in range(int(day_open.timestamp()), int(day_close.timestamp()) + 1, 60):
        for content_name in content_names:
            for lineID in lineID_ranges[content_name]:
                # Counts are drawn once per (minute, lineID) and reused below so that
                # every environment reports the same trade and quote counts
                trade_count = generate_random_trade_count()
                quote_count = generate_random_quote_count()

                for environment in environments:
                    data.append([timestamp, content_name, environment, lineID, trade_count, quote_count])

    current_day += timedelta(days=1)

# Specify the file path
file_path =r"E:\transactions_march.csv"

# Write data to CSV file (newline="" lets the csv module control line endings)
with open(file_path, "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(headers)
    writer.writerows(data)

print(f"CSV file generated successfully at: {file_path}")


CSV file generated successfully at: E:\transactions_march.csv


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Read the generated March transactions back from disk and preview the first rows
file_path = r"E:\transactions_march.csv"
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=5)

Unnamed: 0,timestamp,content name,environment,lineID,trade count,quote count
0,1709254800,onl,qa,101,16,8
1,1709254800,onl,pt,101,16,8
2,1709254800,onl,dr,101,16,8
3,1709254800,onl,pr,101,16,8
4,1709254800,onl,qa,102,44,47


In [4]:
# Show the dtype pandas inferred for each column of the loaded frame
column_types = df.dtypes
print(column_types)

timestamp        int64
content name    object
environment     object
lineID           int64
trade count      int64
quote count      int64
dtype: object


In [5]:
# Single-day window: 6:30 AM through 1:30 PM on 31 March 2024
start_date = datetime(2024, 3, 31, 6, 30, 0)
end_date = datetime(2024, 3, 31, 13, 30, 0)

data = []

# Epoch-second bounds of the window, computed once up front
first_epoch = int(start_date.timestamp())
last_epoch = int(end_date.timestamp())

# One sample per minute, with both the opening and the closing minute included
for timestamp in range(first_epoch, last_epoch + 1, 60):
    for content_name in content_names:
        for lineID in lineID_ranges[content_name]:
            # Draw the counts once per (minute, lineID) so every environment gets the same pair
            trade_count = generate_random_trade_count()
            quote_count = generate_random_quote_count()
            data.extend(
                [timestamp, content_name, environment, lineID, trade_count, quote_count]
                for environment in environments
            )

# Destination of the generated file
file_path =r"E:\transactions_march31.csv"

# Header row first, then every generated record
with open(file_path, "w", newline="") as csvfile:
    csv.writer(csvfile).writerows([headers] + data)

print(f"CSV file generated successfully at: {file_path}")


CSV file generated successfully at: E:\transactions_march31.csv


In [6]:
import pandas as pd

# Load the 31 March transactions that will be checked for anomalies
df = pd.read_csv(filepath_or_buffer=r"E:\transactions_march31.csv")

# Function to mark anomalies within each group
def mark_anomalies(group):
    """Add an 'anomaly' column (1 = anomalous, 0 = normal) to ``group`` and return it.

    A row is anomalous when no other row in the group carries the same
    trade count and quote count for the same lineID at the same timestamp.
    The timestamp is part of the comparison key (when the column exists) so
    that a deviating row cannot be hidden by a coincidental match with the
    counts recorded for that line at a different minute.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to check. Must contain 'lineID', 'trade count' and 'quote count';
        'timestamp' is used as an additional key if present.
        The frame is modified in place, so pass a copy to keep the original intact.

    Returns
    -------
    pandas.DataFrame
        The same ``group`` object with the integer 'anomaly' column set.

    Notes
    -----
    Only rows without any matching partner are flagged. If the rows for one
    timestamp/lineID split into several sets that each agree internally
    (for example two against two), none of them is flagged.
    """
    value_cols = ['trade count', 'quote count']

    # Fast path: a single distinct value in both count columns means nothing can disagree
    trade_count_unique = group['trade count'].unique()
    quote_count_unique = group['quote count'].unique()
    if len(trade_count_unique) <= 1 and len(quote_count_unique) <= 1:
        group['anomaly'] = 0
        return group

    # Compare rows only against rows of the same minute and line
    key_cols = ['lineID']
    if 'timestamp' in group.columns:
        key_cols.insert(0, 'timestamp')

    # keep=False marks every member of a duplicate set, so the inverse selects
    # exactly the rows that have no matching partner
    has_partner = group.duplicated(subset=key_cols + value_cols, keep=False)
    group['anomaly'] = (~has_partner).astype(int)

    return group

# Mark each lineID partition independently; passing a copy keeps df itself unmodified
group_list = [
    mark_anomalies(partition.copy())
    for _key, partition in df.groupby(['lineID'], sort=False)
]

# Stitch the partitions back together and restore the original row order via the index
updated_df = pd.concat(group_list).sort_index()

# Persist the marked data without the index column
updated_df.to_csv(r"E:\anomalies.csv", index=False)

print("Anomalies checked and updated with correct marking in the CSV file.")


Anomalies checked and updated with correct marking in the CSV file.


In [7]:
import pandas as pd

# Keep only the rows that were flagged (anomaly == 1)
is_flagged = updated_df['anomaly'].eq(1)
anomaly_df = updated_df.loc[is_flagged]

# Write the flagged rows to their own CSV file, omitting the index column
anomaly_df.to_csv(r"E:\anomaly_rows.csv", index=False)
print("Anomaly rows saved to 'anomaly_rows.csv'.")


Anomaly rows saved to 'anomaly_rows.csv'.
