#### Data Loading and Initial Profiling

#### 1: Imports and Setup

In [2]:
import pandas as pd
import numpy as np
import os
from datetime import datetime

# Confirm the imports ran and show the directory that relative paths resolve against.
print(
    "Libraries imported successfully.",
    f"Current working directory: {os.getcwd()}",
    sep="\n",
)

Libraries imported successfully.
Current working directory: C:\Users\Eldu\Desktop\data_val_pipe


#### 2: Load the Raw Dataset

In [3]:
# Location of the raw noisy dataset, relative to the current working directory.
file_path = 'raw_market_data_with_errors.csv'

# Stop immediately, with a hint about the prerequisite step, if the CSV is absent.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}. Please ensure Phase 1 was completed.")

# Parse the 'timestamp' column as datetimes while reading.
df = pd.read_csv(file_path, parse_dates=['timestamp'])

# Quick sanity summary: size, covered time range, and a peek at the first rows.
print("Raw market data loaded successfully.")
print(f"Loaded {df.shape[0]:,} rows and {df.shape[1]} columns.")
print("Timestamp range: {} to {}".format(df['timestamp'].min(), df['timestamp'].max()))
print("\nFirst 10 rows:", df.head(10), sep="\n")

Raw market data loaded successfully.
Loaded 20,200 rows and 4 columns.
Timestamp range: 2025-06-01 09:30:00 to 2025-06-01 16:17:27

First 10 rows:
                      timestamp exchange       price  volume
0 2025-06-01 09:30:00.000000000     BATS  150.000507     288
1 2025-06-01 09:30:01.170058502   NASDAQ  150.000378      81
2 2025-06-01 09:30:02.340117005   NASDAQ  150.001036      18
3 2025-06-01 09:30:03.510175508   NASDAQ  150.002569     196
4 2025-06-01 09:30:04.680234011     NYSE  150.002345     291
5 2025-06-01 09:30:05.850292514     NYSE  150.002121     360
6 2025-06-01 09:30:07.020351017   NASDAQ  150.003710     356
7 2025-06-01 09:30:08.190409520     NYSE  150.004488     279
8 2025-06-01 09:30:09.360468023     NYSE  150.004028     463
9 2025-06-01 09:30:10.530526526   NASDAQ  150.004581     397


#### 3: Basic Information and Data Types

In [4]:
# Show per-column non-null counts, dtypes and memory usage.
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called bare; wrapping it in print() would emit a stray "None" line.
print("Data types and memory usage:")
df.info()

# Compact dtype listing, one line per column.
print("\nColumn details:")
print(df.dtypes)

Data types and memory usage:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20200 entries, 0 to 20199
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   timestamp  20200 non-null  datetime64[ns]
 1   exchange   20200 non-null  object        
 2   price      19592 non-null  float64       
 3   volume     20200 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 631.4+ KB
None

Column details:
timestamp    datetime64[ns]
exchange             object
price               float64
volume                int64
dtype: object


#### 4: Comprehensive Initial Profiling Report

In [13]:
# Header of the profiling report: generation time, source file, row count and time span.
# The timestamp bounds are computed once and reused for both the span and the duration.
first_ts = df['timestamp'].min()
last_ts = df['timestamp'].max()

print("="*30)
print("INITIAL DATA PROFILING REPORT")
print("="*30)
print(f"Report generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
# Derive the dataset name from file_path (set in the loading cell) so the report
# always names the file that was actually read.
print(f"Dataset: {os.path.basename(file_path)}")
print(f"Total rows: {len(df):,}")
print(f"Time span: {first_ts} → {last_ts}")
print(f"Duration: {last_ts - first_ts}")
print()

INITIAL DATA PROFILING REPORT
Report generated on: 2025-12-31 12:17:48
Dataset: raw_market_data_with_errors.csv
Total rows: 20,200
Time span: 2025-06-01 09:30:00 → 2025-06-01 16:17:27
Duration: 0 days 06:47:27



##### Missing values

In [6]:
# Count NaNs per column, express them as a percentage of all rows,
# and list only the columns that actually have gaps.
print("MISSING VALUES")
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100)
missing_df = pd.DataFrame({'Missing Count': missing, 'Percentage': missing_pct.round(2)})
print(missing_df.loc[missing_df['Missing Count'].gt(0)])
print()

MISSING VALUES
       Missing Count  Percentage
price            608        3.01



##### Duplicates

In [7]:
# Rows identical to an earlier row in every column (the first occurrence is not counted).
duplicates = df.duplicated(keep='first').sum()
print("DUPLICATE ROWS: {} ({:.2f}%)".format(duplicates, duplicates / len(df) * 100))
print()

DUPLICATE ROWS: 200 (0.99%)



##### Invalid values (negative or zero prices, zero-volume trades)

In [8]:
# Count values that are present but implausible: prices below or equal to zero
# and trades with zero volume. NaN prices compare False and are not counted here.
print("INVALID VALUES")
print("Negative prices: {}".format(df['price'].lt(0).sum()))
print("Zero prices: {}".format(df['price'].eq(0).sum()))
print("Zero volume trades: {}".format(df['volume'].eq(0).sum()))
print()

INVALID VALUES
Negative prices: 20
Zero prices: 0
Zero volume trades: 43



##### Basic statistics for numerical columns

In [9]:
# Descriptive statistics for the two numeric columns, rounded to 4 decimals.
print("NUMERICAL SUMMARY (price and volume)")
print(df.loc[:, ['price', 'volume']].describe().round(4))
print()

NUMERICAL SUMMARY (price and volume)
            price      volume
count  19592.0000  20200.0000
mean     158.0666    248.6335
std      126.8815    144.5025
min     -150.2833      0.0000
25%      150.0596    123.0000
50%      150.0868    247.0000
75%      150.1653    375.0000
max     3002.5212    499.0000



##### Exchange distribution

In [10]:
# Row counts per exchange alongside each exchange's share of all rows (in percent).
print("EXCHANGE DISTRIBUTION")
exchange_dist = df['exchange'].value_counts()
exchange_pct = exchange_dist.div(len(df)).mul(100).round(2)
print(pd.DataFrame({'Count': exchange_dist, 'Percentage': exchange_pct}))
print()

EXCHANGE DISTRIBUTION
          Count  Percentage
exchange                   
NYSE       8103       40.11
NASDAQ     6067       30.03
BATS       4015       19.88
ARCA       2015        9.98



##### Timestamp regularity check (detect gaps > 10 seconds)

In [11]:
# Order rows chronologically, then measure the spacing (in seconds)
# between each timestamp and the one before it.
df_sorted = df.sort_values(by='timestamp')
time_diffs = df_sorted['timestamp'].diff().dt.total_seconds()

# Keep only spacings strictly longer than 10 seconds.
large_gaps = time_diffs.loc[time_diffs.gt(10)]
print("TIMESTAMP GAPS > 10 seconds: {} gaps detected".format(len(large_gaps)))
if not large_gaps.empty:
    print("Largest gaps (seconds):")
    print(large_gaps.nlargest(5).to_numpy())

TIMESTAMP GAPS > 10 seconds: 5 gaps detected
Largest gaps (seconds):
[298.1700585 226.1700585 195.1700585 168.1700585 165.1700585]


#### 5: Save Profiling Log

In [16]:
# Write the headline data-quality counts to a plain-text log file.
# NOTE: `large_gaps` is produced by the timestamp regularity cell, which must run first.
# The dataset name is derived from file_path (set in the loading cell) so the log
# always names the file that was actually read.
log_content = f"""
Initial Profiling Report - {datetime.now().strftime('%Y-%m-%d %H:%M')}
Dataset: {os.path.basename(file_path)}
Rows: {len(df):,}
Missing prices: {df['price'].isna().sum()}
Duplicates: {df.duplicated().sum()}
Negative prices: {(df['price'] < 0).sum()}
Zero volumes: {(df['volume'] == 0).sum()}
Large timestamp gaps: {len(large_gaps)}
"""

# Explicit encoding so the log does not depend on the platform's default text encoding.
with open('profiling_log.txt', 'w', encoding='utf-8') as log_file:
    log_file.write(log_content)

print("Profiling summary saved to 'profiling_log.txt'")

Profiling summary saved to 'profiling_log.txt'
