In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Read the Excel workbook; its cells contain pipe-separated records
excel_file = 'data/IncrementalData.xlsx'
df = pd.read_excel(io=excel_file)
# Preview the first few rows as loaded
df.head()

Unnamed: 0,col|col|col|col|col|col|col|col|col|col|col|col|col|col|col
0,2023-10-28 12:00:00|2023-10-28 12:15:00|42|17|...
1,2023-10-28 12:01:00|2023-10-28 12:16:00|21|8|7...
2,2023-10-28 12:02:00|2023-10-28 12:17:00|60|33|...
3,2023-10-28 12:03:00|2023-10-28 12:18:00|12|5|6...
4,2023-10-28 12:04:00|2023-10-28 12:19:00|99|24|...


In [3]:
# Peek at the raw, unsplit value stored in the first row / first column
df.iat[0, 0]

'2023-10-28 12:00:00|2023-10-28 12:15:00|42|17|99|Product Launch|12345|Привет, мир!|100%|Déjà vu|Pipe|Separated|Data|With|Special@Characters'

In [4]:
# Count the fields in the first row: splitting on '|' always yields exactly
# one more piece than there are pipe characters
num_columns = df.iat[0, 0].count('|') + 1
num_columns

15

In [5]:
# Break the first row's raw string into its pipe-delimited field values
columns = df.iat[0, 0].split(sep='|')
columns

['2023-10-28 12:00:00',
 '2023-10-28 12:15:00',
 '42',
 '17',
 '99',
 'Product Launch',
 '12345',
 'Привет, мир!',
 '100%',
 'Déjà vu',
 'Pipe',
 'Separated',
 'Data',
 'With',
 'Special@Characters']

In [6]:
# Map each recognised data type to a [regex pattern, column-name prefix] pair.
# The timestamp pattern ends with an optional group so that values carrying
# fractional seconds (e.g. 2023-10-28 12:00:00.123456) are still recognised.
type_prefixes = dict(
    timestamp=[r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(\.\d{1,9})?$', 'TMSTP'],
    integer=[r'^\d+$', 'INT'],
    date=[r'^\d{2,4}-\d{2}-\d{2,4}$', 'DT'],
)

In [7]:
# Holds the generated, type-prefixed column names; it starts empty here and is
# populated in a later cell.
column_names = []

def get_column_type(column, patterns=None):
    """Return the column-name prefix for the data type that `column` looks like.

    Parameters
    ----------
    column : str
        A single raw cell value.
    patterns : dict, optional
        Mapping of type name -> [regex pattern, prefix]. Defaults to the
        notebook-level `type_prefixes`. Patterns are tried in the mapping's
        iteration order and the first one that matches wins.

    Returns
    -------
    str
        The prefix of the first pattern that matches the whole value, or
        'STR' when none of them match.
    """
    if patterns is None:
        patterns = type_prefixes
    for pattern, prefix in patterns.values():
        # fullmatch rather than match: with re.match a trailing '$' also
        # accepts a value that ends in a newline (e.g. '42\n'), which would
        # then be mislabelled as a typed column.
        if re.fullmatch(pattern, column):
            return prefix
    return 'STR'

In [8]:
# Print the type prefix inferred for every first-row value, one per line
for col in columns:
    inferred_prefix = get_column_type(col)
    print(inferred_prefix)

TMSTP
TMSTP
INT
INT
INT
STR
INT
STR
STR
STR
STR
STR
STR
STR
STR


In [9]:
# List every first-row value next to its zero-based position
for i, col in enumerate(columns):
    print(f"{i} {col}")

0 2023-10-28 12:00:00
1 2023-10-28 12:15:00
2 42
3 17
4 99
5 Product Launch
6 12345
7 Привет, мир!
8 100%
9 Déjà vu
10 Pipe
11 Separated
12 Data
13 With
14 Special@Characters


In [10]:
# Build "<PREFIX>_col_<position>" names from the first-row values.
# The list is rebuilt from scratch on every run so that re-executing this cell
# yields the same result instead of appending a second copy of every name.
column_names = [
    f"{get_column_type(col)}_col_{i}"
    for i, col in enumerate(columns)
]

column_names

['TMSTP_col_0',
 'TMSTP_col_1',
 'INT_col_2',
 'INT_col_3',
 'INT_col_4',
 'STR_col_5',
 'INT_col_6',
 'STR_col_7',
 'STR_col_8',
 'STR_col_9',
 'STR_col_10',
 'STR_col_11',
 'STR_col_12',
 'STR_col_13',
 'STR_col_14']