In [1]:
import pandas as pd
import numpy as np
import re
from typing import List

## Using Raw Excel File as Source

In [2]:
# Load Excel file with pipe-separated data
excel_file = 'data/IncrementalData.xlsx'
df  = pd.read_excel(excel_file)
df.head()

Unnamed: 0,col|col|col|col|col|col|col|col|col|col|col|col|col|col|col
0,2023-10-28 12:00:00|2023-10-28 12:15:00|42|17|...
1,2023-10-28 12:01:00|2023-10-28 12:16:00|21|8|7...
2,2023-10-28 12:02:00|2023-10-28 12:17:00|60|33|...
3,2023-10-28 12:03:00|2023-10-28 12:18:00|12|5|6...
4,2023-10-28 12:04:00|2023-10-28 12:19:00|99|24|...


In [3]:
# NOTE(review): `describe` is referenced here without call parentheses, so the cell
# displays the bound method's repr (which embeds the frame's own repr) instead of
# summary statistics. Use `df.describe()` to obtain the statistics.
df.describe

<bound method NDFrame.describe of    col|col|col|col|col|col|col|col|col|col|col|col|col|col|col
0   2023-10-28 12:00:00|2023-10-28 12:15:00|42|17|...         
1   2023-10-28 12:01:00|2023-10-28 12:16:00|21|8|7...         
2   2023-10-28 12:02:00|2023-10-28 12:17:00|60|33|...         
3   2023-10-28 12:03:00|2023-10-28 12:18:00|12|5|6...         
4   2023-10-28 12:04:00|2023-10-28 12:19:00|99|24|...         
..                                                ...         
62  2023-10-28 13:02:00|2023-10-28 13:17:00|73|37|...         
63  2023-10-28 13:03:00|2023-10-28 13:18:00|47|22|...         
64  2023-10-28 13:04:00|2023-10-28 13:19:00|91|39|...         
65  2023-10-28 13:05:00|2023-10-28 13:20:00|34|15|...         
66  2023-10-28 13:06:00|2023-10-28 13:21:00|76|38|...         

[67 rows x 1 columns]>

In [4]:
# Select the first three rows
df.iloc[0:3]

Unnamed: 0,col|col|col|col|col|col|col|col|col|col|col|col|col|col|col
0,2023-10-28 12:00:00|2023-10-28 12:15:00|42|17|...
1,2023-10-28 12:01:00|2023-10-28 12:16:00|21|8|7...
2,2023-10-28 12:02:00|2023-10-28 12:17:00|60|33|...


In [5]:
# Select the first column
df.iloc[:, 0]

0     2023-10-28 12:00:00|2023-10-28 12:15:00|42|17|...
1     2023-10-28 12:01:00|2023-10-28 12:16:00|21|8|7...
2     2023-10-28 12:02:00|2023-10-28 12:17:00|60|33|...
3     2023-10-28 12:03:00|2023-10-28 12:18:00|12|5|6...
4     2023-10-28 12:04:00|2023-10-28 12:19:00|99|24|...
                            ...                        
62    2023-10-28 13:02:00|2023-10-28 13:17:00|73|37|...
63    2023-10-28 13:03:00|2023-10-28 13:18:00|47|22|...
64    2023-10-28 13:04:00|2023-10-28 13:19:00|91|39|...
65    2023-10-28 13:05:00|2023-10-28 13:20:00|34|15|...
66    2023-10-28 13:06:00|2023-10-28 13:21:00|76|38|...
Name: col|col|col|col|col|col|col|col|col|col|col|col|col|col|col, Length: 67, dtype: object

In [6]:
# Select the value at row index 15 (the 16th row) in the first column
df.iloc[15, 0]

'2023-10-28 12:15:00|2023-10-28 12:30:00|31|14|55|Supply Chain Management|17482|Col-7|45%|Testing✓|Financial|Data|Multilingual@Data'

In [7]:
# Split the data in the column using the pipe delimiter
split_data = df.iloc[:, 0].str.split('|')
split_data

0     [2023-10-28 12:00:00, 2023-10-28 12:15:00, 42,...
1     [2023-10-28 12:01:00, 2023-10-28 12:16:00, 21,...
2     [2023-10-28 12:02:00, 2023-10-28 12:17:00, 60,...
3     [2023-10-28 12:03:00, 2023-10-28 12:18:00, 12,...
4     [2023-10-28 12:04:00, 2023-10-28 12:19:00, 99,...
                            ...                        
62    [2023-10-28 13:02:00, 2023-10-28 13:17:00, 73,...
63    [2023-10-28 13:03:00, 2023-10-28 13:18:00, 47,...
64    [2023-10-28 13:04:00, 2023-10-28 13:19:00, 91,...
65    [2023-10-28 13:05:00, 2023-10-28 13:20:00, 34,...
66    [2023-10-28 13:06:00, 2023-10-28 13:21:00, 76,...
Name: col|col|col|col|col|col|col|col|col|col|col|col|col|col|col, Length: 67, dtype: object

In [8]:
# Make sure to place each element in a separate column by setting expand to True
split_data = df.iloc[:, 0].str.split('|', expand=True)
split_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,2023-10-28 12:00:00,2023-10-28 12:15:00,42,17,99,Product Launch,12345,"Привет, мир!",100%,Déjà vu,Pipe,Separated,Data,With,Special@Characters
1,2023-10-28 12:01:00,2023-10-28 12:16:00,21,8,76,Financial Report,67890,Data#Analytics,80%,Testing✓,Market,Research,Data,Analysis@Sample,
2,2023-10-28 12:02:00,2023-10-28 12:17:00,60,33,42,Machine Learning,13579,特殊*字符,50%,Übermensch✓,Big Data,Processing,Advanced,AI@Tech,
3,2023-10-28 12:03:00,2023-10-28 12:18:00,12,5,63,Sales Forecast,24680,Prüfung!Daten,10%,Testing✓,Customer,Support,Multilingual@Data,,
4,2023-10-28 12:04:00,2023-10-28 12:19:00,99,24,37,Quarterly Report,10203,Ειδικός^Στήλη,90%,Déjà vu,Data,With,Специални^Символи,For&Analysis,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,2023-10-28 13:02:00,2023-10-28 13:17:00,73,37,84,Sales Performance,92481,Col-7,70%,Testing✓,Sales,Performance,Sales@Data,,
63,2023-10-28 13:03:00,2023-10-28 13:18:00,47,22,72,Annual Report,71736,العربية@عنوان,40%,Übermensch✓,Market,Data,Annual@Data,,
64,2023-10-28 13:04:00,2023-10-28 13:19:00,91,39,68,Customer Feedback,68429,Prüfung!Daten,85%,Testing✓,Customer,Feedback,Customer@Data,,
65,2023-10-28 13:05:00,2023-10-28 13:20:00,34,15,61,Business Strategy,62317,Süper^Özel,60%,Übermensch✓,Business,Strategy,Business@Data,,


In [9]:
# Create a new DataFrame by concatenating the split data
new_df = pd.concat([df, split_data], axis=1)
new_df

Unnamed: 0,col|col|col|col|col|col|col|col|col|col|col|col|col|col|col,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,2023-10-28 12:00:00|2023-10-28 12:15:00|42|17|...,2023-10-28 12:00:00,2023-10-28 12:15:00,42,17,99,Product Launch,12345,"Привет, мир!",100%,Déjà vu,Pipe,Separated,Data,With,Special@Characters
1,2023-10-28 12:01:00|2023-10-28 12:16:00|21|8|7...,2023-10-28 12:01:00,2023-10-28 12:16:00,21,8,76,Financial Report,67890,Data#Analytics,80%,Testing✓,Market,Research,Data,Analysis@Sample,
2,2023-10-28 12:02:00|2023-10-28 12:17:00|60|33|...,2023-10-28 12:02:00,2023-10-28 12:17:00,60,33,42,Machine Learning,13579,特殊*字符,50%,Übermensch✓,Big Data,Processing,Advanced,AI@Tech,
3,2023-10-28 12:03:00|2023-10-28 12:18:00|12|5|6...,2023-10-28 12:03:00,2023-10-28 12:18:00,12,5,63,Sales Forecast,24680,Prüfung!Daten,10%,Testing✓,Customer,Support,Multilingual@Data,,
4,2023-10-28 12:04:00|2023-10-28 12:19:00|99|24|...,2023-10-28 12:04:00,2023-10-28 12:19:00,99,24,37,Quarterly Report,10203,Ειδικός^Στήλη,90%,Déjà vu,Data,With,Специални^Символи,For&Analysis,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,2023-10-28 13:02:00|2023-10-28 13:17:00|73|37|...,2023-10-28 13:02:00,2023-10-28 13:17:00,73,37,84,Sales Performance,92481,Col-7,70%,Testing✓,Sales,Performance,Sales@Data,,
63,2023-10-28 13:03:00|2023-10-28 13:18:00|47|22|...,2023-10-28 13:03:00,2023-10-28 13:18:00,47,22,72,Annual Report,71736,العربية@عنوان,40%,Übermensch✓,Market,Data,Annual@Data,,
64,2023-10-28 13:04:00|2023-10-28 13:19:00|91|39|...,2023-10-28 13:04:00,2023-10-28 13:19:00,91,39,68,Customer Feedback,68429,Prüfung!Daten,85%,Testing✓,Customer,Feedback,Customer@Data,,
65,2023-10-28 13:05:00|2023-10-28 13:20:00|34|15|...,2023-10-28 13:05:00,2023-10-28 13:20:00,34,15,61,Business Strategy,62317,Süper^Özel,60%,Übermensch✓,Business,Strategy,Business@Data,,


In [10]:
# Drop the original column
new_df = new_df.drop(df.columns[0], axis=1)
new_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,2023-10-28 12:00:00,2023-10-28 12:15:00,42,17,99,Product Launch,12345,"Привет, мир!",100%,Déjà vu,Pipe,Separated,Data,With,Special@Characters
1,2023-10-28 12:01:00,2023-10-28 12:16:00,21,8,76,Financial Report,67890,Data#Analytics,80%,Testing✓,Market,Research,Data,Analysis@Sample,
2,2023-10-28 12:02:00,2023-10-28 12:17:00,60,33,42,Machine Learning,13579,特殊*字符,50%,Übermensch✓,Big Data,Processing,Advanced,AI@Tech,
3,2023-10-28 12:03:00,2023-10-28 12:18:00,12,5,63,Sales Forecast,24680,Prüfung!Daten,10%,Testing✓,Customer,Support,Multilingual@Data,,
4,2023-10-28 12:04:00,2023-10-28 12:19:00,99,24,37,Quarterly Report,10203,Ειδικός^Στήλη,90%,Déjà vu,Data,With,Специални^Символи,For&Analysis,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,2023-10-28 13:02:00,2023-10-28 13:17:00,73,37,84,Sales Performance,92481,Col-7,70%,Testing✓,Sales,Performance,Sales@Data,,
63,2023-10-28 13:03:00,2023-10-28 13:18:00,47,22,72,Annual Report,71736,العربية@عنوان,40%,Übermensch✓,Market,Data,Annual@Data,,
64,2023-10-28 13:04:00,2023-10-28 13:19:00,91,39,68,Customer Feedback,68429,Prüfung!Daten,85%,Testing✓,Customer,Feedback,Customer@Data,,
65,2023-10-28 13:05:00,2023-10-28 13:20:00,34,15,61,Business Strategy,62317,Süper^Özel,60%,Übermensch✓,Business,Strategy,Business@Data,,


## Set Column Names

In [11]:
# Check the existing column names
new_df.columns

Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], dtype='object')

In [12]:
# Take a look at the first row
new_df.iloc[0]

0     2023-10-28 12:00:00
1     2023-10-28 12:15:00
2                      42
3                      17
4                      99
5          Product Launch
6                   12345
7            Привет, мир!
8                    100%
9                 Déjà vu
10                   Pipe
11              Separated
12                   Data
13                   With
14     Special@Characters
Name: 0, dtype: object

In [13]:
new_df.iloc[0, 0]

'2023-10-28 12:00:00'

In [14]:
# Define column type prefixes following the layout: type name -> [regex pattern, prefix].
# Patterns are tried in the order they are listed here.
# The timestamp pattern has an optional group for fractional seconds (1 to 9 digits),
# e.g. 2023-10-28 12:00:00.123456
type_prefixes = {
    'timestamp': [r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(\.\d{1,9})?$', 'TMSTP'],
    'integer': [r'^\d+$', 'INT'],
    'date': [r'^\d{2,4}-\d{2}-\d{2,4}$', 'DT']
}

In [15]:
# Generate column names with type prefixes
column_names = []

# Get column type with consideration for missing values
def get_column_type(column, prefixes=None):
    """
    Classify a single cell value and return the prefix of the first matching type.

    Parameters:
    column: The cell value to classify. Strings are matched as-is; any other
        non-missing value (e.g. an int or a Timestamp) is converted with str()
        first, so that regex matching never raises TypeError.
    prefixes (dict, optional): Mapping of type name -> [regex pattern, prefix].
        Patterns are tried in mapping order. Defaults to the notebook-level
        `type_prefixes` table.

    Returns:
    str: 'MISSING' when the value is missing (None / NaN / NaT), the prefix of
        the first pattern that matches, or 'STR' when no pattern matches.
    """
    # Missing cells never reach the regexes
    if pd.isna(column):
        return 'MISSING'

    if prefixes is None:
        prefixes = type_prefixes

    # re.match only accepts strings; coerce typed cells instead of crashing
    text = column if isinstance(column, str) else str(column)

    for pattern, prefix in prefixes.values():
        if re.match(pattern, text):
            return prefix
    return 'STR' # Default prefix for values that match none of the patterns

In [16]:
new_df.shape

(67, 15)

In [17]:
num_rows = new_df.shape[0]
num_rows

67

In [18]:
new_df.iloc[0]

0     2023-10-28 12:00:00
1     2023-10-28 12:15:00
2                      42
3                      17
4                      99
5          Product Launch
6                   12345
7            Привет, мир!
8                    100%
9                 Déjà vu
10                   Pipe
11              Separated
12                   Data
13                   With
14     Special@Characters
Name: 0, dtype: object

In [19]:
new_df.iloc[0].to_list()

['2023-10-28 12:00:00',
 '2023-10-28 12:15:00',
 '42',
 '17',
 '99',
 'Product Launch',
 '12345',
 'Привет, мир!',
 '100%',
 'Déjà vu',
 'Pipe',
 'Separated',
 'Data',
 'With',
 'Special@Characters']

In [20]:
# Split the first row into individual columns
for i in range(num_rows):
    current_row = new_df.iloc[i].to_list()
    print(current_row)
    break

['2023-10-28 12:00:00', '2023-10-28 12:15:00', '42', '17', '99', 'Product Launch', '12345', 'Привет, мир!', '100%', 'Déjà vu', 'Pipe', 'Separated', 'Data', 'With', 'Special@Characters']


### Version 1

In [21]:
# Derive the typed column names from the first row that has a value in every column.
all_non_missing_found = False # Becomes True once a fully populated row is found
column_names = []

for i in range(num_rows):
    current_row = new_df.iloc[i].to_list()

    # Classify every cell first; the row is only usable if none of them is missing
    column_types = [get_column_type(column) for column in current_row]
    if 'MISSING' in column_types:
        continue # A row with a missing cell cannot name every column

    column_names = [
        f"{column_type}_col_{index}"
        for index, column_type in enumerate(column_types)
    ]
    all_non_missing_found = True

    # Print the row number
    print(f"current row number: {i}")
    print(column_names)
    break # Exit the loop once a row with all non-missing values is found

if not all_non_missing_found:
    # Fail loudly instead of continuing with an empty or partial list of names
    raise ValueError("No row without missing values found; cannot derive column names")

current row number: 0
['TMSTP_col_0', 'TMSTP_col_1', 'INT_col_2', 'INT_col_3', 'INT_col_4', 'STR_col_5', 'INT_col_6', 'STR_col_7', 'STR_col_8', 'STR_col_9', 'STR_col_10', 'STR_col_11', 'STR_col_12', 'STR_col_13', 'STR_col_14']


### Version 2 - Cleaner Version

In [22]:
# Derive the typed column names from the first row that has no missing values.
column_names = []

for i in range(num_rows):
    current_row = new_df.iloc[i].to_list()

    # Classify the whole row up front so a missing cell rejects the row,
    # rather than silently shortening the list of names
    column_types = [get_column_type(column) for column in current_row]

    if 'MISSING' not in column_types:
        column_names = [
            f"{column_type}_col_{index}"
            for index, column_type in enumerate(column_types)
        ]
        print(f"current row number: {i}")
        print(column_names)
        break # Exit the loop if no missing values found in the current row
else:
    # Reached only when no row qualified (or the frame is empty)
    raise ValueError("No row without missing values found; cannot derive column names")


current row number: 0
['TMSTP_col_0', 'TMSTP_col_1', 'INT_col_2', 'INT_col_3', 'INT_col_4', 'STR_col_5', 'INT_col_6', 'STR_col_7', 'STR_col_8', 'STR_col_9', 'STR_col_10', 'STR_col_11', 'STR_col_12', 'STR_col_13', 'STR_col_14']


In [23]:
new_df.columns = column_names
new_df.head()

Unnamed: 0,TMSTP_col_0,TMSTP_col_1,INT_col_2,INT_col_3,INT_col_4,STR_col_5,INT_col_6,STR_col_7,STR_col_8,STR_col_9,STR_col_10,STR_col_11,STR_col_12,STR_col_13,STR_col_14
0,2023-10-28 12:00:00,2023-10-28 12:15:00,42,17,99,Product Launch,12345,"Привет, мир!",100%,Déjà vu,Pipe,Separated,Data,With,Special@Characters
1,2023-10-28 12:01:00,2023-10-28 12:16:00,21,8,76,Financial Report,67890,Data#Analytics,80%,Testing✓,Market,Research,Data,Analysis@Sample,
2,2023-10-28 12:02:00,2023-10-28 12:17:00,60,33,42,Machine Learning,13579,特殊*字符,50%,Übermensch✓,Big Data,Processing,Advanced,AI@Tech,
3,2023-10-28 12:03:00,2023-10-28 12:18:00,12,5,63,Sales Forecast,24680,Prüfung!Daten,10%,Testing✓,Customer,Support,Multilingual@Data,,
4,2023-10-28 12:04:00,2023-10-28 12:19:00,99,24,37,Quarterly Report,10203,Ειδικός^Στήλη,90%,Déjà vu,Data,With,Специални^Символи,For&Analysis,


## Check For Non-Latin Characters

In [24]:
new_df.iloc[0]

TMSTP_col_0    2023-10-28 12:00:00
TMSTP_col_1    2023-10-28 12:15:00
INT_col_2                       42
INT_col_3                       17
INT_col_4                       99
STR_col_5           Product Launch
INT_col_6                    12345
STR_col_7             Привет, мир!
STR_col_8                     100%
STR_col_9                  Déjà vu
STR_col_10                    Pipe
STR_col_11               Separated
STR_col_12                    Data
STR_col_13                    With
STR_col_14      Special@Characters
Name: 0, dtype: object

In [25]:
# Preview: report non-ASCII text among the first five rows of every column
for column_index, column_name in enumerate(new_df.columns):
    preview = new_df.iloc[:, column_index].iloc[:5]
    for i, value in enumerate(preview):
        # Only string cells containing at least one non-ASCII character are reported
        if not isinstance(value, str) or value.isascii():
            continue
        print(f"Non-Latin character '{value}' found in '{column_name}' column at row num: {i + 1}")

Non-Latin character 'Привет, мир!' found in 'STR_col_7' column at row num: 1
Non-Latin character '特殊*字符' found in 'STR_col_7' column at row num: 3
Non-Latin character 'Prüfung!Daten' found in 'STR_col_7' column at row num: 4
Non-Latin character 'Ειδικός^Στήλη' found in 'STR_col_7' column at row num: 5
Non-Latin character 'Déjà vu' found in 'STR_col_9' column at row num: 1
Non-Latin character 'Testing✓' found in 'STR_col_9' column at row num: 2
Non-Latin character 'Übermensch✓' found in 'STR_col_9' column at row num: 3
Non-Latin character 'Testing✓' found in 'STR_col_9' column at row num: 4
Non-Latin character 'Déjà vu' found in 'STR_col_9' column at row num: 5
Non-Latin character 'Специални^Символи' found in 'STR_col_12' column at row num: 5


In [26]:
def check_non_latin_characters(dataframe: pd.DataFrame) -> List[str]:
    """
    Report which columns of a DataFrame contain text outside the ASCII range.

    Every column is scanned by position. A column is reported as soon as one of
    its cells is a string for which `str.isascii()` is False; non-string cells
    are ignored. Because the test is ASCII-based, accented Latin letters and
    symbols are flagged as well as other scripts.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame to scan.

    Returns:
    List[str]: Labels of the flagged columns, in column order.

    Example Usage:
    non_ascii_columns = check_non_latin_characters(new_df)
    for column_name in non_ascii_columns:
        print(f"Non-Latin characters found in '{column_name}' column.")
    """
    def _has_non_ascii(cells) -> bool:
        # Stops at the first offending cell
        return any(isinstance(cell, str) and not cell.isascii() for cell in cells)

    return [
        label
        for position, label in enumerate(dataframe.columns)
        if _has_non_ascii(dataframe.iloc[:, position])
    ]

non_ascii_columns = check_non_latin_characters(new_df)
for column_name in non_ascii_columns:
    print(f"Non-Latin characters found in '{column_name}' column.")


Non-Latin characters found in 'STR_col_7' column.
Non-Latin characters found in 'STR_col_9' column.
Non-Latin characters found in 'STR_col_12' column.
