In [1]:
import pandas as pd
import numpy as np
import re
from typing import List

# Using CSV File as Source

In [2]:
# Load the source file: pipe-delimited, with its first line used as the header row
csv_file = 'data/IncrementalData.csv'
df = pd.read_csv(csv_file, header=0, sep='|')
# Preview the first rows
df.head()

Unnamed: 0,col,col.1,col.2,col.3,col.4,col.5,col.6,col.7,col.8,col.9,col.10,col.11,col.12,col.13,col.14
0,2023-10-28 12:00:00,2023-10-28 12:15:00,42,17,99,Product Launch,12345,"Привет, мир!",100%,Déjà vu,Pipe,Separated,Data,With,Special@Characters
1,2023-10-28 12:01:00,2023-10-28 12:16:00,21,8,76,Financial Report,67890,Data#Analytics,80%,Testing✓,Market,Research,Data,Analysis@Sample,
2,2023-10-28 12:02:00,2023-10-28 12:17:00,60,33,42,Machine Learning,13579,特殊*字符,50%,Übermensch✓,Big Data,Processing,Advanced,AI@Tech,
3,2023-10-28 12:03:00,2023-10-28 12:18:00,12,5,63,Sales Forecast,24680,Prüfung!Daten,10%,Testing✓,Customer,Support,Multilingual@Data,,
4,2023-10-28 12:04:00,2023-10-28 12:19:00,99,24,37,Quarterly Report,10203,Ειδικός^Στήλη,90%,Déjà vu,Data,With,Специални^Символи,For&Analysis,


In [3]:
# Column dtypes as inferred by read_csv (no conversions applied yet)
df.dtypes

col       object
col.1     object
col.2      int64
col.3      int64
col.4      int64
col.5     object
col.6      int64
col.7     object
col.8     object
col.9     object
col.10    object
col.11    object
col.12    object
col.13    object
col.14    object
dtype: object

In [4]:
# Get summary statistics (count, mean, std, min, quartiles, max).
# As the output shows, only the numeric columns are summarised.
df.describe()

Unnamed: 0,col.2,col.3,col.4,col.6
count,67.0,67.0,67.0,67.0
mean,50.970149,22.402985,67.044776,53804.567164
std,23.43073,10.159426,14.736507,28925.554805
min,12.0,5.0,37.0,10203.0
25%,32.0,14.0,56.5,28206.5
50%,47.0,21.0,68.0,54321.0
75%,71.5,31.0,76.0,79913.5
max,99.0,40.0,99.0,98765.0


In [5]:
# Column labels as loaded.
# NOTE(review): the '.N' suffixes look like pandas de-duplicating repeated 'col'
# headers in the file -- confirm against the raw CSV.
df.columns

Index(['col', 'col.1', 'col.2', 'col.3', 'col.4', 'col.5', 'col.6', 'col.7',
       'col.8', 'col.9', 'col.10', 'col.11', 'col.12', 'col.13', 'col.14'],
      dtype='object')

In [6]:
# First row as a Series, indexed by column label
df.iloc[0]

col       2023-10-28 12:00:00
col.1     2023-10-28 12:15:00
col.2                      42
col.3                      17
col.4                      99
col.5          Product Launch
col.6                   12345
col.7            Привет, мир!
col.8                    100%
col.9                 Déjà vu
col.10                   Pipe
col.11              Separated
col.12                   Data
col.13                   With
col.14     Special@Characters
Name: 0, dtype: object

In [7]:
# Define column type prefixes as {type name: [regex pattern, prefix]}.
# The timestamp pattern ends with an optional group so that values carrying
# fractional seconds are accepted too, e.g. 2023-10-28 12:00:00.123456
type_prefixes = {
    # No whitespace inside {m,n}: Python's re treats `{2, 4}` as literal text,
    # not as a repetition count, so the pattern would never match a timestamp.
    'timestamp': [r'^\d{2,4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(\.\d{1,9})?$', 'TMSTP'],
    'integer': [r'^\d+$', 'INT'],
    'date': [r'^\d{2,4}-\d{2}-\d{2,4}$', 'DT']
}

In [8]:
# Dimensions of the frame as (rows, columns)
df.shape

(67, 15)

In [9]:
# Number of rows in the frame
num_rows = len(df)
num_rows

67

In [10]:
# Pay attention to the non-string values (e.g. the ints below): regex matching
# works on strings only, so such values need converting to str first
df.iloc[0].to_list()

['2023-10-28 12:00:00',
 '2023-10-28 12:15:00',
 42,
 17,
 99,
 'Product Launch',
 12345,
 'Привет, мир!',
 '100%',
 'Déjà vu',
 'Pipe',
 'Separated',
 'Data',
 'With',
 'Special@Characters']

In [11]:
# Show the first row as a list of its individual column values
# (nothing is printed when the frame has no rows)
if num_rows > 0:
    current_row = df.iloc[0].to_list()
    print(current_row)

['2023-10-28 12:00:00', '2023-10-28 12:15:00', 42, 17, 99, 'Product Launch', 12345, 'Привет, мир!', '100%', 'Déjà vu', 'Pipe', 'Separated', 'Data', 'With', 'Special@Characters']


## Name Columns

In [12]:
# Check data types before any conversion: the first two columns hold
# timestamp text but are still typed as object
df.dtypes

col       object
col.1     object
col.2      int64
col.3      int64
col.4      int64
col.5     object
col.6      int64
col.7     object
col.8     object
col.9     object
col.10    object
col.11    object
col.12    object
col.13    object
col.14    object
dtype: object

In [13]:
# Dry run: parse the first column as datetimes; the result is only displayed,
# not assigned back to df
pd.to_datetime(df.iloc[:, 0])

0    2023-10-28 12:00:00
1    2023-10-28 12:01:00
2    2023-10-28 12:02:00
3    2023-10-28 12:03:00
4    2023-10-28 12:04:00
             ...        
62   2023-10-28 13:02:00
63   2023-10-28 13:03:00
64   2023-10-28 13:04:00
65   2023-10-28 13:05:00
66   2023-10-28 13:06:00
Name: col, Length: 67, dtype: datetime64[ns]

In [14]:
# Convert the first two timestamp columns to datetime type, one column at a time
tmstp_columns = ['col', 'col.1']
for tmstp_column in tmstp_columns:
    df[tmstp_column] = pd.to_datetime(df[tmstp_column])
df.dtypes

col       datetime64[ns]
col.1     datetime64[ns]
col.2              int64
col.3              int64
col.4              int64
col.5             object
col.6              int64
col.7             object
col.8             object
col.9             object
col.10            object
col.11            object
col.12            object
col.13            object
col.14            object
dtype: object

In [15]:
# Current column labels as a plain Python list
columns = list(df.columns)
columns

['col',
 'col.1',
 'col.2',
 'col.3',
 'col.4',
 'col.5',
 'col.6',
 'col.7',
 'col.8',
 'col.9',
 'col.10',
 'col.11',
 'col.12',
 'col.13',
 'col.14']

In [16]:
# String form of each column's dtype, in column order
dtypes = list(map(str, df.dtypes))
dtypes

['datetime64[ns]',
 'datetime64[ns]',
 'int64',
 'int64',
 'int64',
 'object',
 'int64',
 'object',
 'object',
 'object',
 'object',
 'object',
 'object',
 'object',
 'object']

In [17]:
# Prefix every column label with a tag derived from its dtype string:
# datetime* -> TMSTP_, int* -> INT_, anything else -> STR_
column_names = [
    f"TMSTP_{label}" if kind.startswith('datetime')
    else f"INT_{label}" if kind.startswith('int')
    else f"STR_{label}"
    for label, kind in zip(columns, dtypes)
]

column_names

['TMSTP_col',
 'TMSTP_col.1',
 'INT_col.2',
 'INT_col.3',
 'INT_col.4',
 'STR_col.5',
 'INT_col.6',
 'STR_col.7',
 'STR_col.8',
 'STR_col.9',
 'STR_col.10',
 'STR_col.11',
 'STR_col.12',
 'STR_col.13',
 'STR_col.14']

In [18]:
# Apply the type-prefixed labels; this relabels df in place
df.columns = column_names
df.head()

Unnamed: 0,TMSTP_col,TMSTP_col.1,INT_col.2,INT_col.3,INT_col.4,STR_col.5,INT_col.6,STR_col.7,STR_col.8,STR_col.9,STR_col.10,STR_col.11,STR_col.12,STR_col.13,STR_col.14
0,2023-10-28 12:00:00,2023-10-28 12:15:00,42,17,99,Product Launch,12345,"Привет, мир!",100%,Déjà vu,Pipe,Separated,Data,With,Special@Characters
1,2023-10-28 12:01:00,2023-10-28 12:16:00,21,8,76,Financial Report,67890,Data#Analytics,80%,Testing✓,Market,Research,Data,Analysis@Sample,
2,2023-10-28 12:02:00,2023-10-28 12:17:00,60,33,42,Machine Learning,13579,特殊*字符,50%,Übermensch✓,Big Data,Processing,Advanced,AI@Tech,
3,2023-10-28 12:03:00,2023-10-28 12:18:00,12,5,63,Sales Forecast,24680,Prüfung!Daten,10%,Testing✓,Customer,Support,Multilingual@Data,,
4,2023-10-28 12:04:00,2023-10-28 12:19:00,99,24,37,Quarterly Report,10203,Ειδικός^Στήλη,90%,Déjà vu,Data,With,Специални^Символи,For&Analysis,


## Check for Non-Latin (Non-ASCII) Characters

In [19]:
# Quick preview: scan only the first five rows of each column and report every
# string value that contains a non-ASCII character (row numbers are 1-based)
for column_name, column_values in df.items():
    for row_number, cell in enumerate(column_values.iloc[:5], start=1):
        if isinstance(cell, str) and not cell.isascii():
            print(f"Non-Latin character '{cell}' found in '{column_name}' column at row num: {row_number}")

Non-Latin character 'Привет, мир!' found in 'STR_col.7' column at row num: 1
Non-Latin character '特殊*字符' found in 'STR_col.7' column at row num: 3
Non-Latin character 'Prüfung!Daten' found in 'STR_col.7' column at row num: 4
Non-Latin character 'Ειδικός^Στήλη' found in 'STR_col.7' column at row num: 5
Non-Latin character 'Déjà vu' found in 'STR_col.9' column at row num: 1
Non-Latin character 'Testing✓' found in 'STR_col.9' column at row num: 2
Non-Latin character 'Übermensch✓' found in 'STR_col.9' column at row num: 3
Non-Latin character 'Testing✓' found in 'STR_col.9' column at row num: 4
Non-Latin character 'Déjà vu' found in 'STR_col.9' column at row num: 5
Non-Latin character 'Специални^Символи' found in 'STR_col.12' column at row num: 5


In [20]:
def check_non_latin_characters(dataframe: pd.DataFrame) -> List[str]:
    """
    Return the labels of the columns that contain at least one non-ASCII string.

    Despite the name, the test applied is ``str.isascii()``: any string holding a
    character outside the ASCII range is flagged. That covers non-Latin scripts
    (Cyrillic, Greek, CJK), symbols such as '✓', and also accented Latin letters
    such as 'é' or 'Ü'. Cells that are not ``str`` instances (numbers, timestamps,
    missing values) are ignored.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame to scan.

    Returns:
    List[str]: Column labels, in DataFrame column order, that hold at least one
    non-ASCII string value. Empty when no such value exists.

    Example Usage:
    non_ascii_columns = check_non_latin_characters(df)
    for column_name in non_ascii_columns:
        print(f"Non-Latin characters found in '{column_name}' column.")
    """
    return [
        column_name
        for column_name, column_values in dataframe.items()
        # any() stops at the first offending cell, so each column is listed once
        if any(isinstance(value, str) and not value.isascii() for value in column_values)
    ]

# Run the check on the full frame and report each flagged column once
non_ascii_columns = check_non_latin_characters(df)
for column_name in non_ascii_columns:
    print(f"Non-Latin characters found in '{column_name}' column.")

Non-Latin characters found in 'STR_col.7' column.
Non-Latin characters found in 'STR_col.9' column.
Non-Latin characters found in 'STR_col.12' column.
