In [1]:
import pandas as pd

In [2]:
# Location of the input CSV; a relative path, so it is resolved against the
# working directory of the running kernel.
csv_file_path = 'input-data/person_data.csv'

In [9]:
# Load the person data into `df`, spelling out the most common read_csv
# options for reference. Only `dtype` and `na_values` carry dataset-specific
# choices; the remaining arguments are written out explicitly for illustration.
df = pd.read_csv(
    csv_file_path,  # The file path to the CSV file
    sep=',',    # Field separator (',' is also pandas' default)
    header=0,   # Row number to use as column names (0 for the first row)
    index_col=None,  # Do not promote any column to the index
    usecols=None,    # Subset of columns to read; None reads all columns
    skiprows=None,   # Rows to skip at the start of the file; None skips nothing
    nrows=None,      # Number of rows to read (None to read all)
    encoding='utf-8',  # Text encoding used to decode the file
    # Pin the ID column to int and keep date of birth / zip code as raw strings.
    # NOTE(review): columns not listed here get an inferred dtype, e.g.
    # 'Phone% Number' comes back as int64 in the output below; confirm that is intended.
    dtype={'Person@ ID': int, 'Date* Of Birth': str, 'Zip Code|': str},
    na_values=['N/A', 'NA'],  # Extra strings to treat as NaN
)
# Column name -> dtype of the freshly loaded frame (raw, uncleaned headers)
df.dtypes

Person@ ID             int64
First, Name           object
Last! Name            object
Date* Of Birth        object
Gender#               object
Email$ Address        object
Phone% Number          int64
Address^              object
City (State)          object
Zip Code|             object
Country~              object
Nationality`          object
Occupation_           object
Education+            object
Marital-Status        object
Registration. Date    object
Registration TIme     object
dtype: object

In [5]:
import re
def remove_special_characters_in_column_names(df, remove_special_chars=True):
    """Delete punctuation and symbols from the column labels of ``df``.

    Every character that is neither a word character (letter, digit,
    underscore) nor whitespace is dropped -- not replaced -- so
    ``'Marital-Status'`` becomes ``'MaritalStatus'`` and ``'Occupation_'``
    keeps its underscore.

    Args:
        df: DataFrame whose ``columns`` attribute is reassigned in place.
        remove_special_chars: When falsy, ``df`` is returned untouched.

    Returns:
        The same ``df`` object that was passed in.
    """
    if not remove_special_chars:
        return df
    cleaned_labels = []
    for label in df.columns:
        cleaned_labels.append(re.sub(r'[^\w\s]', '', label))
    df.columns = cleaned_labels
    return df
# Step 1: drop punctuation/symbols from the headers.
# (No dtypes peek here: a bare expression in the middle of a cell is never
# displayed -- only the cell's last expression is.)
df = remove_special_characters_in_column_names(df)

def remove_leading_trailing_underscores_from_column_names(df, leading_trailing_underscores=True):
    """Trim underscores from both ends of every column label of ``df``.

    Underscores inside a label are kept (``'d_e'`` stays ``'d_e'``).

    Args:
        df: DataFrame whose ``columns`` attribute is reassigned in place.
        leading_trailing_underscores: When falsy, ``df`` is returned untouched.

    Returns:
        The same ``df`` object that was passed in.
    """
    if not leading_trailing_underscores:
        return df
    df.columns = [label.strip('_') for label in df.columns]
    return df
# Step 2: trim underscores at either end of each header.
# (Mid-cell `df.dtypes` removed: its value was computed but never displayed.)
df = remove_leading_trailing_underscores_from_column_names(df)
def remove_leading_trailing_spaces_from_column_names(df, leading_trailing_spaces=True):
    """Trim whitespace from both ends of every column label of ``df``.

    Uses ``str.strip()`` with no argument, so tabs and newlines at the ends
    are removed as well as plain spaces; inner whitespace is kept.

    Args:
        df: DataFrame whose ``columns`` attribute is reassigned in place.
        leading_trailing_spaces: When falsy, ``df`` is returned untouched.

    Returns:
        The same ``df`` object that was passed in.
    """
    if not leading_trailing_spaces:
        return df
    trimmed_labels = []
    for label in df.columns:
        trimmed_labels.append(label.strip())
    df.columns = trimmed_labels
    return df
# Step 3: trim whitespace at either end of each header.
# (Mid-cell `df.dtypes` removed: its value was computed but never displayed.)
df = remove_leading_trailing_spaces_from_column_names(df)

def join_words_with_spaces_in_column_names(df, join_by_space=True):
    """Join the words of every column label of ``df`` with single underscores.

    A label is split on runs of whitespace, so ``'First  Name'`` (two spaces)
    becomes ``'First_Name'`` rather than ``'First__Name'``, and whitespace at
    either end of a label does not produce a leading or trailing underscore.

    Args:
        df: DataFrame whose ``columns`` attribute is reassigned in place.
        join_by_space: When falsy, ``df`` is returned untouched.

    Returns:
        The same ``df`` object that was passed in.
    """
    if join_by_space:
        # split() with no argument collapses consecutive whitespace and ignores
        # it at the ends; split(' ') would yield empty strings there, which
        # '_'.join turns into doubled / dangling underscores.
        df.columns = ['_'.join(col.split()) for col in df.columns]
    return df
# Step 4: join the words of each header with underscores.
# (Mid-cell `df.dtypes` removed: its value was computed but never displayed.)
df = join_words_with_spaces_in_column_names(df)

def lowercase_dataframe_column_names(df, lower_case=True):
    """Convert every column label of ``df`` to lower case.

    Args:
        df: DataFrame whose ``columns`` attribute is reassigned in place.
        lower_case: When falsy, ``df`` is returned untouched.

    Returns:
        The same ``df`` object that was passed in.
    """
    if not lower_case:
        return df
    lowered_labels = []
    for label in df.columns:
        lowered_labels.append(label.lower())
    df.columns = lowered_labels
    return df
# Final step of the cell: lower-case the headers.
df = lowercase_dataframe_column_names(df)
# Last expression of the cell, so this is the schema that gets displayed.
df.dtypes

person_id             int64
first_name           object
last_name            object
date_of_birth        object
gender               object
email_address        object
phone_number          int64
address              object
city_state           object
zip_code             object
country              object
nationality          object
occupation           object
education            object
maritalstatus        object
registration_date    object
registration_time    object
dtype: object

In [6]:
# Re-read the CSV so `df` carries the raw, uncleaned headers again (see the
# output below). Only `dtype` and `na_values` carry dataset-specific choices;
# the remaining arguments are written out explicitly for illustration.
df = pd.read_csv(
    csv_file_path,  # The file path to the CSV file
    sep=',',    # Field separator (',' is also pandas' default)
    header=0,   # Row number to use as column names (0 for the first row)
    index_col=None,  # Do not promote any column to the index
    usecols=None,    # Subset of columns to read; None reads all columns
    skiprows=None,   # Rows to skip at the start of the file; None skips nothing
    nrows=None,      # Number of rows to read (None to read all)
    encoding='utf-8',  # Text encoding used to decode the file
    # Pin the ID column to int and keep date of birth / zip code as raw strings.
    # NOTE(review): columns not listed here get an inferred dtype, e.g.
    # 'Phone% Number' comes back as int64 in the output below; confirm that is intended.
    dtype={'Person@ ID': int, 'Date* Of Birth': str, 'Zip Code|': str},
    na_values=['N/A', 'NA'],  # Extra strings to treat as NaN
)
# Column name -> dtype of the freshly loaded frame (raw, uncleaned headers)
df.dtypes

Person@ ID             int64
First, Name           object
Last! Name            object
Date* Of Birth        object
Gender#               object
Email$ Address        object
Phone% Number          int64
Address^              object
City (State)          object
Zip Code|             object
Country~              object
Nationality`          object
Occupation_           object
Education+            object
Marital-Status        object
Registration. Date    object
Registration TIme     object
dtype: object

In [11]:
import re

# Define a custom function to clean column names
def _clean_column_name(
        name,
        remove_special_chars,
        remove_underscores,
        remove_spaces,
        join_with_underscores,
        lowercase
        ):
    """Apply the enabled clean-up steps, in a fixed order, to one column label."""
    if remove_special_chars:
        # Each run of non-alphanumerics (underscores included) becomes ONE
        # space, so word boundaries survive: 'Marital-Status' -> 'Marital Status'.
        name = re.sub(r'[^a-zA-Z0-9]+', ' ', name)
    if remove_underscores:
        # Leading/trailing underscores only; inner ones are kept.
        name = name.strip('_')
    if remove_spaces:
        # Leading/trailing whitespace only.
        name = name.strip()
    if join_with_underscores:
        # split() collapses repeated whitespace, so no doubled underscores.
        name = "_".join(name.split())
    if lowercase:
        name = name.lower()
    return name


def clean_and_transform_column_names(
        df,
        remove_special_chars=True,
        remove_underscores=True,
        remove_spaces=True,
        join_with_underscores=True,
        lowercase=True
        ):
    """Normalise the column labels of ``df`` (snake_case with the default flags).

    The enabled steps run in this order for every label: replace runs of
    non-alphanumeric characters with a space, strip underscores from the
    ends, strip whitespace from the ends, join words with underscores,
    lower-case. With the defaults ``'Person@ ID'`` becomes ``'person_id'``
    and ``'Marital-Status'`` becomes ``'marital_status'``.

    Args:
        df: DataFrame to clean; its columns are renamed in place.
        remove_special_chars: Replace each run of characters outside
            ``[a-zA-Z0-9]`` with a single space.
        remove_underscores: Strip leading and trailing underscores.
        remove_spaces: Strip leading and trailing whitespace.
        join_with_underscores: Join whitespace-separated words with ``_``.
        lowercase: Convert the label to lower case.

    Returns:
        The same ``df`` object that was passed in, with renamed columns.

    Raises:
        TypeError: If a transforming step is applied to a non-string label
            (raised by ``re.sub``).
    """
    # Compute every new label from the ORIGINAL labels first ...
    renames = {
        col: _clean_column_name(
            col,
            remove_special_chars,
            remove_underscores,
            remove_spaces,
            join_with_underscores,
            lowercase,
        )
        for col in df.columns
    }
    # ... then rename once. Renaming column-by-column inside the loop could
    # re-rename an already cleaned label when it coincided with a later
    # column's original name (e.g. '_ _a' -> '_a' and then again -> 'a').
    # inplace=True keeps the original contract of mutating the caller's frame.
    df.rename(columns=renames, inplace=True)
    return df

# Run every clean-up step in a single call (all flags left at their defaults)
df = clean_and_transform_column_names(df)

# Last expression of the cell: display the cleaned column names and their dtypes
df.dtypes

person_id             int64
first_name           object
last_name            object
date_of_birth        object
gender               object
email_address        object
phone_number          int64
address              object
city_state           object
zip_code             object
country              object
nationality          object
occupation           object
education            object
marital_status       object
registration_date    object
registration_time    object
dtype: object