In [29]:
import pandas as pd
import csv
from data_cleaning import (
    string_pattern_search,
    string_pattern_replace,
    single_threaded_read_and_decode_lines,
    load_data_into_dataframe,
    detect_encoding_for_row
)

# 1. Decode CSV

In [30]:
# Reference copy: this version of the file was converted to UTF-8 outside of Python for comparison.
# encoding_errors='ignore' drops any bytes that still fail to decode instead of raising.
meta_raw = pd.read_csv(
    r"D:\ML\Portfolio\Projects\semantic-search\datasets\metadata-matches-raw.txt",
    encoding_errors='ignore',
)

# Original encoding version of the file.
file_path = r"D:\ML\Portfolio\Projects\semantic-search\datasets\metadata-matches-original-encoding.txt"

## 1. Use dynamic (line-by-line) decoding:

In [None]:
# Read and decode the original-encoding file line by line using the single-threaded helper.
decoded_text, encodings_tried = single_threaded_read_and_decode_lines(file_path)

# Report each (encoding, confidence) pair returned by the decoder.
for encoding, confidence in encodings_tried:
    print(f"Tried encoding: {encoding} with confidence: {confidence}")

# Build a DataFrame from the decoded text.
meta_decoded = load_data_into_dataframe(decoded_text)

# Sanity check on the decode: show the first rows and the total row count.
print(meta_decoded.head())
print(f"Number of rows: {len(meta_decoded)}")

Encoding detector. Currently not utilised (not required):

In [None]:
# Ad-hoc check of the encoding detected for a single row of the decoded DataFrame.
# Change row_index to inspect a different row.
# NOTE(review): row_index is presumably a zero-based position, since it is printed as row_index + 1 — confirm.
row_index = 779

# Detect encoding for the specified row
encoding, confidence = detect_encoding_for_row(meta_decoded, row_index)
print(f"Row {row_index + 1}: Encoding={encoding}, Confidence={confidence:.2f}")

## 2. Validate Decoding

In [None]:
# Edge cases:
# 'C1014982': 
# 'C709663': † River †
# Spanish example: two separate encodings within the same line (word)
mask = meta_decoded['CAN_ID'] == 'C709663'
# A notebook cell renders only its last expression; a bare expression in the middle of
# a cell is evaluated and silently discarded, so the edge-case row is displayed explicitly.
display(meta_decoded[mask])

# Compare the dynamically decoded vs. converted to UTF-8 dataframes
meta_decoded.compare(meta_raw)

## 3. Export CSV (if required)

In [34]:
# Optional export of the decoded DataFrame; left disabled.
# The path is written as a raw string so the backslashes are not parsed as escape sequences if this is re-enabled.
#csv_export_path = r"D:\ML\Portfolio\Projects\RAG\metadata-matches-decoded.txt"

#meta_decoded.to_csv(csv_export_path, index=False, quoting=csv.QUOTE_NONNUMERIC) # Quotes to prevent Pandas converting to numeric on import to preserve the original file data types.

# 2. Data Exploration

In [None]:
# Preview the first rows of the decoded DataFrame.
meta_decoded.head()

In [None]:
# Summarise column dtypes, non-null counts and memory usage.
meta_decoded.info()

In [37]:
# Render floats with two fixed decimals so that .describe() does not switch to
# scientific notation for the SEQ_NO column (its values span a wide range).
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
# Summary statistics for every column (include='all' covers non-numeric columns as well).
meta_decoded.describe(include='all')

# 3. Data Integrity Validation

Pandas display setting management section:

In [39]:
# Optional display toggles; uncomment as needed while inspecting results.
#pd.set_option('display.max_rows', None) # Show all rows
#pd.reset_option('display.max_rows') # Restore the default row limit
#pd.set_option('display.max_colwidth', None) # Do not truncate cell contents

## 1. Validate Values and Fix

Review and normalise values.

_**Songcode & ISWC**_ columns are out of the PoC/ prototype scope and won't be validated/ cleaned/ manipulated. They require additional consideration before validating values. 

In [40]:
# Default column set (strings only): everything after the first two columns.
cols_default_all = meta_decoded.columns[2:]
# Strict variant of the default set: additionally drops the last two columns (Songcode & ISWCs).
cols_default_strict = cols_default_all[:-2]

### 1. Internal whitespaces followed by a number (canonical writer columns only):

In [None]:
# Only the canonical writer columns are searched for this pattern.
internal_whitespace_num_cols = ['CAN_Writers_Formatted', 'CAN_Writers_Raw']

# Two or more whitespace characters followed by a single digit.
# NOTE(review): the second tuple element is presumably a "treat as regex" flag — confirm in data_cleaning.
internal_whitespace_num_pat = (r'\s{2,}\d', True)

internal_whitespace_num_df = string_pattern_search(
    meta_decoded,
    internal_whitespace_num_pat,
    internal_whitespace_num_cols,
)

Replace the pattern with an empty string and validate removal:

In [None]:
# Substitute an empty string for every match and validate removal.
string_pattern_replace(
    meta_decoded,
    internal_whitespace_num_pat,
    replace_with='',
    cols=internal_whitespace_num_cols,
)

### 2. ' [Not Controlled]' as part of some composition titles (this text is unrelated to the dataset):

In [None]:
# The ' [Not Controlled]' marker, including its leading space.
# NOTE(review): False presumably selects a plain, non-regex match — confirm in data_cleaning.
not_controled_pat = (' [Not Controlled]', False)

not_controled_df = string_pattern_search(
    meta_decoded,
    not_controled_pat,
    cols_default_all,
)

Replace the pattern with an empty string and validate removal:

In [None]:
# Substitute an empty string for the marker in the columns of the search result and validate removal.
string_pattern_replace(
    meta_decoded,
    not_controled_pat,
    replace_with='',
    cols=not_controled_df.columns,
)

### 3. All extra whitespaces: leading, internal, trailing, non-breaking space (\xa0), newline characters (\n), etc.:

In [None]:
# Match one or more whitespace characters and replace each run with a single space.
# Songcodes & ISWCs are excluded (cols_default_strict).
string_pattern_replace(
    meta_decoded,
    (r'\s+', True),
    replace_with=' ',
    cols=cols_default_strict,
)

# Then remove leading/trailing whitespace (incl. non-breaking space) from the same columns.
for text_col in cols_default_strict:
    trimmed = meta_decoded[text_col].str.strip()
    meta_decoded[text_col] = trimmed

Validate whitespace removal (songcodes and ISWCs are left untouched as intended):

In [None]:
# Look for any remaining leading, trailing or consecutive whitespace characters.
string_pattern_search(
    meta_decoded,
    (r'^\s|\s$|\s{2,}', True),
    cols=cols_default_all,
    summary='unique',
)

### 4. Curly apostrophe (right single quotation mark): '’'

In [None]:
# Right single quotation mark (curly apostrophe).
single_quote_pat = ('’', False)
single_quote_pat_df = string_pattern_search(
    meta_decoded,
    single_quote_pat,
    cols=cols_default_all,
)

Replace the curly apostrophe with a straight apostrophe (') and validate removal:

In [None]:
# Swap each curly apostrophe for a straight one in the columns of the search result.
string_pattern_replace(
    meta_decoded,
    single_quote_pat,
    replace_with="'",
    cols=single_quote_pat_df.columns,
)

# 4. CSV Export

## 1. Data Type Check
Check whether any columns other than the first two contain numeric values (they should all be strings, as their values are enclosed in "" in the CSV).

In [None]:
# Report, for every column after the first two, whether it holds any numeric (int/float) values.
for col in meta_decoded.columns[2:]:
    # Missing values are dropped first so that NaN (a float) is not reported as numeric.
    present = meta_decoded[col].dropna()
    is_number = present.apply(
        lambda value: not isinstance(value, bool) and isinstance(value, (int, float))
    )
    if not is_number.any():
        print(f"Column {col} does not contain numeric values.")
        continue
    first_numeric_value = present[is_number].iloc[0]
    print(f"Column {col} contains numeric values. Example: {first_numeric_value}")

## 2. Export to CSV

In [50]:
csv_export_path = r"D:\ML\Portfolio\Projects\semantic-search\datasets\metadata-matches-pre-processed.txt"

# QUOTE_NONNUMERIC wraps every non-numeric field in quotes, which prevents pandas from converting
# those values to numeric on import and so preserves the original file data types.
# The empty strings are converted to NaN values on import.
meta_decoded.to_csv(
    csv_export_path,
    index=False,
    quoting=csv.QUOTE_NONNUMERIC,
)

## 3. CSV Export Validation
Check whether any data was lost on export.

In [None]:
# Re-import the exported file and compare it with the in-memory DataFrame.
meta_exported = pd.read_csv(csv_export_path)

export_is_lossless = meta_decoded.equals(meta_exported)
print('No data loss on CSV export!' if export_is_lossless else 'WARNING: Some data loss on CSV export!')

The difference in the exported dataset is due to whitespace-only values in 'MATCHED_Writer_2': these were replaced with empty strings during cleaning, and pd.read_csv then read the empty strings back as 'NaN':

In [None]:
# Show the cells where the re-imported frame differs from the in-memory one.
meta_exported.compare(meta_decoded)

Whitespace conversion to an empty string and then to 'NaN' for a sample value:

In [None]:
# The same sample cell at each stage: unprocessed df, extra whitespace removed df, re-imported df.
sample_row, sample_col = 31794, 'MATCHED_Writer_2'
(
    meta_raw.loc[sample_row, sample_col],
    meta_decoded.loc[sample_row, sample_col],
    meta_exported.loc[sample_row, sample_col],
)