# Data Curation: Remove Misplaced Quotes from CSV Files

This notebook cleans CSV files by removing unescaped quotes that don't follow proper CSV quoting rules, then processes the cleaned files into dataframes.

In [1]:
import os
import pandas as pd
import re

In [2]:
# Define script_dir for Jupyter Notebook
script_dir = os.getcwd()  # Use current working directory instead of __file__
project_root = os.path.abspath(os.path.join(script_dir, ".."))

input_folder = os.path.join(project_root, "data", "source_raw", "magnusweb")
output_folder = os.path.join(project_root, "data", "source_cleaned", "magnusweb")

# Create output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

print(f"Input folder: {input_folder}")
print(f"Output folder: {output_folder}")

Input folder: /Users/adam/Library/Mobile Documents/com~apple~CloudDocs/School/Master's Thesis/Analysis/profit-margins-inflation/data/source_raw/magnusweb
Output folder: /Users/adam/Library/Mobile Documents/com~apple~CloudDocs/School/Master's Thesis/Analysis/profit-margins-inflation/data/source_cleaned/magnusweb


In [3]:
def clean_misplaced_quotes(content):
    """
    Remove misplaced quotes based on rules using regex:
    - Keep quotes at start/end of lines
    - Keep quotes next to semicolons (proper CSV delimiters)
    - Remove quotes in the middle of fields without semicolons
    """
    lines = content.split('\n')
    cleaned_lines = []
    
    for line in lines:
        if not line.strip():  # Skip empty lines
            cleaned_lines.append(line)
            continue
        
        # Use regex to find and remove problematic quotes
        # Pattern: quote not at start/end and not adjacent to semicolon
        # Negative lookbehind and lookahead to check for semicolons
        pattern = r'(?<!^)(?<!;)"(?!;)(?!$)'
        cleaned_line = re.sub(pattern, '', line)
        
        cleaned_lines.append(cleaned_line)
    
    return '\n'.join(cleaned_lines)

In [4]:
# List all CSV files in the input folder that start with 'export-'
csv_files = [f for f in os.listdir(input_folder) if f.startswith('export-') and f.endswith('.csv')]

print(f"Found {len(csv_files)} CSV files to process:")
for file in csv_files:
    print(f"  - {file}")

Found 8 CSV files to process:
  - export-38.csv
  - export-39.csv
  - export-45.csv
  - export-44.csv
  - export-40.csv
  - export-41.csv
  - export-43.csv
  - export-42.csv


In [5]:
# Diagnostic: Analyze a sample file before and after cleaning
if csv_files:
    sample_file = csv_files[0]
    print(f"Analyzing sample file: {sample_file}")
    
    # Read original file
    input_path = os.path.join(input_folder, sample_file)
    with open(input_path, 'r', encoding='utf-8') as f:
        original_content = f.read()
    
    # Show first few lines of original
    original_lines = original_content.split('\n')[:5]
    print("\nOriginal file (first 5 lines):")
    for i, line in enumerate(original_lines):
        print(f"Line {i+1}: {repr(line)}")
    
    # Clean and show first few lines
    cleaned_content = clean_misplaced_quotes(original_content)
    cleaned_lines = cleaned_content.split('\n')[:5]
    print("\nCleaned file (first 5 lines):")
    for i, line in enumerate(cleaned_lines):
        print(f"Line {i+1}: {repr(line)}")
    
    # Count quotes in each
    original_quotes = original_content.count('"')
    cleaned_quotes = cleaned_content.count('"')
    print(f"\nQuote count - Original: {original_quotes}, Cleaned: {cleaned_quotes}, Removed: {original_quotes - cleaned_quotes}")
    
    # Check if structure changed significantly
    original_semicolons = original_content.count(';')
    cleaned_semicolons = cleaned_content.count(';')
    print(f"Semicolon count - Original: {original_semicolons}, Cleaned: {cleaned_semicolons}")

Analyzing sample file: export-38.csv

Original file (first 5 lines):
Line 1: '"Název subjektu";"IČO";"Hospodářský výsledek před zdaněním";"2022/4Q Hospodářský výsledek před zdaněním";"2021/4Q Hospodářský výsledek před zdaněním";"2020/4Q Hospodářský výsledek před zdaněním";"2019/4Q Hospodářský výsledek před zdaněním";"Hlavní NACE";"Hlavní NACE - kód";"2023/4Q Hospodářský výsledek před zdaněním";"2019/4Q Hospodářský výsledek za účetní období";"2020/4Q Hospodářský výsledek za účetní období";"2021/4Q Hospodářský výsledek za účetní období";"2022/4Q Hospodářský výsledek za účetní období";"2023/4Q Hospodářský výsledek za účetní období";"Hospodářský výsledek za účetní období";"Provozní hospodářský výsledek";"2023/4Q Provozní hospodářský výsledek";"2022/4Q Provozní hospodářský výsledek";"2021/4Q Provozní hospodářský výsledek";"2020/4Q Provozní hospodářský výsledek";"2019/4Q Provozní hospodářský výsledek";"Náklady";"2023/4Q Náklady";"2022/4Q Náklady";"2021/4Q Náklady";"2020/4Q Náklady";"2019/4Q 

In [6]:
# Process each CSV file to clean misplaced quotes
cleaned_files = []

print("Starting file processing...")

for csv_file in csv_files:
    print(f"\nProcessing file: {csv_file}")
    
    # Read the original file as text
    input_path = os.path.join(input_folder, csv_file)
    with open(input_path, 'r', encoding='utf-8') as f:
        original_content = f.read()
    
    # Clean misplaced quotes
    cleaned_content = clean_misplaced_quotes(original_content)
    
    # Always save to output folder (either cleaned or original)
    output_path = os.path.join(output_folder, csv_file)
    
    # Check if changes were made
    if original_content != cleaned_content:
        # Save cleaned file to output folder
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(cleaned_content)
        
        print(f"  ✓ Cleaned and saved: {csv_file}")
        cleaned_files.append(csv_file)
    else:
        # Copy original file to output folder (no changes needed)
        import shutil
        shutil.copy2(input_path, output_path)
        print(f"  - No changes needed, copied: {csv_file}")

print(f"\n=== PROCESSING COMPLETE ===")
print(f"Total files processed: {len(csv_files)}")
print(f"Files that needed cleaning: {len(cleaned_files)}")
print(f"Files copied unchanged: {len(csv_files) - len(cleaned_files)}")

Starting file processing...

Processing file: export-38.csv
  ✓ Cleaned and saved: export-38.csv

Processing file: export-39.csv
  ✓ Cleaned and saved: export-39.csv

Processing file: export-45.csv
  ✓ Cleaned and saved: export-45.csv

Processing file: export-44.csv
  ✓ Cleaned and saved: export-44.csv

Processing file: export-40.csv
  ✓ Cleaned and saved: export-40.csv

Processing file: export-41.csv
  ✓ Cleaned and saved: export-41.csv

Processing file: export-43.csv
  ✓ Cleaned and saved: export-43.csv

Processing file: export-42.csv
  ✓ Cleaned and saved: export-42.csv

=== PROCESSING COMPLETE ===
Total files processed: 8
Files that needed cleaning: 8
Files copied unchanged: 0


In [7]:
# Now process the cleaned CSV files into dataframes
base_names = []

print("Loading cleaned CSV files into dataframes...")

# Process each cleaned CSV file and create dataframes
for csv_file in csv_files:
    print(f"Loading: {csv_file}")
    
    # Extract base name without .csv extension
    base_name = csv_file.replace('.csv', '')
    base_name = re.sub(r'[^a-zA-Z0-9_]', '_', base_name)  # Replace non-alphanumeric characters with underscores

    # Add base name to the list
    base_names.append(base_name)
    
    # Construct full file path (now using cleaned files)
    file_path = os.path.join(output_folder, csv_file)
    
    # Read CSV with semicolon delimiter, proper quoting, and UTF-8 encoding
    df_temp = pd.read_csv(file_path, delimiter=';', quotechar='"', encoding='utf-8')
    
    # Create variable with the base name
    globals()[base_name] = df_temp

print(f"\n=== DATAFRAME LOADING COMPLETE ===")
print(f"Processed {len(csv_files)} cleaned CSV files into dataframes")

Loading cleaned CSV files into dataframes...
Loading: export-38.csv
Loading: export-39.csv
Loading: export-45.csv
Loading: export-44.csv
Loading: export-40.csv
Loading: export-41.csv
Loading: export-43.csv
Loading: export-42.csv

=== DATAFRAME LOADING COMPLETE ===
Processed 8 cleaned CSV files into dataframes


In [8]:
# Merge all dataframes into one (based on base_names)
df = pd.concat([globals()[name] for name in base_names], ignore_index=True)

print(f"Dataframes merged into one with {len(df)} rows and {len(df.columns)} columns")

# Check for duplicates
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows in merged dataframe: {duplicates}")

# Check duplicate IČO 
ico_duplicates = df['IČO'].duplicated().sum()
print(f"Number of duplicate IČO in merged dataframe: {ico_duplicates}")

Dataframes merged into one with 73134 rows and 256 columns
Number of duplicate rows in merged dataframe: 0
Number of duplicate IČO in merged dataframe: 38


In [9]:
# Display sample of the merged dataframe
print("Sample of merged dataframe:")
print(df.head())
print(f"\nDataframe shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

Sample of merged dataframe:
                    Název subjektu         IČO  \
0                  TUkas ČSAO a.s.  25614304.0   
1     ACTIVE Partners spol. s r.o.  16191552.0   
2  Lidové noviny, a.s. v likvidaci  43874444.0   
3                Havas Prague a.s.  63079054.0   
4                    KD Beta, a.s.  25676792.0   

   Hospodářský výsledek před zdaněním  \
0                           8061000.0   
1                           -119000.0   
2                            553000.0   
3                          23466000.0   
4                            395000.0   

   2022/4Q Hospodářský výsledek před zdaněním  \
0                                  17873000.0   
1                                   -676000.0   
2                                         NaN   
3                                         NaN   
4                                   -163000.0   

   2021/4Q Hospodářský výsledek před zdaněním  \
0                                  11223000.0   
1                              

In [10]:
# Test reading performance: compare original vs cleaned file
import time

if csv_files:
    sample_file = csv_files[0]
    
    # Time reading original file
    original_path = os.path.join(input_folder, sample_file)
    start_time = time.time()
    try:
        df_original = pd.read_csv(original_path, delimiter=';', quotechar='"', encoding='utf-8')
        original_time = time.time() - start_time
        original_success = True
        print(f"Original file read time: {original_time:.2f} seconds")
        print(f"Original shape: {df_original.shape}")
    except Exception as e:
        original_time = time.time() - start_time
        original_success = False
        print(f"Original file failed to read in {original_time:.2f} seconds: {e}")
    
    # Time reading cleaned file
    cleaned_path = os.path.join(output_folder, sample_file)
    if os.path.exists(cleaned_path):
        start_time = time.time()
        try:
            df_cleaned = pd.read_csv(cleaned_path, delimiter=';', quotechar='"', encoding='utf-8')
            cleaned_time = time.time() - start_time
            cleaned_success = True
            print(f"Cleaned file read time: {cleaned_time:.2f} seconds")
            print(f"Cleaned shape: {df_cleaned.shape}")
            
            if original_success and cleaned_success:
                print(f"Time difference: {cleaned_time - original_time:.2f} seconds")
                if cleaned_time > original_time * 2:
                    print("⚠️  Cleaned file takes significantly longer to read!")
        except Exception as e:
            cleaned_time = time.time() - start_time
            print(f"Cleaned file failed to read in {cleaned_time:.2f} seconds: {e}")

Original file read time: 0.31 seconds
Original shape: (10000, 256)
Cleaned file read time: 0.25 seconds
Cleaned shape: (10000, 256)
Time difference: -0.06 seconds
