In [8]:
import csv

# Reshape the raw admissions table into one flat CSV row per
# (nationality, department, programme), with Apps/Offers/Entrants per year.

# Define the input and output file paths
input_file = "Data/table_v2.csv"
output_file = "Data/AAA.csv"

# Read the whole input CSV into memory; the first two rows are treated as headers
with open(input_file, newline='', encoding='utf-8') as infile:
    rows = list(csv.reader(infile))

# Fail with a clear message instead of an IndexError on rows[1] below
if len(rows) < 2:
    raise ValueError(f"{input_file} must contain at least two header rows")

# Extract year headers from the second row: every third column, starting from the third
years = rows[1][2::3]
expected_stats = len(years) * 3  # three statistics (Apps, Offers, Entrants) per year

# Create the new headers based on the year and statistic type
new_header = ["Nationality", "Department", "Programme"]
for year in years:
    new_header.extend([f"{year} Apps", f"{year} Offers", f"{year} Entrants"])

# Flat output rows: [nationality, department, programme, stat, stat, ...]
corrected_rows = []

# The nationality is only present on the first row of each group; remember it
# so that it can be repeated on the following rows.
current_nationality = None

# Iterate over the data rows (everything after the two header rows)
for row in rows[2:]:
    # csv.reader yields [] for blank lines; also ignore rows whose cells are all blank
    if not any(cell.strip() for cell in row):
        continue

    if row[0]:  # This row starts a new nationality
        current_nationality = row[0]

    # Split the second column on its first space into [department, programme].
    # NOTE(review): a multi-word department name ends up partly in the
    # programme field with this split -- confirm against the real data.
    department_cell = row[1] if len(row) > 1 else ''
    department_program = department_cell.split(' ', 1)
    if len(department_program) == 1:  # Only department is present, no programme
        department_program.append('')

    # Statistics part of the row, padded with '*' when columns are missing
    stats = row[2:]
    if len(stats) < expected_stats:
        stats.extend(['*'] * (expected_stats - len(stats)))

    # Build the row flat from the start. Every element must stay a single cell:
    # flattening a row that mixes strings and lists would iterate over the
    # strings and split them into one character per column.
    corrected_rows.append([current_nationality or ''] + department_program + stats)

# Write the header followed by the corrected rows to the output CSV file
with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(new_header)
    writer.writerows(corrected_rows)

print(f"Data has been processed and saved to {output_file}")

Data has been processed and saved to Data/AAA.csv


In [14]:
import pandas as pd

# Load data
data = pd.read_csv('Data/table_v2.csv')

# Propagate the nationality down onto the rows where 'Unnamed: 0' is empty.
# .ffill() replaces fillna(method='ffill'), whose `method` keyword is deprecated.
data['Nationality'] = data['Unnamed: 0'].ffill()

# Drop rows whose (forward-filled) nationality is the literal header text
data = data[data['Nationality'] != 'Nationality']

# The '2019/20 2020/21' column is assumed to hold the concatenated department,
# programme and the three statistics -- TODO confirm against the raw file.
# Rows that do not match the pattern produce NaN in every extracted column.
# NOTE(review): the group is spelled 'Programe' while the cell above writes a
# 'Programme' header; kept as-is because it becomes the output column name.
stats_pattern = r'(?P<Department>[A-Za-z &]+)\s+(?P<Programe>UG Degree|PG Taught)\s+(?P<Apps19>[\d\*]+)\s+(?P<Offers19>[\d\*]+)\s+(?P<Entrants19>[\d\*]+)'
extracted_info = data['2019/20 2020/21'].str.extract(stats_pattern)
data = data.join(extracted_info)

# Simplify department names by removing the literal word 'Department';
# regex=False makes the literal match explicit regardless of the pandas default.
data['Department'] = (
    data['Department']
    .str.replace('Department', '', regex=False)
    .str.strip()
)

# TODO: extract the 2020/21 to 2023/24 statistics once the actual column names
# and structure are known, e.g.
# data['2020/21 Apps'] = data['2020/21'].str.extract('...')  # Placeholder for actual regex extraction

# Keep only the columns needed for the comparison.
# NOTE(review): .head(7) means only the first seven rows are saved below --
# confirm this is intended as a sample and not the full export.
output_columns = ['Nationality', 'Department', 'Programe', 'Apps19', 'Offers19', 'Entrants19']
final_data = data[output_columns].head(7)

# Save to CSV to check if the format aligns with Book3
final_data.to_csv('Data/AAA.csv', index=False)

final_data

Unnamed: 0,Nationality,Department,Programe,Apps19,Offers19,Entrants19
1,Afghan,Accounting,UG Degree,0,0.0,0.0
2,Afghan,,,,,
3,Afghan,Anthropology,UG Degree,*,0.0,0.0
4,Afghan,Economic,UG Degree,*,0.0,0.0
5,Afghan,History,PG Taught,*,0.0,0.0
6,Afghan,Economics,UG Degree,0,0.0,0.0
7,Afghan,,,,,
