In [None]:
import pandas as pd

# Lift pandas' display truncation so whole frames are rendered.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Read the original CSV file into a DataFrame
df = pd.read_csv("Data/table_v2.csv")

# Drop rows whose first column contains "Nationality" (case-sensitive).
# .copy() makes the filtered result an independent frame, so the column
# assignment below does not write into a slice of the frame read above.
df = df[~df.iloc[:, 0].str.contains('Nationality', na=False)].copy()

# Fill NaN values in the first column with the value from the row above.
# .ffill() replaces the deprecated fillna(method='ffill') spelling.
df.iloc[:, 0] = df.iloc[:, 0].ffill()

# Determine the number of columns in the original DataFrame
num_columns_original = df.shape[1]

# Define the new column names
new_columns = ['Nationality', 'Department', 'Program',
               'Applications 2019', 'Offers 2019', 'Entrances 2019',
               'Applications 2020', 'Offers 2020', 'Entrances 2020',
               'Applications 2021', 'Offers 2021', 'Entrances 2021',
               'Applications 2022', 'Offers 2022', 'Entrances 2022']

# The frame may be wider than the named columns; surplus columns get an
# empty name. A narrower frame cannot be renamed at all, so fail with a
# message that says what was expected instead of a bare length mismatch.
num_empty_columns = num_columns_original - len(new_columns)
if num_empty_columns < 0:
    raise ValueError(
        f"Expected at least {len(new_columns)} columns in the input table, "
        f"found {num_columns_original}"
    )
if num_empty_columns > 0:
    new_columns += [''] * num_empty_columns

# Rename the columns in the original DataFrame
df.columns = new_columns

df.to_csv("Data/GOKU_file_1.csv", index=False)
df

In [None]:
import csv

import pandas as pd

# Path to the original and new CSV files
input_file_path = "Data/GOKU_file_1.csv"
output_file_path = "Data/GOKU_file_2.csv"

# Parse with the csv module rather than str.split(','): a quoted field that
# itself contains a comma would otherwise be split in two and shift every
# column index used below. newline='' is what the csv module requires for
# file objects; UTF-8 matches what pandas wrote in the previous step.
with open(input_file_path, mode='r', newline='', encoding='utf-8') as infile, \
     open(output_file_path, mode='w', newline='', encoding='utf-8') as outfile:
    reader = csv.reader(infile)
    # lineterminator='\n' keeps one "\n"-terminated record per row in the output.
    writer = csv.writer(outfile, quoting=csv.QUOTE_MINIMAL, lineterminator='\n')

    # line_number is 1-based and counts the header row as line 1.
    for line_number, columns in enumerate(reader, start=1):
        # Check if the current line is within the rows to modify
        if 873 <= line_number <= 923:
            # Prefix column B (index 1) with column F's text (index 5);
            # column F's entries come first.
            columns[1] = columns[5] + " " + columns[1]
            # Move the value from column G (index 6) into column F (index 5)
            columns[5] = columns[6]
            # Clear the old column G
            columns[6] = ""

        # Write the modified or unmodified row to the new CSV file
        writer.writerow(columns)

# Load the updated CSV file into a DataFrame for inspection
df_updated = pd.read_csv(output_file_path)

# Display the modified rows for inspection
df_updated.iloc[870:925]


In [None]:
import csv

# Path to the original and new CSV files
input_file_path = "Data/GOKU_file_2.csv"
output_file_path = "Data/GOKU_file_3.csv"

# (first_line, last_line, lead_col) — for 1-based, inclusive line ranges:
# column 1 is overwritten with row[lead_col] + " " + row[8], and both source
# cells (lead_col and 8) are then blanked.
MERGE_RULES = [
    (1543, 1593, 7),
    (1902, 1952, 9),
    (2208, 2258, 7),
    (3995, 4045, 10),
    (4455, 4505, 10),
    (4813, 4863, 9),
    (5017, 5067, 7),
    (5170, 5220, 9),
]

# Line range that gets a different repair: column 5 is prefixed onto the
# existing column 1, and column 6 is shifted left into column 5.
SHIFT_RULE = (2566, 2616)


def repair_row(line_number, row):
    """Apply the repair rule (if any) that covers ``line_number`` to ``row``.

    Parameters
    ----------
    line_number : int
        1-based position of the row in the input file (header row is 1).
    row : list[str]
        Parsed CSV fields; modified in place.

    Returns
    -------
    list[str]
        The same ``row`` object, modified when a rule matched and untouched
        otherwise.

    Raises
    ------
    IndexError
        If a matching rule refers to a column the row does not have.
    """
    if SHIFT_RULE[0] <= line_number <= SHIFT_RULE[1]:
        row[1] = row[5] + " " + row[1]
        row[5] = row[6]
        row[6] = ""
        return row

    # The ranges do not overlap, so at most one rule can match.
    for first_line, last_line, lead_col in MERGE_RULES:
        if first_line <= line_number <= last_line:
            row[1] = row[lead_col] + " " + row[8]
            row[lead_col], row[8] = "", ""
            return row

    return row


# Open the original CSV file for reading and the new CSV file for writing.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default text encoding.
with open(input_file_path, mode='r', newline='', encoding='utf-8') as infile, \
     open(output_file_path, mode='w', newline='', encoding='utf-8') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile, quoting=csv.QUOTE_MINIMAL)

    for line_number, row in enumerate(reader, start=1):
        # Write the modified or unmodified row to the new CSV file
        writer.writerow(repair_row(line_number, row))

# Load the updated CSV file into a DataFrame for inspection
df_updated = pd.read_csv(output_file_path)

# Display the modified rows for inspection
print(df_updated.iloc[1538:5225])  # Display a wider range around the modified areas for context


In [None]:
import csv

# Source and destination of this pass
input_file = "Data/GOKU_file_3.csv"
output_file = "Data/GOKU_file_4.csv"

# Stream the input row by row and write a re-laid-out copy
with open(input_file, mode='r', newline='', encoding='utf-8') as infile, \
     open(output_file, mode='w', newline='', encoding='utf-8') as outfile:

    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    # Every row yielded by the reader, the first one included, goes through
    # the same token filter.
    for row in reader:
        # Glue all fields together and re-split on whitespace, so tokens
        # embedded inside a single field are seen individually
        tokens = ' '.join(row).split()

        # Keep only all-digit tokens and lone asterisks, in their original order
        numbers_stars = []
        for token in tokens:
            if token == '*' or token.isdigit():
                numbers_stars.append(token)

        # Right-pad with empty strings up to 15 entries; when there are
        # already 15 or more, the multiplier is <= 0 and nothing is added
        missing = 15 - len(numbers_stars)
        numbers_stars += [''] * missing

        # First three original fields, followed by the extracted tokens
        writer.writerow(row[:3] + numbers_stars)

data = pd.read_csv("Data/GOKU_file_4.csv")
data

In [None]:
# Define the file paths
input_file = "Data/GOKU_file_4.csv"
output_file = "Data/GOKU_file_5.csv"

# Load the data into a DataFrame
data = pd.read_csv(input_file)

# Remove all numbers and asterisks from the "Department" column
data['Department'] = data['Department'].str.replace(r'[\d\*]+', '', regex=True)

# Pull the programme label ("UG Degree" or "PG Taught") out of the Department
# text into "Program", then strip it from "Department".
# regex=True is required: without it, pandas >= 2.0 treats the pattern as a
# literal string containing "|", which never matches, so the labels would be
# left inside "Department".
data['Program'] = data['Department'].str.extract('(UG Degree|PG Taught)', expand=False)
data['Department'] = (
    data['Department']
    .str.replace('UG Degree|PG Taught', '', regex=True)
    .str.strip()
)

prev_program = None  # 'Program' value of the previous row
prev_index = None    # index label of the previous row

# When a 'PG Taught' row directly follows a 'UG Degree' row and still carries
# department text, that text is appended to the preceding row's department
# and cleared on the current row.
for index, row in data.iterrows():
    if prev_program == 'UG Degree' and row['Program'] == 'PG Taught' and row['Department'].strip():
        # Concatenate the department name from the current row to the previous row
        data.at[prev_index, 'Department'] += ' ' + row['Department']
        # Clear the department in the current row
        data.at[index, 'Department'] = ''

    # Update previous row info
    prev_program = row['Program']
    prev_index = index

# Treat empty departments as missing and fill them from the row above.
# .ffill() replaces the deprecated fillna(method='ffill') spelling.
data['Department'] = data['Department'].replace('', pd.NA).ffill()

# Expand department names that appear truncated with a trailing ".." to
# their full form. Only exact matches are replaced.
DEPARTMENT_FULL_NAMES = {
    "European In..": "European Institute",
    "Gender Stud..": "Gender Studies",
    "Geography and Environ..": "Geography and Environment",
    "International..": "International Development",
    "Media and C..": "Media and Communications",
    "Philosophy, Logic and S..": "Philosophy, Logic and Scientific Method",
    "Psychologic..": "Psychological and Behavioural Sciences",
    "School of Pu..": "School of Public Policy",
}
data['Department'] = data['Department'].replace(DEPARTMENT_FULL_NAMES)

# Final column names; the assignment below requires the frame to have
# exactly this many columns.
new_columns = ['Nationality', 'Department', 'Program',
               'Applications 2019', 'Offers 2019', 'Entrances 2019',
               'Applications 2020', 'Offers 2020', 'Entrances 2020',
               'Applications 2021', 'Offers 2021', 'Entrances 2021',
               'Applications 2022', 'Offers 2022', 'Entrances 2022',
               'Applications 2023', 'Offers 2023', 'Entrances 2023']
data.columns = new_columns

# Save the modified DataFrame to the new output CSV file
data.to_csv(output_file, index=False)

# Display the modified data
data = pd.read_csv(output_file)
data