# 1. Importing necessary libraries/modules

In [18]:
import pandas as pd

# 2. Loading the data

In [19]:
# Location of the ONS survey workbook, relative to this notebook's folder
file_path = (
    '../1_raw_files/ons_education_survey/'
    'Experiences of Education during Lockdown.xlsx'
)

In [20]:
# Open the workbook only long enough to list its sheets; the context manager
# closes the underlying file handle as soon as the names have been read.
with pd.ExcelFile(file_path, engine='openpyxl') as workbook:
    sheet_names = workbook.sheet_names

In [21]:
# Display the list of sheet names read from the workbook
sheet_names 

['Cover_sheet', 'Table_of_contents', 'Notes', '1.1', '1.2', '2', '3']

In [22]:
# Keep only the last four sheets of the workbook.
# NOTE(review): the slice is positional, so it assumes the sheets to process are
# always the final four in the workbook — confirm if the workbook layout changes.
selected_sheets = sheet_names[-4:]
# Echo the selection as the cell output
selected_sheets

['1.1', '1.2', '2', '3']

# 3. Creating a dictionary of dataframes representing the table on each sheet

In [23]:
# Number of data rows (counted after the sheet's first row, which pandas reads as
# the header) that sit above the row holding the real column names.
ROWS_ABOVE_HEADER = 8

# Number of characters trimmed from the end of each sheet title before it is
# used as a dictionary key.
# NOTE(review): presumably a trailing note marker such as "[note 1]" — confirm.
TITLE_SUFFIX_LENGTH = 8


def remove_new_line(string):
    """Return ``string`` with every newline character replaced by a space.

    Non-string values (for example a NaN header cell) are returned unchanged
    so that a partially empty header row does not raise.
    """
    if not isinstance(string, str):
        return string
    return string.replace('\n', ' ')


# Maps the cleaned, lower-cased sheet title to the table found on that sheet
worksheets = {}

# Read every selected sheet in a single pass over the workbook; passing a list
# as sheet_name makes read_excel return a dict keyed by sheet name.
raw_sheets = pd.read_excel(file_path, sheet_name=selected_sheets, engine='openpyxl')

for sheet in selected_sheets:
    dataframe = raw_sheets[sheet]

    # The sheet title lives in the first header cell: keep the text after the
    # last ': ', trim the fixed-length suffix, then normalise whitespace/case.
    title_cell = dataframe.columns.tolist()[0]
    name_of_worksheet = title_cell.split(': ')[-1][:-TITLE_SUFFIX_LENGTH].strip().lower()

    # Skip the rows above the table and drop columns that are entirely empty
    dataframe = dataframe.iloc[ROWS_ABOVE_HEADER:].dropna(axis=1, how='all')

    # The first remaining row holds the column names (with embedded newlines).
    # Converting to a plain list avoids carrying the row label over as the
    # name of the columns index.
    column_names = dataframe.iloc[0].apply(remove_new_line).tolist()

    # Drop the header row from the data and renumber the rows from zero
    dataframe = dataframe.iloc[1:].reset_index(drop=True)
    dataframe.columns = column_names

    # Adding to the dictionary of tables
    worksheets[name_of_worksheet] = dataframe

# 4. Processing the dataframes into a more usable format and saving them to file

In [24]:
# List the dictionary keys, i.e. the cleaned sheet titles under which each table was stored
worksheets.keys()

dict_keys(['experiences of education and learning during the first lockdown (april to july 2020), by income-related deprivation', 'experiences of education and learning during the third lockdown (january to march 2021), by income-related deprivation', 'experiences of education recovery and catch-up activities, by income-related deprivation', 'future plans and aspirations, by income-related deprivation'])

In [25]:
# Pull out a single table (education recovery and catch-up activities) by its key for inspection
dataframe = worksheets['experiences of education recovery and catch-up activities, by income-related deprivation']

In [26]:
# Display the selected table as the cell output
dataframe

8,Survey question and response options,All persons %,Most deprived %,2nd quintile %,3rd quintile %,4th quintile %,Least deprived %,All persons LCL,All persons UCL,Most deprived LCL,Most deprived UCL,2nd quintile LCL,2nd quintile UCL,3rd quintile LCL,3rd quintile UCL,4th quintile LCL,4th quintile UCL,Least deprived LCL,Least deprived UCL
0,"During Year 11, did you attend school in perso...",,,,,,,,,,,,,,,,,,
1,Yes,97,95,96,97,98,98,96,97,94,96,95,97,96,98,98,99,97,99
2,"In Year 11, I was home-schooled for the whole ...",3,5,4,3,2,2,3,4,4,6,3,5,2,4,1,2,1,3
3,Among those who attended school in person at a...,,,,,,,,,,,,,,,,,,
4,Still thinking about these periods when most s...,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,Additional classes during school holidays or a...,11026,2458,2203,2026,2147,2064,,,,,,,,,,,,
103,Thinking about the disruption to your educatio...,,,,,,,,,,,,,,,,,,
104,Weighted sample size,12828,3007,2617,2350,2382,2316,,,,,,,,,,,,
105,How do you feel that your overall motivation t...,,,,,,,,,,,,,,,,,,


In [27]:
# Name of the column that holds question text and response options
LABEL_COLUMN = 'Survey question and response options'


def spread_labels(start_rows, labels, end_row):
    """Build one label per table row by repeating each label over its span.

    ``labels[i]`` is repeated for every row from ``start_rows[i]`` up to (but
    not including) the next start row; the last label runs up to ``end_row``.
    ``start_rows`` must be in ascending order. Returns a plain list.
    """
    bounds = list(start_rows) + [end_row]
    column = []
    for label, start, stop in zip(labels, bounds, bounds[1:]):
        column.extend([label] * (stop - start))
    return column


# For every sheet: keep only the first table, attach to each response row the
# question (and, where present, the classifier heading) it sits under, and
# save the result as a CSV.
# The row arithmetic below treats index labels as row positions, which holds
# because each table in `worksheets` was given a fresh 0..n-1 index.
for filename, dataframe in worksheets.items():
    # The first table ends at the first row that has a cell mentioning 'Table'.
    # If no such row exists, the whole sheet is treated as the first table.
    delimit_first_table = dataframe.map(lambda x: isinstance(x, str) and 'Table' in x).any(axis=1)
    table_marker_rows = dataframe[delimit_first_table].index.tolist()
    first_table_limit = table_marker_rows[0] if table_marker_rows else len(dataframe)

    # Heading rows (questions / classifiers) are the rows that are mostly empty
    threshold = len(dataframe.columns) * 0.5
    rows_which_contain_many_nans = dataframe[dataframe.isna().sum(axis=1) > threshold]
    indices = rows_which_contain_many_nans.index.tolist()
    heading_rows = set(indices)

    # A heading row directly followed by another heading row is a classifier
    # sitting above a question. Only rows inside the first table are considered.
    higher_indices = [
        row for row in indices
        if row < first_table_limit and (row + 1) in heading_rows
    ]
    classifier_rows = set(higher_indices)

    # Every other heading row inside the first table is an ordinary question
    first_table_indices = [
        row for row in indices
        if row < first_table_limit and row not in classifier_rows
    ]

    # Read the label text from the very rows that define the positions, so the
    # labels and their spans cannot drift apart (no text matching is needed,
    # and no list is mutated while it is being iterated).
    heading_text = dataframe[LABEL_COLUMN]
    questions = heading_text.loc[first_table_indices].tolist()
    classifiers = heading_text.loc[higher_indices].tolist()

    # Each label applies from its own row until the next label of the same
    # kind. Rows above the first label get None; that leading span is empty
    # when the first label is on row 0.
    questions_column = spread_labels(
        [0] + first_table_indices, [None] + questions, first_table_limit
    )
    hierarchical_questions_column = spread_labels(
        [0] + higher_indices, [None] + classifiers, first_table_limit
    )

    # Adding the questions and classifier columns to the table
    final_table = dataframe[:first_table_limit].copy()
    final_table['questions'] = questions_column
    final_table['classifier'] = hierarchical_questions_column

    # With the labels attached, drop the heading rows themselves: they are the
    # rows in which fewer than half of the columns hold a value
    final_table = final_table.dropna(thresh=len(final_table.columns) * 0.5)

    # Put the two label columns first and rename the response column
    final_table = (
        final_table[['classifier', 'questions'] + dataframe.columns.tolist()]
        .rename(columns={LABEL_COLUMN: 'response'})
    )

    # The spreadsheet contains 'None' as a genuine response, which is read in
    # as NaN; restore it as the string 'None'
    final_table['response'] = final_table['response'].fillna('None')

    # Build the file name outside the f-string: reusing the same quote
    # character inside an f-string is a syntax error before Python 3.12
    csv_name = (
        filename.lower()
        .replace(' ', '_')
        .replace('(', '')
        .replace(')', '')
        .replace(',', '')
    )
    final_table.to_csv(rf'../4_integrated_csv_files/ons_education/{csv_name}.csv')