In [7]:
import pandas as pd
import numpy as np

# Load the provided normalized dataset
file_path = 'Normalized_Gender_IDN.csv'
normalized_dataset = pd.read_csv(file_path)

# Define the indicators
indicators = {
    'Adjusted net enrollment rate, primary': [
        'Adjusted net enrollment rate, primary, female (% of primary school age children)',
        'Adjusted net enrollment rate, primary, male (% of primary school age children)'
    ],
    'School enrollment, preprimary': [
        'School enrollment, preprimary, female (% gross)',
        'School enrollment, preprimary, male (% gross)'
    ],
    'School enrollment, primary (gross)': [
        'School enrollment, primary, female (% gross)',
        'School enrollment, primary, male (% gross)'
    ],
    'School enrollment, primary (net)': [
        'School enrollment, primary, female (% net)',
        'School enrollment, primary, male (% net)'
    ],
    'School enrollment, secondary (gross)': [
        'School enrollment, secondary, female (% gross)',
        'School enrollment, secondary, male (% gross)'
    ],
    'School enrollment, secondary (net)': [
        'School enrollment, secondary, female (% net)',
        'School enrollment, secondary, male (% net)'
    ],
    'School enrollment, tertiary': [
        'School enrollment, tertiary, female (% gross)',
        'School enrollment, tertiary, male (% gross)'
    ]
}

# Flatten the list of indicators
indicator_columns = [column for sublist in indicators.values() for column in sublist]

# Filter the dataset for years in the range of 1960s to 1980s
filtered_dataset = normalized_dataset[normalized_dataset['Year'].between(1960, 1989)]

# Define a function for random sampling imputation
def random_sampling_imputation(df, column_name):
    non_null_values = df[column_name].dropna()
    if non_null_values.empty:
        return df[column_name]  # Return the column as is if no non-null values are available
    return df[column_name].apply(lambda x: np.random.choice(non_null_values) if pd.isnull(x) else x)

# Apply the random sampling imputation to the specified indicators
for column in indicator_columns:
    if column in filtered_dataset.columns and filtered_dataset[column].isnull().any():
        filtered_dataset[column] = random_sampling_imputation(filtered_dataset, column)

# Replace the years in the original dataset with the imputed data
normalized_dataset.update(filtered_dataset)

# Select only the specified indicators and the 'Year' column
final_dataset = normalized_dataset[['Year'] + indicator_columns]

# Save the final dataset
output_file_path_final = 'Imputed_Indicators_2.csv'
final_dataset.to_csv(output_file_path_final, index=False)

print(f'Final dataset with imputed values for specified indicators has been saved to {output_file_path_final}')

Final dataset with imputed values for specified indicators has been saved to Imputed_Indicators_2.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_dataset[column] = random_sampling_imputation(filtered_dataset, column)
