In [None]:
import pandas as pd
import random
import csv

'''
Initial approach (kept for reference), viable because the given sample is stated to be representative:

import pandas as pd

# Read the original dataset
original_data = pd.read_csv("./Data.csv")

# Perform random sampling to create synthetic population of 50,000 individuals
synthetic_population = original_data.sample(n=50000, replace=True)

# Save the synthetic population to a file
synthetic_population.to_csv("synthesized_population.csv", index=False)

# synthetic_population
'''

# Load the survey sample into a pandas DataFrame
data = pd.read_csv("Data.csv")

# Joint probability of every OBSERVED (sex, age, education) combination.
# The result is indexed by a three-level MultiIndex and has a single column
# named 'count' which, because of normalize=True, holds relative frequencies.
joint_prob_table = pd.crosstab(index=[data['Sex'], data['Age_category'], data['Highest_education_level']],
                               columns='count',
                               normalize=True)

# Full conditional distributions used by the sampler, keyed by (sex, age, edu):
#   conditional_prob_sex[(s, a, e)] = P(sex = s | age = a, edu = e)
#   conditional_prob_age[(s, a, e)] = P(age = a | sex = s, edu = e)
#   conditional_prob_edu[(s, a, e)] = P(edu = e | sex = s, age = a)
conditional_prob_sex = {}
conditional_prob_age = {}
conditional_prob_edu = {}

# Iterate through all possible combinations of sex, age, and education level
for sex in range(1, 3):  # Sex: 1, 2
    for age in range(1, 4):  # Age: 1, 2, 3
        for edu in range(0, 4):  # Education: 0, 1, 2, 3
            state = (sex, age, edu)

            # A combination that never occurs in the data has no row in the
            # crosstab, so looking it up would raise KeyError. Detect that
            # from the data (instead of hard-coding which cells are empty)
            # and record probability 0 in all three tables.
            if state not in joint_prob_table.index:
                conditional_prob_sex[state] = 0
                conditional_prob_age[state] = 0
                conditional_prob_edu[state] = 0
                continue

            joint = joint_prob_table.loc[state, 'count']
            # Each conditional is the joint divided by the marginal obtained
            # by summing the joint over the variable being sampled.
            conditional_prob_sex[state] = joint / joint_prob_table.loc[(slice(None), age, edu), 'count'].sum()
            conditional_prob_age[state] = joint / joint_prob_table.loc[(sex, slice(None), edu), 'count'].sum()
            conditional_prob_edu[state] = joint / joint_prob_table.loc[(sex, age, slice(None)), 'count'].sum()

# Most frequent (sex, age, education) COMBINATION. Taking it from the joint
# table guarantees the chain starts in a state that actually occurs;
# per-column modes (DataFrame.mode) could combine into an unobserved state.
mode_state = joint_prob_table['count'].idxmax()

# Store the mode state as a tuple of plain ints
initial_state = tuple(int(level) for level in mode_state)

# Initialize an empty list to store the new sample
new_sample = []

# Function to determine the next state based on sex
def next_state_sex(age, edu):
    """Draw a new sex while keeping age and education fixed.

    A single uniform number is compared with the module-level table entry
    conditional_prob_sex[1, age, edu]: sex 1 is chosen when the draw does
    not exceed it, sex 2 otherwise.

    Returns the resulting (sex, age, edu) tuple.
    """
    draw = random.random()
    sex = 1 if draw <= conditional_prob_sex[1, age, edu] else 2
    return (sex, age, edu)

# Function to determine the next state based on age
def next_state_age(sex, edu):
    """Draw a new age category while keeping sex and education fixed.

    One uniform number is compared against the running sum of the
    module-level conditional_prob_age entries for age 1 and then age 2;
    the first category whose cumulative probability covers the draw wins.
    Age 3 is the fallback when neither threshold is reached.

    Returns the resulting (sex, age, edu) tuple.
    """
    draw = random.random()
    threshold = 0.0
    for age in (1, 2):
        threshold += conditional_prob_age[sex, age, edu]
        if draw <= threshold:
            return (sex, age, edu)
    return (sex, 3, edu)

# Function to determine the next state based on education
def next_state_edu(sex, age):
    """Draw a new education level while keeping sex and age fixed.

    One uniform number is compared against the running sum of the
    module-level conditional_prob_edu entries for levels 0, 1 and 2, in
    that order; the first level whose cumulative probability covers the
    draw wins. Level 3 is the fallback when no threshold is reached.

    Returns the resulting (sex, age, edu) tuple.
    """
    draw = random.random()
    threshold = 0.0
    for edu in (0, 1, 2):
        threshold += conditional_prob_edu[sex, age, edu]
        if draw <= threshold:
            return (sex, age, edu)
    return (sex, age, 3)

# Generate 50,000 synthetic individuals. Every individual is the state
# reached after 10 sweeps, where one sweep resamples sex, then age, then
# education, each conditioned on the current values of the other two.
for _draw in range(50000):
    # Continue the chain from where the previous individual ended
    next_state = initial_state
    for _sweep in range(10):
        next_state = next_state_sex(*next_state[1:])
        next_state = next_state_age(next_state[0], next_state[2])
        next_state = next_state_edu(*next_state[:2])
    # Record the individual and make it the starting point of the next draw
    new_sample.append(next_state)
    initial_state = next_state

# Define the file path for the CSV file
csv_file_path = "synthesized_population.csv"

# Define the header for the CSV file
header = ["Sex", "Age_category", "Highest_education_level"]

# Write the header followed by every sampled (sex, age, edu) tuple in one
# pass; opening the file once avoids the separate write-then-append reopen.
# newline='' is required so the csv module controls line endings itself.
with open(csv_file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)
    writer.writerows(new_sample)

# Read the synthetic population dataset. The file name must match the one
# written by the sampling step ("synthesized", spelled with a "z").
synthetic_population = pd.read_csv("./synthesized_population.csv")

# Frequency of each category, reindexed on the full list of category codes so
# that every code gets a row (0 when absent from the sample) and the order is
# fixed. Positional access on a bare value_counts() would silently shift the
# labels, or raise IndexError, whenever a category is missing.

# Calculate frequency of each category for Sex (codes 1, 2)
sex_frequency = synthetic_population['Sex'].value_counts().reindex([1, 2], fill_value=0)

# Calculate frequency of each category for Age_category (codes 1, 2, 3)
age_category_frequency = synthetic_population['Age_category'].value_counts().reindex([1, 2, 3], fill_value=0)

# Calculate frequency of each category for Highest_education_level (codes 0-3)
education_level_frequency = synthetic_population['Highest_education_level'].value_counts().reindex([0, 1, 2, 3], fill_value=0)

# Create a list of tuples for the MultiIndex. Labels are listed in ascending
# code order, i.e. the same order as the reindexed frequencies above.
index = pd.MultiIndex.from_tuples([('Sex', 'Male'), ('Sex', 'Female'), ('Age_group', 'Below 22 years'), ('Age_group', '22-60 years'), ('Age_group', 'Above 60 years'), ('Highest_education_level', 'No_formal education'), ('Highest_education_level', 'Primary education'), ('Highest_education_level', 'Secondary education'), ('Highest_education_level', 'Graduation and above')], names=['Variable', 'Description'])

# Data
data = {
    'Frequency': sex_frequency.tolist() + age_category_frequency.tolist() + education_level_frequency.tolist()
}

# Create DataFrame
df = pd.DataFrame(data, index=index)

# Save the frequency table as plain text
with open('Answer.txt', 'w') as file:
    file.write(df.to_string())

# Reference frequencies that the synthetic population is expected to
# reproduce, one (variable, description, frequency) row per category.
expected_rows = [
    ('Sex', 'Male', 25324),
    ('Sex', 'Female', 24676),
    ('Age_group', 'Below 22 years', 17955),
    ('Age_group', '22-60 years', 29642),
    ('Age_group', 'Above 60 years', 2403),
    ('Highest_education_level', 'No_formal education', 7490),
    ('Highest_education_level', 'Primary education', 5655),
    ('Highest_education_level', 'Secondary education', 24400),
    ('Highest_education_level', 'Graduation and above', 12455),
]

# Two-level row index built from the first two fields of every row
index = pd.MultiIndex.from_tuples(
    [(variable, description) for variable, description, _ in expected_rows],
    names=['Variable', 'Description'],
)

# Sample Answer that we should obtain
data = {'Frequency': [frequency for _, _, frequency in expected_rows]}

# Assemble the table and display it
df = pd.DataFrame(data, index=index)
print(df)



                                              Frequency
Variable                Description                    
Sex                     Male                      25324
                        Female                    24676
Age_group               Below 22 years            17955
                        22-60 years               29642
                        Above 60 years             2403
Highest_education_level No_formal education        7490
                        Primary education          5655
                        Secondary education       24400
                        Graduation and above      12455
