In [19]:
import pandas as pd
import numpy as np
from faker import Faker
import json

# Seed every random source used by this notebook so that
# Restart Kernel -> Run All regenerates the same synthetic dataset.
# Faker.seed() seeds the random generator shared by Faker instances;
# np.random.seed() seeds NumPy's legacy global generator, which the
# np.random.normal / np.random.choice / np.random.randint calls below use.
SEED = 42
Faker.seed(SEED)
np.random.seed(SEED)

# Shared Faker instance used by the generator helpers below.
fake = Faker()

In [15]:
# Functions to generate a fake applicant name and a random 20-character alphanumeric applicant ID
def generate_name():
    """Return one randomly generated name produced by the shared ``fake`` instance."""
    full_name = fake.name()
    return full_name

def generate_applicant_id():
    """Return a random 20-character ID made of the digits 0-9 and the letters A-Z.

    The characters are picked by ``fake.random_choices`` and concatenated
    into a single string.
    """
    alphabet = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    picked_chars = fake.random_choices(elements=alphabet, length=20)
    return ''.join(picked_chars)

def generate_casper_score():
    """Return one Casper score drawn from a normal distribution.

    The draw uses mean 0 and standard deviation 1.5 from NumPy's global
    random generator, and the result is rounded to two decimal places.
    """
    raw_score = np.random.normal(loc=0, scale=1.5)
    return round(raw_score, 2)

def generate_highest_education_parents():
    """Return one randomly chosen label for the parents' highest education level.

    Each of the four labels is selected with probability 0.25 using
    NumPy's global random generator.
    """
    education_levels = ['high school', 'bachelors degree', 'graduate degree', 'professional degree']
    weights = [0.25, 0.25, 0.25, 0.25]
    return np.random.choice(education_levels, p=weights)

def generate_household_income():
    """Return one randomly chosen household-income bracket label.

    Each of the five bracket labels is selected with probability 0.2 using
    NumPy's global random generator.
    """
    income_brackets = ['< 50000', '50000 to 75000', '75000 to 100000', '100000 to 150000', '> 150000']
    weights = [0.2, 0.2, 0.2, 0.2, 0.2]
    return np.random.choice(income_brackets, p=weights)

# Load the student summaries table from disk into a DataFrame.
df_student_summaries = pd.read_csv(filepath_or_buffer='student_summaries.csv')


In [29]:
# Read the schools table from disk, then draw a reproducible sample of
# 1000 rows (fixed random_state) restricted to the 'zip_location' and
# 'meps_poverty_pct' columns.
df_schools = pd.read_csv('df_schools.csv')
df_sampled_schools = (
    df_schools[['zip_location', 'meps_poverty_pct']]
    .sample(n=1000, random_state=1)
)

In [36]:
import json

# Parse institutions.json into a list of institution records.
with open('institutions.json', mode='r') as institutions_file:
    institutions = json.load(institutions_file)

# Collect the 'unitid' of every institution record, preserving file order.
unitids = []
for institution in institutions:
    unitids.append(institution['unitid'])


In [37]:
# Generate the dataset.
#
# The number of applicants is taken from the sampled school rows, so every
# column assigned below has the same length by construction instead of
# relying on a repeated literal that must match the sample size.
n_applicants = len(df_sampled_schools)

# One summary is needed per applicant. Fail early with a readable message
# rather than a pandas length-mismatch error when the file is too short;
# when it is longer, only the first n_applicants summaries are used.
if len(df_student_summaries) < n_applicants:
    raise ValueError(
        "student summaries has {} rows but {} applicants are being generated".format(
            len(df_student_summaries), n_applicants
        )
    )

# Each applicant applies to between 1 and 8 distinct institutions. Sampling
# without replacement cannot draw more items than exist, so the upper bound
# is capped by the number of available unitids.
max_applications = min(8, len(unitids))
if max_applications < 1:
    raise ValueError("unitids is empty; cannot generate the 'applied_to' column")

data = {
    'applicant_id': [generate_applicant_id() for _ in range(n_applicants)],
    'name': [generate_name() for _ in range(n_applicants)],
    'casper_z': [generate_casper_score() for _ in range(n_applicants)],
    'parent_ed': [generate_highest_education_parents() for _ in range(n_applicants)],
    'household_income': [generate_household_income() for _ in range(n_applicants)],
}

df = pd.DataFrame(data)

# .values drops the sampled rows' original index so the columns line up
# positionally with the freshly built 0..n-1 index of df.
# NOTE(review): 'zip_location' is copied exactly as loaded. If read_csv parsed
# it as integers, ZIP codes with leading zeros are shortened - confirm dtype.
# NOTE(review): 'meps_poverty_pct' can contain NaN (visible in the printed
# head), which is exported as null - confirm that is acceptable downstream.
df['zip'] = df_sampled_schools['zip_location'].values
df['meps_poverty_pct'] = df_sampled_schools['meps_poverty_pct'].values
df['summary'] = df_student_summaries['summary'].values[:n_applicants]

# Generate the 'applied_to' column: a list of distinct unitids per applicant.
# randint's upper bound is exclusive, hence the + 1.
df['applied_to'] = [
    np.random.choice(
        unitids,
        size=np.random.randint(1, max_applications + 1),
        replace=False,
    ).tolist()
    for _ in range(n_applicants)
]

print(df.head())

           applicant_id               name  casper_z         parent_ed  \
0  9DEDTDUZ5Q9FHBUJPRHZ      Brandon Davis     -0.48   graduate degree   
1  2DITI9L1OBI7RM30FDU6  Courtney Anderson     -0.32  bachelors degree   
2  KZHA4K0MLMG1AO54V4PG     Gabriel Wilson     -2.28  bachelors degree   
3  JQIH6NGTNPFZIF3CUE1R       Kenneth Diaz      0.17       high school   
4  W4N4YJE7UB1H26JP4PHD         Donna Hall     -2.11  bachelors degree   

  household_income    zip  meps_poverty_pct  \
0         > 150000  35023         14.274462   
1          < 50000  72662         19.373596   
2   50000 to 75000  94539          0.000000   
3   50000 to 75000  92626               NaN   
4   50000 to 75000  93535         19.556446   

                                             summary  \
0  The response included a discussion of clear co...   
1  The response lacked detail about how the stude...   
2  The response briefly mentioned the use of incl...   
3  The student demonstrated strong leadership sk

In [38]:
# Serialise the applicants DataFrame as a JSON array with one object per row.
json_data = df.to_json(orient='records')

# Re-parse the JSON string into Python objects so it can be written back
# out pretty-printed with a 4-space indent.
parsed_json = json.loads(json_data)

with open('applicants.json', mode='w') as output_file:
    json.dump(parsed_json, output_file, indent=4)