<a href="https://colab.research.google.com/github/autinn/legacy-assignment/blob/main/m28_legacy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Merging Data

In [None]:
import pandas as pd


# Read the three input tables: the questionnaire checklist, the per-student
# legacy/demographic info, and the raw foundation questionnaire responses
questionnaire_checklist = pd.read_csv('checklist.csv')
legacy_info = pd.read_csv('legacy.csv')
foundation_responses = pd.read_csv('response.csv')

# Header of the questionnaire column that holds each student's credo choices
credo_question = 'Choose 5 statements that resonate most with you. Please read through all of them before selecting your 5.'

# Only the columns needed from each side table
credo_answers = foundation_responses[['Email Address', credo_question]]
demographics = legacy_info[['Minerva Email', 'Gender','Country_1', 'Country_2']]

# 1) inner-join the checklist with the credo answers (emails must match),
# 2) left-join gender and country information,
# 3) drop the duplicate e-mail column brought in by the first join,
# 4) give the long question header a short name.
merged_df = (
    questionnaire_checklist
    .merge(credo_answers,
           left_on='Minerva Email',
           right_on='Email Address',
           how='inner')
    .merge(demographics,
           on='Minerva Email',
           how='left')
    .drop(columns=['Email Address'])
    .rename(columns={credo_question: 'Credo Selection'})
)

# Preview the first rows of the merged table
merged_df.head(20)


Unnamed: 0,Questionnaire,Full Name,Minerva Email,Credo Selection,Gender,Country_1,Country_2
0,True,Abhishek Amit Roy,abhroy@uni.minerva.edu,I honor the relationships and connections I ha...,male,India,
1,True,Ada Choudhry,ada.choudhry@uni.minerva.edu,I consistently surprise myself by accomplishin...,female,India,
2,True,Aditya Jha,aditya.jha@uni.minerva.edu,I honor the relationships and connections I ha...,male,India,
3,True,Ahmed Usman Khan,ahmed.khan@uni.minerva.edu,I honor the relationships and connections I ha...,male,Pakistan,
4,True,Aika H Meleus,aika.meleus@uni.minerva.edu,I honor the relationships and connections I ha...,female,United States,
5,True,Aleksandra Spasova Daskalova,alexandra.daskalova@uni.minerva.edu,"I see connections, realize the cyclical and pe...",female,Bulgaria,
6,True,Almas Sapar,almas.sapar@uni.minerva.edu,I see myself moving in an endless cycle forwar...,male,Kazakhstan,
7,True,Altair Adilkhan,altair.adilkhan@uni.minerva.edu,I honor the relationships and connections I ha...,male,Kazakhstan,
8,True,Alua Turumbayeva,alua.turumbayeva@uni.minerva.edu,I see myself moving in an endless cycle forwar...,female,Kazakhstan,
9,True,AMELIE Marie BADHEN,amelie.badhen@uni.minerva.edu,I honor the relationships and connections I ha...,female,United Kingdom,


In [None]:
# Write the merged table to 'updated.csv' without the row index
merged_df.to_csv('updated.csv', index=False)

# Categorise to Legacy

In [None]:
# legacy selection

# 1. Create a dictionary for the legacies: each legacy name maps to the list of
#    credo statements that identify it (a single statement per legacy here)
legacies = {
    "Cable": [
        "I honor the relationships and connections I have made in life, both internally and externally."
    ],
    "Chronicle": [
        "I take note of what is happening around me and communicate that as authentically as possible."
    ],
    "Circuit": [
        "I see connections, realize the cyclical and periodic nature of life's journeys, and understand how they are all connected."
    ],
    "Civic": [
        "I think personal duty to others is critical; I feel a responsibility to create more value for the world than I extract."
    ],
    "Eureka": [
        "I consistently surprise myself by accomplishing things I previously believed I could not."
    ],
    "Field": [
        "I embrace the principles of nurture and growth through teamwork, perseverance, and preparedness  – a blade of grass does not make a field, but many blades together do."
    ],
    "Gate": [
        "I see myself moving in an endless cycle forward — continuously improving towards the next goal, the next dream, the next level, the next gate."
    ],
    "Hunter": [
        "I use my intuition to guide me through uncertainty and am constantly hunting down my goals."
    ],
    "Labyrinth": [
        "I appreciate the intrinsic value of the journey itself, with all of its wrong turns and out-of-the-way stops."
    ],
    "Lands": [
        "I develop a better understanding of both familiar and foreign lands, through deep cultural immersion."
    ],
    "Laurel": [
        "I constantly ask questions and explore new ideas — never resting on my laurels."
    ],
    "Legion": [
        "I take up arms for causes I choose to dedicate myself to, and work with others to achieve victory."
    ],
    "Liberty": [
        "I seek to understand the role freedom plays in each of our lives, considering what aspects of life make me feel most vital, capable, and effective."
    ],
    "Mason": [
        "I commit to working hard to accomplish shared goals, exploring common interests with those around me, and embracing lifelong learning."
    ],
    "Mission": [
        "I focus on finding my personal mission—that which engenders the strongest connection I can feel to the world."
    ],
    "North": [
        "I put faith in a future unrealized and look beyond what is immediately evident."
    ],
    "Ocean": [
        "I search for both breadth, the examination of vast expanses, and depth, exploration far beyond what can be immediately seen on the surface."
    ],
    "Octagon": [
        "I strive for balance and harmony among the various aspects of life."
    ],
    "Pier": [
        "I stand for the conviction I build before embarking on a journey into the boundless ocean, accompanied by the promise of a future of my own making."
    ],
    "Plaza": [
        "I step out of my comfort zone as a way to welcome others into my life."
    ],
    "Pyramid": [
        "I build my core identity through intentional reflection and introspection."
    ],
    "Reserve": [
        "I recognize the distinction between those things that may seem necessary but need not be kept, and those that appear unnecessary but are actually essential to retain."
    ],
    "Tower": [
        "I abandon fear and climb down to look beyond pleasant views." # Second sentence of the credo ("Then embrace courage and climb up again.") is omitted for convenience
    ],
    "Union": [
        'I understand "union" as more than just an act of meeting.' # Second sentence of the credo ("It is a connection, a binding force, a loyalty, and a meeting of profundity.") is omitted for convenience
    ],
    "Vista": [
        "I draw on my personal narrative, by taking snapshots of my life over time, being attentive to details, and staying open to change."
    ]
}

In [None]:
# Function to break down the credo selection into five sentences
import re

def breakdown_credo_selection(credo_selection):
    """Extract up to five credo statements from a raw questionnaire answer.

    A statement is the shortest run of text that starts at the stand-alone
    word "I" and ends at the next full stop. Anchoring on the *word* "I"
    (``\\bI\\b``) keeps follow-up sentences such as "It is a connection, ..."
    from being counted as a statement of their own, which would otherwise
    take one of the five slots and push out a real selection.

    Parameters
    ----------
    credo_selection : str
        Raw cell value holding all selected statements in one string.
        Non-string values (e.g. NaN for an empty answer) are treated as
        "no selection".

    Returns
    -------
    str
        The first five matched statements joined by newlines, or an empty
        string when nothing matches.
    """
    # Empty answers come through pandas as float NaN; re.findall would raise on them
    if not isinstance(credo_selection, str):
        return ''

    # \bI\b matches "I" only as a whole word, so "It", "In", ... are not sentence starts
    sentences = re.findall(r'\bI\b.*?\.', credo_selection)
    return '\n'.join(sentences[:5])  # Keep at most the first five statements

# Parse every raw answer into a newline-separated list of credo statements
credo_column = merged_df['Credo Selection'].apply(breakdown_credo_selection)
merged_df['Credo'] = credo_column

# Show the parsed statements of the first student as a quick check
print(merged_df.loc[0, "Credo"])

# Save the table including the new 'Credo' column
merged_df.to_csv('updated.csv', index=False)

I honor the relationships and connections I have made in life, both internally and externally.
I embrace the principles of nurture and growth through teamwork, perseverance, and preparedness  – a blade of grass does not make a field, but many blades together do.
I use my intuition to guide me through uncertainty and am constantly hunting down my goals.
I commit to working hard to accomplish shared goals, exploring common interests with those around me, and embracing lifelong learning.
I step out of my comfort zone as a way to welcome others into my life.


In [None]:
# Reverse lookup built from `legacies`: credo statement -> legacy name
credo_to_legacy = {}
for legacy_name, credo_statements in legacies.items():
    for statement in credo_statements:
        credo_to_legacy[statement] = legacy_name

def match_credo_to_legacy(credo_list):
    """Translate credo statements into legacy names, in the same order.

    Each statement is looked up in ``credo_to_legacy``; a statement with no
    entry there is reported as "Unknown".
    """
    return [credo_to_legacy.get(statement, "Unknown") for statement in credo_list]

# 'Credo' holds the selected statements separated by newlines; split them and
# map each one to its legacy name
merged_df['Legacy'] = merged_df['Credo'].apply(
    lambda credo_text: match_credo_to_legacy(credo_text.split('\n'))
)

# The text columns are no longer needed once the legacy list exists
merged_df.drop(columns=['Credo Selection', 'Credo'], inplace=True)
merged_df.to_csv('updated.csv', index=False)

# Assign to legacy
Students are assigned based on country diversity, with each legacy group limited to 5–6 students.

In [None]:
# Put the students in random order (sample all rows), then renumber the index
shuffled_students = merged_df.sample(frac=1)
merged_df = shuffled_students.reset_index(drop=True)
merged_df.to_csv('updated.csv', index=False)


# Loop through the randomised student list and assign each person to the first available legacy group from preference, consider demographic diversity and group size
# Balance demographics - ensure each legacy group maintains a mix of people from different countries

In [None]:
# Randomise legacy selection
def shuffle_legacy(legacy_list):
    """Return a randomly reordered copy of ``legacy_list``.

    The input list itself is not modified.
    """
    from random import shuffle

    reordered = legacy_list.copy()
    shuffle(reordered)
    return reordered

# Randomise the order of each student's legacy list, then save
shuffled_preferences = merged_df['Legacy'].apply(shuffle_legacy)
merged_df['Legacy'] = shuffled_preferences
merged_df.to_csv('updated.csv', index=False)

In [None]:
# Checklist rows whose 'Questionnaire' flag is False
not_answered = questionnaire_checklist['Questionnaire'].eq(False)
missing_students = questionnaire_checklist[not_answered]

# Attach gender and country information to those students
missing_df = missing_students.merge(
    legacy_info[['Minerva Email', 'Gender', 'Country_1', 'Country_2']],
    on='Minerva Email',
    how='left',
)

print(len(missing_df))

8


In [None]:
import pandas as pd
import random

# Assign students to legacies
def _place_student(row, candidates, legacy_groups, max_group_size):
    """Try to add ``row`` to one of the ``candidates`` groups; return True on success.

    Candidates are tried from the currently smallest group to the largest, in
    random order among groups of equal size. A group accepts the student only
    if it has fewer than ``max_group_size`` members and none of its members
    has the same ``Country_1`` value. Candidate names that are not keys of
    ``legacy_groups`` are ignored.
    """
    import random

    # Bucket the candidate legacies by their current member count
    by_size = {}
    for legacy in candidates:
        if legacy in legacy_groups:
            by_size.setdefault(len(legacy_groups[legacy]), []).append(legacy)

    # Smaller groups first; ties are broken randomly
    for size in sorted(by_size):
        tied = by_size[size]
        random.shuffle(tied)
        for legacy in tied:
            members = legacy_groups[legacy]
            if len(members) >= max_group_size:
                continue  # group is full
            if row['Country_1'] in [person['Country_1'] for person in members]:
                continue  # country already represented in this group
            members.append(row)
            return True
    return False


def legacy_assignment(df, missing_df, max_group_size=6):
    """Assign students to legacy groups and write the result to CSV files.

    Parameters
    ----------
    df : pandas.DataFrame
        Students with a 'Legacy' column (iterable of preferred legacy names)
        and a 'Country_1' column. Rows are processed in the given order.
    missing_df : pandas.DataFrame
        Students without preferences (needs a 'Country_1' column); they are
        processed after ``df`` and may be placed in any legacy.
    max_group_size : int, default 6
        Maximum number of members per legacy group.

    Side effects
    ------------
    Prints the size of every group plus the assigned/unassigned totals, and
    writes 'legacy_groups.csv' (all placed students with an 'Ultimate Legacy'
    column) and 'unassigned_students.csv' (students that fit no group).
    Group names are taken from the notebook-level ``legacies`` dictionary.
    """
    # Students for whom no candidate group satisfied the size/country rules
    unassigned = []

    # One empty group per legacy
    legacy_groups = {legacy: [] for legacy in legacies.keys()}

    # Students with preferences: only their own legacies are candidates
    for _, row in df.iterrows():
        if not _place_student(row, row['Legacy'], legacy_groups, max_group_size):
            unassigned.append(row)

    # Students without preferences: every legacy is a candidate
    for _, row in missing_df.iterrows():
        if not _place_student(row, list(legacy_groups), legacy_groups, max_group_size):
            unassigned.append(row)

    # Summary: size of each group, total assigned, total unassigned
    for legacy, members in legacy_groups.items():
        print(f"{legacy}: {len(members)}")

    total_assigned = sum(len(members) for members in legacy_groups.values())
    print(f"Total Assigned: {total_assigned}")

    print(f"Unassigned: {len(unassigned)}")

    # One DataFrame per group, tagged with the group name
    # (local is named group_df so the `df` parameter is not shadowed)
    legacy_group_dfs = []
    for legacy, members in legacy_groups.items():
        group_df = pd.DataFrame(members)
        group_df['Ultimate Legacy'] = legacy
        legacy_group_dfs.append(group_df)

    # Combine all groups into a single table
    legacy_group_df = pd.concat(legacy_group_dfs, ignore_index=True)

    unassigned_df = pd.DataFrame(unassigned)

    # Output to CSV files
    legacy_group_df.to_csv('legacy_groups.csv', index=False)
    unassigned_df.to_csv('unassigned_students.csv', index=False)

# Run the assignment: questionnaire respondents first, then the students
# who did not fill in the questionnaire
legacy_assignment(df=merged_df, missing_df=missing_df)


Cable: 6
Chronicle: 5
Circuit: 5
Civic: 6
Eureka: 6
Field: 5
Gate: 5
Hunter: 5
Labyrinth: 6
Lands: 5
Laurel: 5
Legion: 6
Liberty: 5
Mason: 5
Mission: 5
North: 5
Ocean: 5
Octagon: 6
Pier: 5
Plaza: 5
Pyramid: 6
Reserve: 6
Tower: 5
Union: 5
Vista: 6
Total Assigned: 134
Unassigned: 0


# Reorganise DataFrame

In [None]:
import pandas as pd

# Load the output CSV file containing the legacy group assignments
legacy_group_df = pd.read_csv('legacy_groups.csv')

# Keep only the required columns and give them reader-friendly names.
# rename() is chained without inplace=True so it returns a new DataFrame
# instead of modifying a slice of legacy_group_df, which is what raised
# pandas' SettingWithCopyWarning.
reorganized_df = (
    legacy_group_df[['Ultimate Legacy', 'Full Name', 'Country_1']]
    .rename(columns={'Ultimate Legacy': 'Legacy', 'Full Name': 'Student Name', 'Country_1': 'Citizenship'})
)

# Save the reorganized DataFrame to a new CSV file
reorganized_df.to_csv('reorganized_legacy_groups.csv', index=False)

# Display the reorganized DataFrame
reorganized_df.head()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reorganized_df.rename(columns={'Ultimate Legacy': 'Legacy', 'Full Name': 'Student Name', 'Country_1': 'Citizenship'}, inplace=True)


Unnamed: 0,Legacy,Student Name,Citizenship
0,Cable,Abhishek Amit Roy,India
1,Cable,Hyunjun Lee,South Korea
2,Cable,Che Zhu,China
3,Cable,Azat Samatuly,Kazakhstan
4,Cable,Martina Dianda Rodriguez-Villar,Spain
