<a href="https://colab.research.google.com/github/brendanpshea/logic-prolog/blob/main/Werewolf_Python_Script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [52]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so every run produces the same dataset.
# NOTE: every random draw in this notebook shares this one generator, so
# reordering, adding, or removing a draw changes all values generated after it.
np.random.seed(42)

# Constants
N = 1500  # Total number of students
WEREWOLF_PERCENTAGE = 0.25  # Fraction of the student body that are werewolves
N_WEREWOLVES = int(N * WEREWOLF_PERCENTAGE)  # 375 with the values above
N_HUMANS = N - N_WEREWOLVES  # The remaining 1125 students

# Sex distribution: each student is independently 'Male' or 'Female'
# (np.random.choice without `p` samples the options uniformly).
sex = ['Male', 'Female']
students_sex = np.random.choice(sex, N)

# Height distribution (in inches)
# Assuming average heights for males and females are 69 and 64 inches respectively
# Standard deviation assumed to be 3 inches for both
# Both candidate arrays are drawn in full (N values each); np.where then keeps
# the male draw or the female draw for each student based on students_sex.
heights_human = np.where(students_sex == 'Male',
                         np.random.normal(69, 3, N),
                         np.random.normal(64, 3, N))

# Candidate werewolf height for every student: the human height plus a
# normally distributed bonus (mean 2 inches, standard deviation 0.5).
# Which students actually receive it is decided later from is_werewolf.
heights_werewolf = heights_human + np.random.normal(2, 0.5, N)

# Eye color: werewolves can have every normal color plus 'Yellow'
eye_colors = ['Brown', 'Blue', 'Green', 'Grey']
eye_colors_werewolf = eye_colors + ['Yellow']

# Full moon absences
# Assuming an average of 1 absence with a standard deviation of 2
# (raw float draws; they may be negative or fractional at this stage)
full_moon_absence_human = np.random.normal(1, 2, N)
# Werewolf candidates are the same draws shifted up by exactly 2 days
full_moon_absence_werewolf = full_moon_absence_human + 2

# GPA
# Normally distributed with mean 3.2 and standard deviation 0.5
# (base values only; adjusted for detentions and clipped further down)
gpa = np.random.normal(3.2, 0.5, N)

# Number of werewolf parents
def werewolf_parents(is_werewolf):
    """Randomly draw how many of a student's parents (0, 1 or 2) are werewolves.

    Args:
        is_werewolf: Truthy when the student is a werewolf; selects which
            set of probabilities is used for the draw.

    Returns:
        One of 0, 1 or 2, sampled with np.random.choice from the global
        NumPy RNG (exactly one draw is consumed per call).
    """
    parent_counts = [0, 1, 2]
    # Werewolf students are far more likely to have at least one werewolf parent.
    weights = [0.76, 0.2, 0.04] if is_werewolf else [0.96, 0.03, 0.01]
    return np.random.choice(parent_counts, p=weights)

# Detentions (Pareto distribution)
# Heavy-tailed draw: scaled by 3, truncated to whole numbers, then capped at 180.
alpha = 1.25  # Alpha parameter for the Pareto distribution
detentions = (np.random.pareto(alpha, N) * 3).astype(int)
detentions = np.minimum(180,detentions)

# Assign werewolf status: exactly N_WEREWOLVES True flags, shuffled in place
# so the werewolves are spread randomly across the student rows.
is_werewolf = np.array([True] * N_WEREWOLVES + [False] * N_HUMANS)
np.random.shuffle(is_werewolf)

# Pick the werewolf or human variant of height and full-moon absences per student
heights = np.where(is_werewolf, heights_werewolf, heights_human)
# astype(int) truncates toward zero; negative counts are clamped to 0 when the
# DataFrame is built below.
full_moon_absence = np.where(is_werewolf, full_moon_absence_werewolf, full_moon_absence_human).astype(int)

# Probabilities for eye colors, aligned position-by-position with the color lists.
# The werewolf weights are the normal weights scaled by 0.8, with the remaining
# 0.2 assigned to 'Yellow'.
probabilities_normal = [0.4, 0.3, 0.2, 0.1]  # For the normal eye colors list
probabilities_werewolf = [0.32, 0.24, 0.16, 0.08, 0.2]  # Adjusted for werewolf eye colors list

# Assigning eye colors: one draw per student, using the werewolf color list and
# weights only for students flagged as werewolves.
eye_colors_final = [np.random.choice(eye_colors_werewolf if is_wolf else eye_colors,
                                     p=probabilities_werewolf if is_wolf else probabilities_normal)
                    for is_wolf in is_werewolf]


# Assigning the number of werewolf parents (one draw per student, with
# probabilities that depend on the student's own werewolf status)
parents = [werewolf_parents(iw) for iw in is_werewolf]

# Detentions are associated with lower gpa: subtract 0.01 per detention
gpa -= 0.01 * detentions  # Element-wise subtraction
# Clip GPA to the valid range [0, 4.0]
gpa = np.minimum(gpa, 4.0)
gpa = np.maximum(gpa,0)

# Creating the DataFrame: one row per student, one column per attribute
df = pd.DataFrame({
    'Sex': students_sex,
    'Height': heights,
    'EyeColor': eye_colors_final,
    'FullMoonAbsence': np.maximum(full_moon_absence,0),  # no negative absence counts
    'GPA': gpa,
    'WerewolfParents': parents,
    'Detentions': detentions,
    'IsWerewolf': is_werewolf
})

# Round the numeric columns to 2 decimal places
df = df.round(2)


In [53]:
# Homeroom assignment function
def assign_homeroom(student_index, students_per_homeroom=30,
                    sections_per_year=4, first_year=10):
    """Return the homeroom label (e.g. ``"11-B"``) for a student's row index.

    Students are assigned in index order: consecutive groups of
    ``students_per_homeroom`` fill sections A, B, C, ... of a year, and once
    ``sections_per_year`` sections are full the next year begins.

    Args:
        student_index: Zero-based position of the student in the dataset.
        students_per_homeroom: Number of students in each homeroom section.
        sections_per_year: Number of lettered sections in each year
            (previously hard-coded to 4).
        first_year: Year number given to the first block of students
            (previously hard-coded to 10).

    Returns:
        A string of the form ``"<year>-<section letter>"``.

    Note:
        The year number is not capped. With the defaults each year holds 120
        students, so index 1499 is labelled ``"22-B"``.
    """
    students_per_year = students_per_homeroom * sections_per_year
    year_offset, position_in_year = divmod(student_index, students_per_year)
    # Section letters start at 'A' and advance once per full homeroom.
    section = chr(ord('A') + position_in_year // students_per_homeroom)
    return f"{first_year + year_offset}-{section}"

# Build one homeroom label per student, in row order (index 0 .. N-1)
homerooms = list(map(assign_homeroom, range(N)))

# Attach the labels to the DataFrame as the 'Homeroom' column
df['Homeroom'] = homerooms


In [54]:
# CSV file path (relative, so the file is written to the current working directory)
csv_file_path = 'high_school_werewolf_data.csv'

# Saving to CSV; index=False leaves the DataFrame's row index out of the file
df.to_csv(csv_file_path, index=False)

In [55]:
# Preview the first 20 rows of the generated dataset
df.head(20)

Unnamed: 0,Sex,Height,EyeColor,FullMoonAbsence,GPA,WerewolfParents,Detentions,IsWerewolf,Homeroom
0,Male,64.56,Brown,1,3.26,0,7,False,10-A
1,Female,62.65,Brown,3,2.27,0,3,False,10-A
2,Male,70.02,Grey,0,3.0,0,6,False,10-A
3,Male,67.75,Blue,1,3.31,0,1,False,10-A
4,Male,70.9,Blue,1,2.98,0,9,False,10-A
5,Female,57.11,Blue,0,2.43,0,0,False,10-A
6,Male,69.55,Green,2,3.98,0,2,False,10-A
7,Male,69.74,Brown,5,3.25,1,0,False,10-A
8,Male,67.62,Brown,0,3.9,0,2,False,10-A
9,Female,63.56,Blue,2,3.06,1,1,False,10-A


In [49]:
# Select only the students whose Homeroom label is "11-B" (boolean-mask filter)
my_class = df[df['Homeroom']=="11-B"]

In [51]:
# Display the 11-B homeroom subset
my_class

Unnamed: 0,Sex,Height,EyeColor,FullMoonAbsence,GPA,WerewolfParents,Detentions,IsWerewolf,Homeroom
150,Male,66.42,Brown,2,3.21,0,10,False,11-B
151,Male,67.85,Grey,0,3.04,0,1,False,11-B
152,Male,73.4,Yellow,2,3.08,0,1,True,11-B
153,Male,67.27,Grey,0,3.57,0,1,False,11-B
154,Male,71.51,Green,1,2.68,0,0,False,11-B
155,Female,69.32,Brown,3,4.0,0,0,False,11-B
156,Male,72.48,Green,4,2.01,2,28,True,11-B
157,Female,64.47,Blue,0,2.9,0,0,True,11-B
158,Male,61.59,Blue,3,3.39,0,0,False,11-B
159,Female,65.41,Brown,1,3.49,0,0,False,11-B


## Data Dictionary for High School Werewolf Dataset
This data dictionary explains the variables in the "High School Werewolf Dataset." The dataset simulates a high school environment with a twist: some students are werewolves. The dataset is designed for educational purposes, allowing students and teachers to explore statistical concepts in a fun, engaging manner.

-   **Sex:** A categorical variable indicating the gender of the student. Possible values are 'Male' and 'Female'.

-   **Height:** A continuous variable representing the student's height in inches. Heights follow a normal distribution. On average, male students are taller than female students, and werewolf students tend to be taller than their non-werewolf peers.

-   **EyeColor:** A categorical variable indicating the eye color of the student. Possible values are 'Brown', 'Blue', 'Green', 'Grey', and 'Yellow'. Yellow eyes are a unique trait found only among werewolves.

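-   **FullMoonAbsence:** A discrete variable representing the number of days the student was absent after a full moon. Values are drawn from a normal distribution, then truncated to whole numbers and floored at 0, so the recorded counts are only approximately normal. Werewolves tend to be absent about two more days than non-werewolves.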

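-   **GPA:** A continuous variable representing the student's Grade Point Average. Base values are drawn from a normal distribution (mean 3.2, standard deviation 0.5), reduced by 0.01 for each detention, and clipped to the range 0 to 4.0. GPA is generated the same way for werewolves and non-werewolves.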

-   **WerewolfParents:** A discrete variable indicating the number of the student's parents who are werewolves. Possible values are 0, 1, or 2, with different base probabilities for werewolf and non-werewolf students.

-   **Detentions:** A discrete variable indicating the number of times the student has been in detention. This follows a Pareto distribution, implying that most students have few detentions, but a few have many; values are capped at 180. There is no difference in this variable between werewolves and non-werewolves.

-   **IsWerewolf:** A binary variable indicating whether the student is a werewolf or not. Possible values are True (werewolf) or False (non-werewolf).

In [56]:
# Summarize the Sex column of the 11-B subset (count, unique values, most common value and its frequency)
my_class["Sex"].describe()

count       30
unique       2
top       Male
freq        18
Name: Sex, dtype: object