In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [2]:
import random
import string

def get_random_string(prefix, length):
    letters = string.ascii_lowercase
    result_str = ''.join(random.choice(letters) for i in range(length))
    return prefix + "-" + result_str

# Simulating school data

The idea is that we want to be able to identify statistical anomalies in high school to make interventions and improve student outcomes. Possible anomalies are 
- individual student performance changes
- teacher's performance deviating from average
- curriculum being taught poorly

## Available data
The available data looks like
- individual student
 - list of classes
 - list of test scores in those classes
- individual teacher
 - list of classes
- class
 - list of students in that class
 - teacher who teaches the class
 - time of the class
 - scores of each student in that class
- score
 - class (uid)
 - score (int)
 - student (uid)

In [3]:
ASSIGNMENT_TYPES = ['homework', 'test']

In [4]:
# create fake student
class Student:
    def __init__(self, name = ''):
        self.name = name
        self.id = get_random_string('s', 6)
        self.assignments = defaultdict(list)
        self.courses = {}
        self.grades = {}
    
    def show(self):
        print("--- {} ---".format(self.id))
        print("assignments: {}".format([self.assignments]))
        print("courses: {}".format([ courseId for (courseId, c) in self.courses.items() ]))
        print("\n")
        
    def addCourse(self, course):
        self.courses[course.id] = course
        
    def addAssignment(self, courseId, assignment, grade):
        self.assignments[courseId] += [(assignment, grade)]
        
    def getCumulativeGrades(self):
        for (courseId, course) in self.courses.items():
            rubric = course.rubric
            assignments = self.assignments[courseId]
            
            # generalize
            totalGrade = 0
            
            for assignmentType in ASSIGNMENT_TYPES:
                assigmentList = list(filter(lambda a: a[0].assignmentType == assignmentType, assignments))
                assignmentWeight = np.mean([a[1] for a in assigmentList]) * rubric[assignmentType]
                totalGrade += assignmentWeight
                
            self.grades[courseId] = totalGrade
            
        return self.grades

In [5]:
class Assignment:
    def __init__(self, assignmentType, time):
        self.id = get_random_string('a', 5)
        self.assignmentType = assignmentType
        self.time = time

In [6]:
# creating a fake course
class Course:
    def __init__(self, name = '', slot = 0 ):
        self.name = name
        self.slot = slot
        self.id = get_random_string('c', 5)
        self.assignments = []
        self.teacher = None
        self.students = []
        self.rubric = dict(zip(ASSIGNMENT_TYPES, np.random.dirichlet((1, 1), 1).tolist()[0]))
        
    def setStudents(self, students):
        self.students = students
        for student in students:
            student.addCourse(self)
            
    def setTeacher(self, teacher):
        self.teacher = teacher
        teacher.addCourse(self)
        
    def createAssignment(self, assignmentType, time):
        assignment = Assignment(assignmentType, time)
        self.assignments += [assignment]
        for student in self.students:
            grade = random.random()
            student.addAssignment(self.id, assignment, grade)
    
    def show(self):
        print("--- {} ---".format(self.id))
        print("teacher: {}".format(self.teacher.id))
        print("students: {}".format([s.id for s in self.students]))
        print("assignments: {}".format([a[0] for a in self.assignments]))
        print("rubric: {}".format(self.rubric))
        print("\n")

In [7]:
class Teacher:
    def __init__(self, name = ''):
        self.name = name
        self.id = get_random_string('t', 5)
        self.courses = []
        
    def addCourse(self, course):
        self.courses += [course]
        
    def show(self):
        print("--- {} ---".format(self.id))
        print("courses: {}".format(self.courses))
        print("total number of students: {}".format(sum([len(course.students) for course in self.courses])))
        print("\n")

In [8]:
students = [Student() for i in range(10)]

In [9]:
history = Course('history')
history.setStudents(students)

In [10]:
students[0].show()

--- s-jdglgz ---
assignments: [defaultdict(<class 'list'>, {})]
courses: ['c-ooont']




In [11]:
history.createAssignment('test', 100)

In [12]:
students[0].show()

--- s-jdglgz ---
assignments: [defaultdict(<class 'list'>, {'c-ooont': [(<__main__.Assignment object at 0x1242d1240>, 0.8457544866431838)]})]
courses: ['c-ooont']




## Scaling out

In [13]:
# instantiate all the courses and students at 
# the beginning of the school year (t = 0)

students = [Student() for i in range(100)] # school has 100 students
teachers = [Teacher() for i in range(10)] # school has 10 teachers

courses = {}
for teacher in teachers:
    # get number of courses taught by a teacher
    numberOfCourses = random.randint(1, 5)
    
    for i in range(numberOfCourses):
        # get a random number of students in a course
        numberOfStudentsInCourse = random.randint(10, 25)
        # sample students
        studentsInCourse = random.sample(students, numberOfStudentsInCourse)

        # create the course
        c = Course()
        c.setTeacher(teacher)
        c.setStudents(studentsInCourse)
        
        courses[c.id] = c

In [14]:
students[0].show()
teachers[0].show()
courses[list(courses.keys())[0]].show()

--- s-tdgowx ---
assignments: [defaultdict(<class 'list'>, {})]
courses: ['c-swcew', 'c-abicz', 'c-fqdgc', 'c-wuzer', 'c-qnrnd', 'c-pkjop', 'c-fvsnx']


--- t-gpowi ---
courses: [<__main__.Course object at 0x1181e06a0>, <__main__.Course object at 0x124303ac8>, <__main__.Course object at 0x124303e80>, <__main__.Course object at 0x124303ef0>]
total number of students: 67


--- c-swcew ---
teacher: t-gpowi
students: ['s-omhtld', 's-oenmgh', 's-gjcsyg', 's-lvtwvo', 's-sacijk', 's-jsngky', 's-usdjic', 's-ywsdul', 's-hzqlhk', 's-cnqrco', 's-wjavlc', 's-cqylvd', 's-syikmr', 's-yxzpqy', 's-mcbxoc', 's-tdgowx', 's-dnbbyo', 's-lhbtyy', 's-njabwh', 's-hotxzu', 's-qgyhvz']
assignments: []
rubric: {'homework': 0.24969950900119636, 'test': 0.7503004909988037}




In [15]:
for t in range(100):
    for (courseId, course) in courses.items():
        if random.randint(0, 1): # randomly determine whether to assign something
            assignmentType = random.sample(ASSIGNMENT_TYPES, 1).pop() # determine if it's a test or homework
            course.createAssignment(assignmentType, t)

In [16]:
students[0].getCumulativeGrades()

{'c-swcew': 0.5288150218078249,
 'c-abicz': 0.46505356477848125,
 'c-fqdgc': 0.46715625549964457,
 'c-wuzer': 0.49960536061481714,
 'c-qnrnd': 0.5046756516836763,
 'c-pkjop': 0.5358740623004308,
 'c-fvsnx': 0.47819355339570946}