In [None]:
import uuid

import numpy as np
import pandas as pd
import scipy.stats as stats

In [None]:
class Sample():
    """A randomly generated sample of school classes.

    On construction, ``size`` ``Class`` objects are generated eagerly. The
    flattened per-student records (``data``) and the corresponding
    ``pandas.DataFrame`` (``df``) are built lazily on first access and cached.

    Parameters
    ----------
    size : int
        Number of classes to generate.
    """

    def __init__(self, size):
        self._size = size
        self._classes = self._gen_classes()
        # Lazily populated caches; None means "not computed yet".
        self._data = None
        self._df = None

    def __repr__(self):
        return f"Sample with size {self._size}"

    @property
    def size(self):
        """Number of classes requested for this sample."""
        return self._size

    @property
    def classes(self):
        """List of the generated class objects."""
        return self._classes

    @property
    def data(self):
        """Flat list of student records taken from every class, in class order."""
        # Compare against None rather than relying on truthiness: an empty
        # sample yields an empty list, which must still count as "cached".
        if self._data is None:
            self._data = [record for c in self.classes for record in c.data]
        return self._data

    @property
    def df(self):
        """DataFrame built from ``data`` with ``sex`` stored as a categorical column."""
        if self._df is None:
            frame = pd.DataFrame(self.data)
            # A sample without any records produces a frame with no columns,
            # so only cast the column when it actually exists.
            if "sex" in frame.columns:
                frame["sex"] = frame["sex"].astype('category')
            self._df = frame
        return self._df

    def _gen_class(self):
        """Create a single class object."""
        return Class()

    def _gen_classes(self):
        """Create ``size`` class objects."""
        return [self._gen_class() for _ in range(self._size)]

In [None]:
class Class():
    """A randomly generated school class together with its students.

    Each instance gets a short random id, a random number of students, a
    grade (8 or 9), and one ``Student`` per seat, all created in ``__init__``.
    """

    def __init__(self):
        # NOTE: only 6 hex characters (24 bits) of the UUID are kept, so ids
        # are short and readable but not guaranteed to be unique.
        self._id = "c_" + uuid.uuid4().hex[:6]
        self._n_students = self._gen_n_students()
        self._grade = self._gen_grade()
        # Students need the grade and class id, so they are generated last.
        self._students = self._gen_students()
        # Lazily populated cache; None means "not computed yet".
        self._data = None

    def __repr__(self):
        return f"Class {self._id}: {self.grade} grade with {self.n_students} students"

    @property
    def n_students(self):
        """Number of students in the class."""
        return self._n_students

    @property
    def grade(self):
        """Grade of the class (8 or 9)."""
        return self._grade

    @property
    def students(self):
        """List of the student objects belonging to this class."""
        return self._students

    @property
    def data(self):
        """List with one record (``student.data``) per student, built once and cached."""
        # Explicit None check so that an empty list also counts as cached,
        # consistent with how Sample.df tracks its cache.
        if self._data is None:
            self._data = [s.data for s in self.students]
        return self._data

    def _gen_n_students(self):
        """Draw the class size from a normal distribution truncated to [20, 34]."""
        lo, hi = 20, 34
        m, sd = 28.7, 3.4
        # truncnorm takes its bounds in units of standard deviations from loc.
        a, b = (lo - m) / sd, (hi - m) / sd
        return int(round(stats.truncnorm.rvs(a, b, loc=m, scale=sd)))

    def _gen_grade(self):
        """Pick the grade at random from 8 and 9."""
        return np.random.choice([8, 9])

    def _gen_student(self):
        """Create one student attached to this class."""
        return Student(self.grade, self._id)

    def _gen_students(self):
        """Create ``n_students`` students."""
        return [self._gen_student() for _ in range(self.n_students)]

In [None]:
class Student():
    """A randomly generated student.

    Parameters
    ----------
    grade : int
        Grade the student attends; must be 8 or 9.
    class_id : str
        Identifier of the class the student belongs to.

    Raises
    ------
    ValueError
        If ``grade`` is neither 8 nor 9.
    """

    def __init__(self, grade, class_id):
        # NOTE: only 6 hex characters (24 bits) of the UUID are kept, so ids
        # are short and readable but not guaranteed to be unique.
        self._id = "s_" + uuid.uuid4().hex[:6]
        self._grade = grade
        self._class_id = class_id
        self._sex = self._gen_sex()
        self._age = self._gen_age(grade)

    def __repr__(self):
        return f"Student {self._id}: {self.age:.1f}y/{self.sex}, {self.grade} grade"

    @property
    def grade(self):
        """Grade the student attends."""
        return self._grade

    @property
    def sex(self):
        """Either 'male' or 'female'."""
        return self._sex

    @property
    def age(self):
        """Age in years (fractional)."""
        return self._age

    @property
    def data(self):
        """Record of the student's attributes as a dict (one row of the data frame)."""
        return dict(uid=self._id, class_id=self._class_id, grade=self.grade, sex=self.sex, age=self.age)

    def _gen_sex(self):
        """Pick the sex at random from 'male' and 'female'."""
        return np.random.choice(['male', 'female'])

    def _gen_age(self, grade):
        """Draw an age from a normal distribution whose mean depends on the grade.

        Grade 8 uses mean 13.5, grade 9 uses mean 14.5; both use a standard
        deviation of 0.5. Any other grade raises ``ValueError``.
        """
        if grade == 8:
            m, sd = 13.5, 0.5
        elif grade == 9:
            m, sd = 14.5, 0.5
        else:
            # Raising a plain string is itself a TypeError in Python 3; raise
            # a real exception that tells the caller what went wrong.
            raise ValueError(f"Unsupported grade {grade!r}: expected 8 or 9")

        return np.random.normal(loc=m, scale=sd)

In [None]:
%%time
# Build a sample of 1000 classes; all classes and their students are generated
# eagerly in the constructors, and %%time reports how long that takes.
s = Sample(1000)

In [None]:
# First access to s.df builds (and caches) the per-student frame; preview its first rows.
s.df.head()

In [None]:
# Inspect the column dtypes; "sex" is cast to a categorical inside Sample.df.
s.df.dtypes

In [None]:
# Summary statistics for every column, non-numeric ones included.
s.df.describe(include="all")

In [None]:
# Per-grade mean of the numeric columns. numeric_only=True is needed because the
# frame also holds string/categorical columns (uid, class_id, sex), which cannot
# be averaged and make a plain .mean() raise on pandas >= 2.0.
s.df.groupby("grade").mean(numeric_only=True)

In [None]:
# Display the cached per-student frame itself.
s.df