In [3]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

from faker import Faker
# Shared Faker instance; gen_name() and gen_student() further down read from it.
fake = Faker()

# Displays one generated full name (the recorded cell output follows).
fake.name()

'Matthew Brewer'

# Student Class

In [36]:
class Student:
    """Simulated student whose daily behaviour is sampled from fixed tables.

    At construction four traits (sleeper, eater, attender, tardyer) are each
    independently assigned a level 'l', 'm' or 'h' with probabilities
    .25 / .5 / .25. The level selects the probability vector used by the
    matching sampler method. Any level other than 'l' or 'm' is treated
    as 'h'.
    """

    # Values each sampler can return.
    _SLEEP_OPTIONS = [3, 4, 5, 6, 7]
    _EAT_OPTIONS = [1, 2]
    _ATTEND_OPTIONS = [True, False]
    _TARDY_OPTIONS = [False, True]

    # Probability vectors keyed by trait level, aligned with the options above.
    _SLEEP_WEIGHTS = {
        'l': [.1, .4, .2, .2, .1],
        'm': [.05, .35, .3, .25, .05],
        'h': [.02, .28, .3, .25, .15],
    }
    _EAT_WEIGHTS = {
        'l': [.4, .6],
        'm': [.7, .3],
        'h': [.85, .15],
    }
    _ATTEND_WEIGHTS = {
        'l': [.9, .1],
        'm': [.95, .05],
        'h': [.99, .01],
    }
    _TARDY_WEIGHTS = {
        'l': [.85, .15],
        'm': [.90, .1],
        'h': [.99, .01],
    }

    def __init__(self, id_seed):
        """Draw the four trait levels and a base score, then build the info record."""
        traits = np.random.choice(['l','m','h'], p = [.25, .5, .25], size = 4)
        self.sleeper, self.eater, self.attender, self.tardyer = traits
        self.base_score = np.random.uniform(55,70)
        self.info = gen_student(id_seed)

    @staticmethod
    def _draw(options, weights_by_level, level):
        """Sample one of `options` using the weights registered for `level`."""
        # Unknown levels fall back to the 'h' row, like a trailing `else`.
        weights = weights_by_level.get(level, weights_by_level['h'])
        return np.random.choice(options, p = weights)

    def sleep(self):
        """Return a sampled sleep answer (3-7) for one day."""
        return self._draw(self._SLEEP_OPTIONS, self._SLEEP_WEIGHTS, self.sleeper)

    def eat(self):
        """Return a sampled eating answer (1 or 2) for one day."""
        return self._draw(self._EAT_OPTIONS, self._EAT_WEIGHTS, self.eater)

    def attend(self):
        """Return True when the student attends on a given day."""
        return self._draw(self._ATTEND_OPTIONS, self._ATTEND_WEIGHTS, self.attender)

    def tardy(self):
        """Return True when the student is tardy on a given day."""
        return self._draw(self._TARDY_OPTIONS, self._TARDY_WEIGHTS, self.tardyer)

# Convenience Functions

In [368]:
def gen_name():
    """Return a dict holding a generated 'first_name' and 'last_name'."""
    first = fake.first_name()
    last = fake.last_name()
    return {'first_name': first, 'last_name': last}

def gen_teachers(n):
    """Build `n` teacher records with ids 1..n.

    Each record holds a generated name, an email of the form
    teacher<id>@school.com, a fixed placeholder password, the id and role 2.
    """
    teachers = []
    for teacher_id in range(1, n + 1):
        record = dict(gen_name())
        record.update(
            email=f'teacher{teacher_id}@school.com',
            password='password',
            id=teacher_id,
            role=2,
        )
        teachers.append(record)
    return teachers

def gen_student(id_seed):
    """Build one student record keyed off `id_seed`.

    The record contains a generated name, a student_id of 100000 + id_seed,
    the first 21 characters of a generated SHA-1 hex string as google_id,
    a grade level drawn uniformly from 9-12, and id_seed itself as id.
    """
    record = gen_name()
    record['student_id'] = 100000 + id_seed
    record['google_id'] = fake.sha1()[:21]
    record['grade_level'] = np.random.choice(range(9,13))
    record['id'] = id_seed
    return record

def gen_courses(n, period=1, course_id_start = 1):
    """Generate `n` courses for one period plus their teacher assignments.

    Course names are sampled without replacement from every
    "<level> <subject> <number>" combination, and teacher ids 1..n are
    shuffled so each course gets a distinct teacher.

    Returns a dict with two lists:
      'courses'         - dicts with id, name, period, start_time, end_time
      'teacher_courses' - dicts with teacher_id, course_id
    Course ids run consecutively from `course_id_start`.
    """
    # Shuffle the teachers before anything else is drawn from the RNG.
    shuffled_teachers = np.random.permutation(range(1,n+1))

    levels = ["Beginner", "Intermediate", "Advanced", "Honors", "Remedial"]
    subjects = ["English", "Algebra", "Chemistry", "Biology", "Sociology","Philosophy",
                "Literature", "Spanish","French", "Chinese", "German", "Swahili", 
                "Physics", "Geometry", "Calculus","Japanese", "Italian", "Neuroscience",
                "Linear Algebra", "Differential Equations", "History", "Art History", 
                "Anatomy", "Economics", "Derivatives"]
    sequence = ["1","2", "3","4", "5"]

    # Every level/subject/number combination, levels varying slowest.
    catalogue = [
        f'{level} {subject} {number}'
        for level in levels
        for subject in subjects
        for number in sequence
    ]

    # A period lasts one hour; period 1 starts at 8:00.
    start_time = f'{period+7}:00:00'
    end_time = f'{period+8}:00:00'

    picked_names = np.random.choice(catalogue, replace = False, size = n)

    courses = []
    teacher_courses = []
    for offset, (name, teacher_id) in enumerate(zip(picked_names, shuffled_teachers)):
        course_id = offset + course_id_start
        courses.append({
            'id': course_id,
            'name': name,
            'period': period,
            'start_time': start_time,
            'end_time': end_time,
        })
        teacher_courses.append({'teacher_id': teacher_id, 'course_id': course_id})

    return {'courses': courses, 'teacher_courses': teacher_courses}

# Create Data for Fitting Model

In [372]:
%%time
# ---------------------------------------------------------------------------
# Simulate attendance, survey answers and quiz scores for every student.
#
# Output (module-level lists of plain dicts, turned into DataFrames below):
#   attendances     - one row per attended student-day ('present' / 'tardy')
#   surveys         - food (question 1) and sleep (question 2) answers per
#                     attended day, plus one quiz score (question 3) per test
#   students        - the Student objects themselves
#   student_courses - student/course enrolment rows
# ---------------------------------------------------------------------------

# Seed numpy's global RNG so a Restart & Run All regenerates the same data.
# Names produced through Faker are not covered by this seed.
random_seed = 2019
np.random.seed(random_seed)

number_of_students = 6000
number_of_tests = 5
number_of_teachers = 300
number_of_periods = 5
students_per_course = 20   # number_of_students / students_per_course == number_of_teachers
days_per_test = 10         # a quiz is recorded on the last day of each block
start_day = datetime(2019,1,1,2)
day = timedelta(days = 1)
attendances = []
surveys = []
students = []
student_courses = []
survey_defaults = {'choice_type':'id', 'correct': 'nil', 'text_answer': 'nil'}
grade_defaults = {'choice_type': 'text', 'correct': 'nil', 'choice_id': 'nil'}

# The timestamps depend only on the day index, so format them once instead of
# once per student.
day_stamps = []
for d in range(number_of_tests * days_per_test):
    date = (start_day + (d * day)).strftime("%Y-%m-%d %H:%M:%S")
    day_stamps.append({'created_at': date, 'updated_at': date})

for s in range(1, number_of_students + 1):
    if s % 1000 == 0:
        print(s)
    s_id = s
    # Student ids are 1-based: subtract 1 before dividing so every course gets
    # exactly `students_per_course` students and ids stay within
    # 1..number_of_students/students_per_course. (`s // 20 + 1` put only 19
    # students in course 1 and created an extra course for the last student.)
    c_id = ((s - 1) // students_per_course) + 1

    stu_defaults = {
        'course_id': c_id,
        'student_id': s_id
    }

    student_courses.append(stu_defaults)

    stu = Student(s)
    students.append(stu)

    # Answers collected since the previous quiz.
    for_test = {'sleep':[], 'eat':[]}

    for d, stamps in enumerate(day_stamps):

        if stu.attend():

            attendance = 'tardy' if stu.tardy() else 'present'
            attendances.append({
                **stu_defaults,
                **stamps,
                'attendance': attendance
            })
            ate = stu.eat()
            slept = stu.sleep()

            food = {
                **stu_defaults,
                'question_id': 1,
                'choice_id': ate,
                **stamps,
                **survey_defaults
            }

            sleep = {
                **stu_defaults,
                'question_id': 2,
                'choice_id': slept,
                **stamps,
                **survey_defaults}

            surveys.extend([food,sleep])
            for_test['sleep'].append(slept)
            for_test['eat'].append(ate)

        # Last day of a block: record the quiz score. The score is written
        # whether or not the student attended that day.
        if d % days_per_test == days_per_test - 1:
            # Eat answers are 1 or 2 and sleep answers are at most 7, so over a
            # fully attended block the sums are bounded by 2*days_per_test and
            # 7*days_per_test; each term then contributes at most 50 points.
            score = (abs(sum(for_test['eat']) - 2 * days_per_test) * 5) + \
                ((sum(for_test['sleep']) / (7 * days_per_test)) * 50) + \
                np.random.normal(scale = 4)

            test = {**stu_defaults, 'question_id': 3, 'text_answer': score, **grade_defaults, **stamps}
            surveys.append(test)
            for_test = {'sleep':[], 'eat':[]}

teachers = gen_teachers(number_of_teachers)
courses = {}
for i in range(1, number_of_periods + 1):
    print('per' + str(i))
    period_key = 'period ' + str(i)
    # Course ids are 1-based and contiguous across periods, matching the
    # numbering used by the seeding cell.
    courses[period_key] = gen_courses(number_of_teachers, i, (i-1)*number_of_teachers + 1)
    period_courses = courses[period_key]['courses']
    # Draw one course index per student in a single call rather than converting
    # the list of course dicts to an array once per student.
    picks = np.random.choice(len(period_courses), size = len(students))
    for enrolled, pick in zip(students, picks):
        student_courses.append({'course_id': period_courses[pick]['id'], 'student_id': enrolled.info['id']})

def flat_list(lst):
    """Flatten one level of nesting: return the items of every iterable in `lst`, in order."""
    return [item for inner in lst for item in inner]
            
# Gather the per-period course and teacher-course records into flat lists.
course_inter = [period_data['courses'] for period_data in courses.values()]
tc_inter = [period_data['teacher_courses'] for period_data in courses.values()]
teacher_course_info = flat_list(tc_inter)
course_info = flat_list(course_inter)

# One DataFrame per table.
c_df = pd.DataFrame(course_info)
tc_df = pd.DataFrame(teacher_course_info)
sc_df = pd.DataFrame(student_courses)
t_df = pd.DataFrame(teachers)
s_df = pd.DataFrame([student.info for student in students])
a_df = pd.DataFrame(attendances)
sur_df = pd.DataFrame(surveys)


# Parse the timestamp columns (names ending in '_at') into datetimes.
timestamp_columns = [name for name in sur_df.columns if name.endswith('_at')]
for column in timestamp_columns:
    sur_df[column] = pd.to_datetime(sur_df[column])
    
def to_proportions(df, question_ids = [1,2], to_drop = [2,3]):
    """Turn survey answers into per-student answer proportions.

    For each question in `question_ids`, counts how often every student chose
    each `choice_id` and divides by that student's number of answers to the
    question, giving one column per choice. The per-question tables are joined
    side by side on `student_id`, and the columns listed in `to_drop` are
    removed.

    Parameters
    ----------
    df : DataFrame with 'student_id', 'question_id' and 'choice_id' columns.
    question_ids : question ids to summarise (not mutated).
    to_drop : choice-id column labels to remove from the result (not mutated).

    Returns
    -------
    DataFrame indexed by student_id with one float column per remaining
    choice id. A choice a student never picked is 0; a student with no
    answers at all for one of the questions gets NaN in that question's
    columns.

    Raises
    ------
    KeyError if a label in `to_drop` is not among the resulting columns.
    """
    dfs = []
    for q_id in question_ids:
        answers = df.loc[df['question_id'] == q_id]

        # Rows = students, columns = choice ids, cells = answer counts.
        # unstack() labels the columns from the group keys directly, so no
        # manual relabelling of a MultiIndex is needed.
        counts = answers.groupby(['student_id', 'choice_id']).size().unstack(fill_value = 0)

        # Normalise each student's row by that student's total answers.
        proportions = counts.div(counts.sum(axis = 1), axis = 0)

        dfs.append(proportions)
    return pd.concat(dfs, axis = 1).drop(to_drop, axis = 1)

    
# Window boundaries: every timestamp that carries a quiz (question 3), plus
# the earliest survey timestamp as the opening edge of the first window.
quiz_days = list(sur_df[sur_df.question_id == 3].updated_at.value_counts().index)
quiz_days.append(sur_df.updated_at.min())
quiz_days = sorted(quiz_days)
weeks = []
for window_number, (start, end) in enumerate(zip(quiz_days, quiz_days[1:])):
    # Each window ends on a quiz timestamp (inclusive). Every window after the
    # first starts on the previous quiz timestamp, which must be excluded.
    # Otherwise that day's food and sleep answers are counted in two windows,
    # and max() below picks the larger of two different quiz scores.
    if window_number == 0:
        after_start = sur_df.updated_at >= start
    else:
        after_start = sur_df.updated_at > start
    idf = sur_df[after_start & (sur_df.updated_at <= end)]

    props = to_proportions(idf)
    scores = idf[idf['question_id'] == 3].groupby('student_id')['text_answer'].max()
    weeks.append(pd.concat([scores,props], axis = 1))

for_analysis = pd.concat(weeks, axis = 0).rename(columns = {'text_answer': 'score'})
# 'text_answer' also holds the string 'nil' on non-quiz rows, which makes the
# column object-typed; the quiz rows selected above are numeric.
for_analysis['score'] = for_analysis['score'].astype(float)
# for_analysis.to_csv('data_for_fit.csv')

1000
2000
3000
4000
5000
6000
per1
per2
per3
per4
per5
CPU times: user 37 s, sys: 1.69 s, total: 38.7 s
Wall time: 40.7 s


# Create Data for Seeding

In [373]:
# ---------------------------------------------------------------------------
# Build a small data set (first 50 students, 10 courses per period, 9 days)
# and collect it into the DataFrames written to CSV in the next cell.
# ---------------------------------------------------------------------------
students_for_seeds = students[:50]
seed_courses_per_period = 10   # also the number of seed teachers
seed_days = 9
course_info = []
teacher_course_info = []
student_course_info = []
for i in range(1,6):
    period_courses = gen_courses(seed_courses_per_period, i, (i-1)*seed_courses_per_period + 1)
    course_info.extend(period_courses['courses'])
    teacher_course_info.extend(period_courses['teacher_courses'])

    # Every student takes one randomly chosen course in each period.
    for s in students_for_seeds:
        student_course_info.append({'course_id': np.random.choice(period_courses['courses'])['id'], 'student_id':s.info['id']})
teacher_info = gen_teachers(seed_courses_per_period)
attendances = []
surveys = []
survey_defaults = {'choice_type':'id', 'correct': 'nil', 'text_answer': 'nil'}
df = pd.DataFrame(student_course_info)

# A student's course list does not change from day to day, so look it up once
# instead of filtering the DataFrame for every (day, student) pair.
courses_by_student = {
    student_id: list(course_ids)
    for student_id, course_ids in df.groupby('student_id')['course_id']
}

for d in range(seed_days):
    for stu in students_for_seeds:
        for course_id in courses_by_student[stu.info['id']]:
            enrolment = {'student_id': stu.info['id'], 'course_id': course_id}

            if stu.attend():

                attendance = 'tardy' if stu.tardy() else 'present'
                attendances.append({
                    **enrolment,
                    'days_ago': d,
                    'attendance': attendance
                })

                ate = stu.eat()
                slept = stu.sleep()

                food = {
                    **enrolment,
                    'question_id': 1,
                    'choice_id': ate,
                    'days_ago': d,
                    **survey_defaults
                }

                sleep = {
                    **enrolment,
                    'question_id': 2,
                    'choice_id': slept,
                    'days_ago': d,
                    **survey_defaults}

                surveys.extend([food,sleep])
            else:
                # Absences are recorded too, but carry no survey answers.
                attendances.append({
                    **enrolment,
                    'days_ago': d,
                    'attendance': 'absent'
                })

c_df = pd.DataFrame(course_info)
tc_df = pd.DataFrame(teacher_course_info)
sc_df = pd.DataFrame(student_course_info)
t_df = pd.DataFrame(teacher_info)
s_df = pd.DataFrame([s.info for s in students_for_seeds])
a_df = pd.DataFrame(attendances)
sur_df = pd.DataFrame(surveys)

In [374]:
# Save as .csv
# Output file name -> DataFrame; written in this order, without the index.
seed_tables = {
    'courses.csv': c_df,
    'teacher_courses.csv': tc_df,
    'student_courses.csv': sc_df,
    'teachers.csv': t_df,
    'students.csv': s_df,
    'attendances.csv': a_df,
    'responses.csv': sur_df,
}
for csv_name, table in seed_tables.items():
    table.to_csv(csv_name, index = False)


# Model fitting nonsense


In [None]:
# sklearn Analysis Suite
from sklearn import model_selection
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error

from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

# Features are every column except the quiz score. 'score' is a column, so it
# has to be dropped with `columns=`; a bare drop('score') looks for a row
# labelled 'score' and raises KeyError.
X = for_analysis.drop(columns='score')
y = for_analysis['score'].astype(float)

test_size = 0.20
seed = 7

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, 
                                                                    test_size=test_size, 
                                                                    random_state=seed)
# max_depth options: 11 values evenly spaced from 1 to 20 (truncated to int),
# plus None
max_depth = [int(x) for x in np.linspace(1, 20, num = 11)]
max_depth.append(None)

tree_params = {
               'max_depth': max_depth,
               # 10 log-spaced values from 10 to 100000 trees
               'n_estimators': np.logspace(1, 5, num=10, dtype='int')
               }

# NOTE(review): newer scikit-learn releases renamed AdaBoost's
# `base_estimator` parameter to `estimator` - confirm against the installed
# version before running.
adaboost_params = {
                'base_estimator': [DecisionTreeRegressor(max_depth=3), DecisionTreeRegressor(max_depth=4), DecisionTreeRegressor(max_depth=5)],
                'learning_rate' : [0.01,0.05,0.1,0.3,1,2],
                'loss' : ['linear', 'square'],
                'n_estimators': np.logspace(1, 4, num=10, dtype='int')
                }

# NOTE(review): newer scikit-learn releases renamed the 'ls' / 'lad' losses to
# 'squared_error' / 'absolute_error' - confirm against the installed version.
gboost_params = {
                'learning_rate' : [0.01,0.05,0.1,0.3,1,2],
                'n_estimators': np.logspace(1, 4, num=10, dtype='int'),
                'loss' : ['ls', 'lad']  
                }

# (short name, estimator, search space) for every model to tune.
models = []
models.append(('ABR', AdaBoostRegressor(), adaboost_params))
models.append(('RFR', RandomForestRegressor(), tree_params))
models.append(('GBR', GradientBoostingRegressor(), gboost_params))
models.append(('ETR', ExtraTreesRegressor(), tree_params))

# Evaluate each model and pick most optimized model with associated hyper parameters
results = []
names = []
best_estimators = []   # was misspelled `besat_estimators`, so the append below raised NameError
all_best_params = []
all_cv_results = []
all_model_results = []

for name, model, params in models:
    print(name)
    # Randomized search with 5-fold cross-validation over the model's space.
    rs_cv = model_selection.RandomizedSearchCV(estimator=model,
                                               param_distributions=params,
                                               cv=5,
                                               random_state=seed)

    model_result = rs_cv.fit(X_train, y_train)
    best_estimator = model_result.best_estimator_
    score = model_result.best_score_
    best_params = model_result.best_params_
    cv_results = model_result.cv_results_
    
    # Keep everything so the searches can be compared afterwards; the lists
    # stay aligned with `names`.
    all_model_results.append(model_result)
    all_cv_results.append(cv_results)
    all_best_params.append(best_params)
    best_estimators.append(best_estimator)
    results.append(score)
    names.append(name)
    msg = "%s: %f" % (name, score)
    print(msg)