In [2]:
import json
import numpy as np
from scipy.special import softmax

In [3]:
with open("../new_planned_courses.json") as f:
    planned_courses = json.load(f)

with open("../metrics.json") as f:
    metrics = json.load(f)

In [4]:
years = ["Freshman", "Sophomore", "Junior", "Senior", "5+ Year Undergrad", "Coterm", "Professional Degree", "Master's", "PhD/Doctoral", "SCPD/Nonmatriculated", "Other"]
year_to_index = {years[i]: i for i in range(len(years))}
nextYear = {
    "Freshman": "Sophomore",
    "Sophomore": "Junior",
    "Junior": "Senior",
    "Senior": "Coterm",
    "5+ Year Undergrad": "5+ Year Undergrad",
    "Coterm": "Coterm",
    "Professional Degree": "Professional Degree",
    "Master's": "Master's",
    "PhD/Doctoral": "PhD/Doctoral",
    "SCPD/Nonmatriculated": "SCPD/Nonmatriculated",
    "Other": "Other"
}
index_to_index = {year_to_index[key]: year_to_index[nextYear[key]] for key in nextYear}

In [5]:
# convert raw metrics to probabilities, with some buffer
BUFFER = 1
year_probs = {}

for course in metrics:
    if metrics[course] is not None and 'years' in metrics[course]:
        total = sum(metrics[course]['years']) + BUFFER * len(years)
        year_probs[course] = np.array([((num + BUFFER) / total) for num in metrics[course]['years']])
    else: # assume equally likely if no data is given
        year_probs[course] = np.ones(len(years)) / len(years)

In [6]:
CURRENT_YEAR = 2023

def class_probability(person):
    p = np.ones(len(years))
    mappings = {j : j for j in index_to_index}
    for year in range(CURRENT_YEAR, CURRENT_YEAR - 6, -1):
        for quarter in range(4):
            p_quarter = np.ones(len(years))
            code = str(year) + "-" + str(quarter)
            if code not in person: continue
            
            for course in person[code]:
                min_p_course = min(year_probs[course])
                p_course = np.ones(len(years)) * min_p_course
                for i in range(len(years)):
                    p_course[mappings[i]] += year_probs[course][i]
                p_quarter *= p_course
            
            p_quarter /= np.sum(p_quarter)
            p *= p_quarter
        mappings = {j : index_to_index[mappings[j]] for j in mappings}
    p /= np.sum(p)
    return p

In [7]:
for user in planned_courses:
    if planned_courses[user] is not None:
        if '2023-0' in planned_courses[user]:
            classes = ['CS229', 'CS265', 'MATH155', 'MATH193']
            stuff = [course in planned_courses[user]['2023-0'] for course in classes]
            if all(stuff):
                print(user)

1335


In [8]:
user = "1100"
np.set_printoptions(suppress=True)
print(class_probability(planned_courses[user]))
planned_courses[user]

[0.         0.99866209 0.00133791 0.         0.         0.
 0.         0.         0.         0.         0.        ]


{'2022-0': ['COLLEGE101', 'CS106B', 'CS529', 'MATH51', 'STS10SI'],
 '2022-1': ['CME510',
  'CS103',
  'CS106L',
  'CS529',
  'DATASCI112',
  'EE292T',
  'PHYSICS41',
  'PWR1OS'],
 '2022-2': ['COLLEGE112', 'CS109', 'CS161', 'CS198', 'STATS100'],
 '2023-0': ['CEE252',
  'CS107',
  'CS221',
  'FRENLANG1',
  'PHYSWELL36',
  'RESTRICTED'],
 '2023-1': ['CS129', 'CS224N', 'CS246', 'MATH104', 'PWR2STA', 'STATS200'],
 '2023-2': ['CS227B',
  'CS41',
  'EE104',
  'ENGR76',
  'FINANCE637',
  'LAW1040',
  'MATH104',
  'MATH21',
  'STATS191',
  'STATS203',
  'STATS217'],
 '2024-0': ['CS148', 'CS229', 'INTLPOL268'],
 '2024-1': ['ECON1']}

In [9]:
year_preds = {}
# year_preds["years"] = years
for user in planned_courses:
    year_preds[user] = list(class_probability(planned_courses[user]))

In [35]:
counts = [0 for _ in range(len(years))]
unconfident = []
for user in year_preds:
    if len(planned_courses[user]) == 0: continue
    counts[np.argmax(year_preds[user])] += 1
    if np.max(year_preds[user]) < 0.5:
        unconfident.append(user)

print("total users:", sum(counts))
print()

print("based on highest likelihood:")
for i in range(len(years)):
    print(str(years[i]) + ":", counts[i])
print()
print("unconfident:", len(unconfident))
print()
print()

print("based on sampling of probability distribution:")
rng = np.random.default_rng(0)
counts = [0 for _ in range(len(years))]
for user in year_preds:
    if len(planned_courses[user]) == 0: continue
    sampled_year = rng.choice(np.arange(len(years)), p=year_preds[user])
    counts[sampled_year] += 1
for i in range(len(years)):
    print(str(years[i]) + ":", counts[i])
print()

total users: 2173

based on highest likelihood:
Freshman: 830
Sophomore: 664
Junior: 392
Senior: 166
5+ Year Undergrad: 0
Coterm: 19
Professional Degree: 2
Master's: 80
PhD/Doctoral: 18
SCPD/Nonmatriculated: 1
Other: 1

unconfident: 255


based on sampling of probability distribution:
Freshman: 784
Sophomore: 666
Junior: 405
Senior: 174
5+ Year Undergrad: 8
Coterm: 24
Professional Degree: 6
Master's: 66
PhD/Doctoral: 22
SCPD/Nonmatriculated: 13
Other: 5



In [12]:
avg_classes = 0
for user in unconfident:
    avg_classes += len(planned_courses[user])
avg_classes /= len(unconfident)
print("average number of classes pinned for unconfident users:")
print(avg_classes)

average number of classes pinned for unconfident users:
1.1568627450980393


In [166]:
with open("year_predictions.json", "w") as f:
    json.dump(year_preds, f)