See link https://www.bgsu.edu/graduate/catalogs-and-policies/graduate-catalog/data-science.html

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Requirements: load the course list and keep only the columns at positions 1-3.
# NOTE(review): presumably these are Code, Name and Credits, since later cells
# refer to columns with those names — confirm against the CSV.
req = pd.read_csv("../data//corses_list.csv").iloc[:, 1:4]

In [3]:
# Tag every requirement row with its category. `.loc` slices by index label and
# includes both endpoints, so labels up to 14 become "required" and labels
# 15 through 46 become "electives"; any other row keeps the empty string.
req["type"] = ""
for row_labels, category in ((slice(None, 14), "required"), (slice(15, 46), "electives")):
    req.loc[row_labels, "type"] = category

## Transcripts

In [4]:
def parse_transcript(F, name, old_req):
    """Add one semester's grades to the requirements table.

    Each non-blank line of ``F`` is split on whitespace. The first two tokens
    are joined with an underscore to form the course code (``"CS 5620 ..."``
    becomes ``"CS_5620"``) and the second-to-last token is taken as the grade.

    Parameters
    ----------
    F : str
        Transcript text, one course per line. Empty and whitespace-only lines
        are ignored.
    name : str
        Name of the grade column to add (e.g. ``"Fall_2020"``).
    old_req : pandas.DataFrame
        Table with a ``"Code"`` column to merge the grades into.

    Returns
    -------
    pandas.DataFrame
        ``old_req`` left-merged with the parsed grades on ``"Code"``. Courses
        of ``old_req`` missing from the transcript get NaN in the new column;
        transcript courses missing from ``old_req`` are dropped.
    """
    rows = []
    for line in F.splitlines():
        tokens = line.split()
        if not tokens:
            # Blank or whitespace-only line: nothing to parse.
            continue
        rows.append([tokens[0] + "_" + tokens[1], tokens[-2]])
    # Naming the columns at construction keeps an empty transcript valid: the
    # merge then simply adds an all-NaN grade column.
    df = pd.DataFrame(rows, columns=["Code", name])
    new_req = pd.merge(old_req, df, on="Code", how="left")
    return new_req

In [5]:
# Fall 2020 transcript, one course per line. parse_transcript reads the first
# two tokens of a line as the course code and the second-to-last token as the
# grade; the remaining tokens (title, credit and point figures) are ignored.
fall_2020 = """
CS 5620 Database Mgmt Systems 3.000 3.000 B 9.000
CS 6010 Data Science Programming 3.000 3.000 A 12.000
MATH 6410 Probability Theory I 3.000 3.000 A 12.000
OR 6610 Linear-Integer Prgmg 3.000 3.000 A 12.000
"""

req = parse_transcript(fall_2020, "Fall_2020", old_req=req)

In [6]:
# Spring 2021 transcript (one course per line); merged into req as the
# "Spring_2021" grade column.
dat = """
MATH 6420 Math Statistics II 3.000 3.000 A 12.000
STAT 5160 Time Series Analysis 3.000 3.000 A 12.000
STAT 6440 Data Mining 3.000 3.000 A 12.000
"""
req = parse_transcript(dat, "Spring_2021", old_req=req)

In [7]:
# Fall 2021 transcript (one course per line); merged into req as the
# "Fall_2021" grade column.
# NOTE: the "S" grade of DATA 6910 is not a key of grade_dict (Export section),
# so it has no numeric grade there.
dat = """
DATA 6910 Data Science Project 3.000 3.000 S 0.000
MATH 6570 Statistical Computing 3.000 3.000 A 12.000
STAT 5020 Regression Analysis 3.000 3.000 A 12.000
"""
req = parse_transcript(dat, "Fall_2021", old_req=req)

In [8]:
# Spring 2022 transcript (one course per line); merged into req as the
# "Spring_2022" grade column.
dat = """
CS 6500 Big Data Analytics 3.000 3.000 A 12.000
MATH 6820 Topics In Math or Stat 3.000 3.000 A 12.000
MATH 7590 Gen Linear Models 3.000 3.000 A 12.000
"""
req = parse_transcript(dat, "Spring_2022", old_req=req)

In [9]:
# Fall 2022 transcript (one course per line); merged into req as the
# "Fall_2022" grade column.
dat = """
CS 5170 Intro Parallel Computing 3.000 3.000 A 12.000
CS 7300 Unsupervised Feature Learning 3.000 3.000 A 12.000
MATH 6490 Statistical Graphics 3.000 3.000 A 12.000
"""
req = parse_transcript(dat, "Fall_2022", old_req=req)

In [10]:
# Spring 2023 transcript (one course per line); merged into req as the
# "Spring_2023" grade column.
dat = """
CS 5630 Python for Comput & Data Sci 3.000 3.000 A 12.000
STAT 7440 Adv Topics in Data Mining 3.000 3.000 A 12.000
PHIL 6400 Data Science Ethics 3.000 3.000 A 12.000
"""
req = parse_transcript(dat, "Spring_2023", old_req=req)

In [11]:
# Fall 2023 transcript (one course per line); merged into req as the
# "Fall_2023" grade column.
# NOTE: every grade here is "IP" (presumably "in progress" — confirm), which is
# not a key of grade_dict (Export section), so these rows get no numeric grade.
dat = """
DATA 7770 Data Science Exploration 1.000 0.000 IP 0.000
MATH 6440 Stochastic Processes 3.000 0.000 IP 0.000
MATH 6480 Bayesian Decision Theory 3.000 0.000 IP 0.000
MATH 7570 Linear Stat Inference 3.000 0.000 IP 0.000
"""
req=parse_transcript(dat, "Fall_2023", old_req=req)

## Analysis

In [12]:
# Grade columns are picked out by the underscore in their name (the season
# labels passed to parse_transcript, e.g. "Fall_2020").
# NOTE(review): assumes no other column of req contains an underscore — confirm.
grade_columns = np.array(list(filter(lambda col: "_" in col, req.columns)))
# The first three seasons form the MS group; every later season the PhD group.
ms_grade_columns, phd_grade_columns = grade_columns[:3], grade_columns[3:]

## Export

In [13]:
# Reshape the wide table (one grade column per season) into long form:
# one row per (course, season) pair.
long_req = pd.melt(
    req,
    id_vars=["Code", "Name", "Credits", "type"],
    value_vars=grade_columns,
    var_name="season",
    value_name="grade",
)
# Season labels look like "Fall_2020": term before the underscore, year after.
season_parts = long_req["season"].str.split("_")
long_req["year_half"] = season_parts.str.get(0)
long_req["year"] = season_parts.str.get(1).astype("int")
# Letter grade -> grade points. Grades missing from the mapping map to NaN.
grade_dict = {"A":4,"B":3,"C":2, "D":1,"F":0,np.nan:0}
long_req["number_grade"] = long_req["grade"].map(grade_dict)
# Keep only the (course, season) pairs that have a grade. The explicit copy
# makes the result an independent frame, so adding columns to it in later cells
# does not raise SettingWithCopyWarning.
long_req = long_req[long_req["grade"].notna()].copy()

In [14]:
# Lecturer of each course, keyed by course code. Codes missing from this
# mapping get NaN, which the check at the end of the cell reports.
# To add a lecturer, add a `"Code": "Name"` entry here.
lector_by_code = {
    "OR_6610": "Christopher Rump",
    "STAT_6440": "Shuchismita Sarkar",
    "CS_6010": "Robert C. Green II",
    "CS_7300": "Qing Tian",
    "MATH_6490": "Junfeng Shang",
    "CS_5630": "Qing Tian",
    "STAT_7440": "Trent D. Buskirk",
    "PHIL_6400": "Justin Donhauser",
    "CS_5620": "Ray Kresman",
    "MATH_6410": "Wei Ning",
    "MATH_6420": "Wei Ning",
    "STAT_5160": "Yuhang Xu",
    "MATH_6570": "Maria Rizzo",
    "STAT_5020": "Shuchismita Sarkar",
    "DATA_6910": "Yuhang Xu",
    "CS_6500": "Shuteng Niu",
    "MATH_7590": "Hanfeng Chen",
    "MATH_6820": "Craig Zirbel",
    "CS_5170": "Robert C. Green II",
    "MATH_6440": "Craig Zirbel",
    "MATH_6480": "Wei Ning",
    "MATH_7570": "Junfeng Shang",
}
# A single vectorised lookup replaces one .loc assignment per course. It avoids
# writing strings into a float NaN column, and `assign` returns a new frame, so
# the cell can be re-run and never writes into a filtered view.
long_req = long_req.assign(Lector=long_req["Code"].map(lector_by_code))

# Override the catalog metadata for DATA_6910.
# NOTE(review): the displayed tables contain no DATA_6910 row, so these
# assignments may match nothing — confirm the code exists in the course list.
is_thesis = long_req["Code"] == "DATA_6910"
long_req.loc[is_thesis, "Name"] = "MS Thesis"
long_req.loc[is_thesis, "Credits"] = 3
long_req.loc[is_thesis, "type"] = "required"

# Report the courses that still have no lecturer assigned.
print(f'{long_req["Lector"].isna().sum()} Nans')
long_req[long_req["Lector"].isna()]

1 Nans


Unnamed: 0,Code,Name,Credits,type,season,grade,year_half,year,number_grade,Lector
290,DATA_7770,Data Science Exploration,1.0,required,Fall_2023,IP,Fall,2023,,


In [15]:
# Subject prefix is the part of the course code before the underscore ("CS_5620" -> "CS").
long_req["Subject"] = long_req["Code"].str.split("_").str[0]

In [16]:
# Persist the long-format table without the index; the Analysis section reloads this file.
long_req.to_csv(path_or_buf="../data//long_list_raw.csv", index=False)

In [17]:
# Display the exported long-format table for a visual check.
long_req

Unnamed: 0,Code,Name,Credits,type,season,grade,year_half,year,number_grade,Lector,Subject
17,CS_5620,Database Management Systems,3.0,electives,Fall_2020,B,Fall,2020,3.0,Ray Kresman,CS
18,CS_6010,Data Science Programming,3.0,electives,Fall_2020,A,Fall,2020,4.0,Robert C. Green II,CS
25,MATH_6410,Probability Theory I,3.0,electives,Fall_2020,A,Fall,2020,4.0,Wei Ning,MATH
39,OR_6610,Linear and Integer Programming,3.0,electives,Fall_2020,A,Fall,2020,4.0,Christopher Rump,OR
73,MATH_6420,Mathematical Statistics II,3.0,electives,Spring_2021,A,Spring,2021,4.0,Wei Ning,MATH
89,STAT_5160,Time Series Analysis,3.0,electives,Spring_2021,A,Spring,2021,4.0,Yuhang Xu,STAT
92,STAT_6440,Data Mining,3.0,electives,Spring_2021,A,Spring,2021,4.0,Shuchismita Sarkar,STAT
126,MATH_6570,Statistical Computing (new),3.0,electives,Fall_2021,A,Fall,2021,4.0,Maria Rizzo,MATH
135,STAT_5020,Regression Analysis,3.0,electives,Fall_2021,A,Fall,2021,4.0,Shuchismita Sarkar,STAT
143,CS_6500,Big Data Analytics,3.0,required,Spring_2022,A,Spring,2022,4.0,Shuteng Niu,CS


## Analysis

In [18]:
# Reload the exported table so the analysis below works from the saved file,
# then show how many rows it holds.
long_req = pd.read_csv("../data/long_list_raw.csv")
long_req.shape[0]

22

In [19]:
# Total credits of graded courses whose category is listed in type_cp.
type_cp = ["required"]
int(long_req.loc[long_req["type"].isin(type_cp) & long_req["grade"].notna(), "Credits"].sum())

19

In [20]:
# Subjects to show in the next cell. The previous first line computed every
# subject and was overwritten immediately; to list all subjects instead, use
# long_req["Code"].str.split("_").str.get(0).unique().
subj = ["CS"]

In [21]:
# Show the courses whose code prefix (the part before "_") is one of the selected subjects.
dff = long_req
dff.loc[dff["Code"].str.split("_").str[0].isin(subj)]

Unnamed: 0,Code,Name,Credits,type,season,grade,year_half,year,number_grade,Lector,Subject
0,CS_5620,Database Management Systems,3.0,electives,Fall_2020,B,Fall,2020,3.0,Ray Kresman,CS
1,CS_6010,Data Science Programming,3.0,electives,Fall_2020,A,Fall,2020,4.0,Robert C. Green II,CS
9,CS_6500,Big Data Analytics,3.0,required,Spring_2022,A,Spring,2022,4.0,Shuteng Niu,CS
12,CS_7300,Unsupervised Feature Learning,3.0,required,Fall_2022,A,Fall,2022,4.0,Qing Tian,CS
13,CS_5170,Introduction to Parallel Computing,3.0,electives,Fall_2022,A,Fall,2022,4.0,Robert C. Green II,CS
17,CS_5630,Python for Computational and Data Sciences,3.0,electives,Spring_2023,A,Spring,2023,4.0,Qing Tian,CS


In [22]:
# MS-stage GPA: credit-weighted mean of grade points over the MS seasons.
# Rows without a numeric grade (number_grade is NaN for grades that are not in
# grade_dict, such as "S" or "IP") are dropped rather than filled with 0:
# counting them as 0 points would wrongly lower the GPA, and a blanket
# fillna(0) also overwrote unrelated missing values (e.g. Lector) with 0.
ms_courses = long_req[
    long_req["season"].isin(ms_grade_columns) & long_req["grade"].notna()
].dropna(subset=["number_grade"])
ms_gpa = (ms_courses["Credits"] * ms_courses["number_grade"]).sum() / ms_courses["Credits"].sum()
print("MS Courses: GPA=", np.round(ms_gpa, 3))
ms_courses

MS Courses: GPS= 3.889


Unnamed: 0,Code,Name,Credits,type,season,grade,year_half,year,number_grade,Lector,Subject
0,CS_5620,Database Management Systems,3.0,electives,Fall_2020,B,Fall,2020,3.0,Ray Kresman,CS
1,CS_6010,Data Science Programming,3.0,electives,Fall_2020,A,Fall,2020,4.0,Robert C. Green II,CS
2,MATH_6410,Probability Theory I,3.0,electives,Fall_2020,A,Fall,2020,4.0,Wei Ning,MATH
3,OR_6610,Linear and Integer Programming,3.0,electives,Fall_2020,A,Fall,2020,4.0,Christopher Rump,OR
4,MATH_6420,Mathematical Statistics II,3.0,electives,Spring_2021,A,Spring,2021,4.0,Wei Ning,MATH
5,STAT_5160,Time Series Analysis,3.0,electives,Spring_2021,A,Spring,2021,4.0,Yuhang Xu,STAT
6,STAT_6440,Data Mining,3.0,electives,Spring_2021,A,Spring,2021,4.0,Shuchismita Sarkar,STAT
7,MATH_6570,Statistical Computing (new),3.0,electives,Fall_2021,A,Fall,2021,4.0,Maria Rizzo,MATH
8,STAT_5020,Regression Analysis,3.0,electives,Fall_2021,A,Fall,2021,4.0,Shuchismita Sarkar,STAT


In [23]:
# PhD-stage GPA: credit-weighted mean of grade points over the PhD seasons.
# Only rows without a numeric grade are dropped (e.g. "IP" courses). A bare
# dropna() would also discard graded courses that merely lack some other value,
# such as the lecturer name.
phd_courses = long_req[
    long_req["season"].isin(phd_grade_columns) & long_req["grade"].notna()
].dropna(subset=["number_grade"])
phd_gpa = (phd_courses["Credits"] * phd_courses["number_grade"]).sum() / phd_courses["Credits"].sum()
print("PhD Courses: GPA=", np.round(phd_gpa, 3))
phd_courses

MS Courses: GPA= 4.0


Unnamed: 0,Code,Name,Credits,type,season,grade,year_half,year,number_grade,Lector,Subject
9,CS_6500,Big Data Analytics,3.0,required,Spring_2022,A,Spring,2022,4.0,Shuteng Niu,CS
10,MATH_7590,Gen Linear Models and Ext,3.0,required,Spring_2022,A,Spring,2022,4.0,Hanfeng Chen,MATH
11,MATH_6820,"Topics in Mathematics or Statistics, with advi...",3.0,electives,Spring_2022,A,Spring,2022,4.0,Craig Zirbel,MATH
12,CS_7300,Unsupervised Feature Learning,3.0,required,Fall_2022,A,Fall,2022,4.0,Qing Tian,CS
13,CS_5170,Introduction to Parallel Computing,3.0,electives,Fall_2022,A,Fall,2022,4.0,Robert C. Green II,CS
14,MATH_6490,Statistical Graphics (new),3.0,electives,Fall_2022,A,Fall,2022,4.0,Junfeng Shang,MATH
15,PHIL_6400,Ethical Issues in Data Science,3.0,required,Spring_2023,A,Spring,2023,4.0,Justin Donhauser,PHIL
16,STAT_7440,Advanced Data Mining,3.0,required,Spring_2023,A,Spring,2023,4.0,Trent D. Buskirk,STAT
17,CS_5630,Python for Computational and Data Sciences,3.0,electives,Spring_2023,A,Spring,2023,4.0,Qing Tian,CS


In [24]:
# Required courses taken in the PhD seasons. Every row with a grade entry is
# kept, so courses without a numeric grade yet still count toward the total.
phd_required = long_req.loc[
    long_req["season"].isin(phd_grade_columns)
    & long_req["type"].eq("required")
    & long_req["grade"].notna()
]
print("Required:", phd_required["Credits"].sum())
phd_required

Required: 19.0


Unnamed: 0,Code,Name,Credits,type,season,grade,year_half,year,number_grade,Lector,Subject
9,CS_6500,Big Data Analytics,3.0,required,Spring_2022,A,Spring,2022,4.0,Shuteng Niu,CS
10,MATH_7590,Gen Linear Models and Ext,3.0,required,Spring_2022,A,Spring,2022,4.0,Hanfeng Chen,MATH
12,CS_7300,Unsupervised Feature Learning,3.0,required,Fall_2022,A,Fall,2022,4.0,Qing Tian,CS
15,PHIL_6400,Ethical Issues in Data Science,3.0,required,Spring_2023,A,Spring,2023,4.0,Justin Donhauser,PHIL
16,STAT_7440,Advanced Data Mining,3.0,required,Spring_2023,A,Spring,2023,4.0,Trent D. Buskirk,STAT
18,MATH_7570,Linear Stat Inference,3.0,required,Fall_2023,IP,Fall,2023,,Junfeng Shang,MATH
19,DATA_7770,Data Science Exploration,1.0,required,Fall_2023,IP,Fall,2023,,,DATA


In [25]:
# Elective courses taken in the PhD seasons. Every row with a grade entry is
# kept, so courses without a numeric grade yet still count toward the total.
phd_electives = long_req.loc[
    long_req["season"].isin(phd_grade_columns)
    & long_req["type"].eq("electives")
    & long_req["grade"].notna()
]
print("Electives:", phd_electives["Credits"].sum())
phd_electives

Electives: 18.0


Unnamed: 0,Code,Name,Credits,type,season,grade,year_half,year,number_grade,Lector,Subject
11,MATH_6820,"Topics in Mathematics or Statistics, with advi...",3.0,electives,Spring_2022,A,Spring,2022,4.0,Craig Zirbel,MATH
13,CS_5170,Introduction to Parallel Computing,3.0,electives,Fall_2022,A,Fall,2022,4.0,Robert C. Green II,CS
14,MATH_6490,Statistical Graphics (new),3.0,electives,Fall_2022,A,Fall,2022,4.0,Junfeng Shang,MATH
17,CS_5630,Python for Computational and Data Sciences,3.0,electives,Spring_2023,A,Spring,2023,4.0,Qing Tian,CS
20,MATH_6440,Stochastic Processes,3.0,electives,Fall_2023,IP,Fall,2023,,Craig Zirbel,MATH
21,MATH_6480,Bayesian Statistical Inference,3.0,electives,Fall_2023,IP,Fall,2023,,Wei Ning,MATH


In [26]:
# Courses recorded for the Fall 2023 season.
long_req.loc[long_req["season"].eq("Fall_2023") & long_req["grade"].notna()]

Unnamed: 0,Code,Name,Credits,type,season,grade,year_half,year,number_grade,Lector,Subject
18,MATH_7570,Linear Stat Inference,3.0,required,Fall_2023,IP,Fall,2023,,Junfeng Shang,MATH
19,DATA_7770,Data Science Exploration,1.0,required,Fall_2023,IP,Fall,2023,,,DATA
20,MATH_6440,Stochastic Processes,3.0,electives,Fall_2023,IP,Fall,2023,,Craig Zirbel,MATH
21,MATH_6480,Bayesian Statistical Inference,3.0,electives,Fall_2023,IP,Fall,2023,,Wei Ning,MATH
