See link https://www.bgsu.edu/graduate/catalogs-and-policies/graduate-catalog/data-science.html

In [148]:
import pandas as pd
import numpy as np

In [49]:
# requirements
req = pd.read_csv("./csv/corses_list.csv")
req = req.iloc[:,1:4]

In [50]:
req["type"] = ""
req.at[:14, "type"] = "required"
req.at[15:46, "type"] = "electives"

In [4]:
req.set_index("Code", inplace=True)

## Transripts

In [51]:
def parse_transcript(F, name, replace = False):
    global req
    df =  [[
        a.split()[0]+"_"+a.split()[1], a.split()[-2]
    ] for a in F.splitlines() if len(a)>0]
    df = pd.DataFrame(df)
    df.columns = ["Code", name]
    df.set_index("Code", inplace = True)
    new_req = pd.concat([req, df], axis=1)
    if(replace):
        req = new_req.copy()
    return new_req

### Fall 2020

In [52]:
fall_2020 = """
CS 5620 Database Mgmt Systems 3.000 3.000 B 9.000
CS 6010 Data Science Programming 3.000 3.000 A 12.000
MATH 6410 Probability Theory I 3.000 3.000 A 12.000
OR 6610 Linear-Integer Prgmg 3.000 3.000 A 12.000
"""

In [53]:
parse_transcript(fall_2020, "Fall_2020", replace=True);

In [54]:
req

Unnamed: 0,Code,Name,Credits,type,Fall_2020
0,CS_6260,Visualization,3.0,required,
1,CS_7200,Machine Learning,3.0,required,
2,CS_6500,Big Data Analytics,3.0,required,
3,CS_7300,Unsupervised Feature Learning,3.0,required,
4,MATH_7550,Statistical Learning I,3.0,required,
5,MATH_7560,Statistical Learning II,3.0,required,
6,MATH_7570,Linear Stat Inference,3.0,required,
7,MATH_7590,Gen Linear Models and Ext,3.0,required,
8,DATA_7770,Data Science Exploration,1.0,required,
9,DATA_7780,Data Science Communication,1.0,required,


### Spring_2021

In [8]:
dat = """
MATH 6420 Math Statistics II 3.000 3.000 A 12.000
STAT 5160 Time Series Analysis 3.000 3.000 A 12.000
STAT 6440 Data Mining 3.000 3.000 A 12.000
"""
parse_transcript(dat, "Spring_2021", replace=True);

### Fall_2021

In [9]:
dat = """
DATA 6910 Data Science Project 3.000 3.000 S 0.000
MATH 6570 Statistical Computing 3.000 3.000 A 12.000
STAT 5020 Regression Analysis 3.000 3.000 A 12.000
"""
parse_transcript(dat, "Fall_2021", replace=True);

### Spring_2022

In [10]:
dat = """
CS 6500 Big Data Analytics 3.000 3.000 A 12.000
MATH 6820 Topics In Math or Stat 3.000 3.000 A 12.000
MATH 7590 Gen Linear Models 3.000 3.000 A 12.000
"""
parse_transcript(dat, "Spring_2022", replace=True);

### Fall_2022

In [11]:
dat = """
CS 5170 Intro Parallel Computing 3.000 3.000 A 12.000
CS 7300 Unsupervised Feature Learning 3.000 3.000 A 12.000
MATH 6490 Statistical Graphics 3.000 3.000 A 12.000
"""
parse_transcript(dat, "Fall_2022", replace=True);

### Spring_2023

In [12]:
dat = """
CS 5630 Python for Comput & Data Sci 3.000 3.000 A 12.000
STAT 7440 Adv Topics in Data Mining 3.000 3.000 A 12.000
PHIL 6400 Data Science Ethics 3.000 3.000 A 12.000
"""
parse_transcript(dat, "Spring_2023", replace=True);

### Fall_2023

In [13]:
dat = """
DATA 7770 Data Science Exploration 1.000 0.000 IP 0.000
MATH 6440 Stochastic Processes 3.000 0.000 IP 0.000
MATH 6480 Bayesian Decision Theory 3.000 0.000 IP 0.000
MATH 7570 Linear Stat Inference 3.000 0.000 IP 0.000
"""
parse_transcript(dat, "Fall_2023", replace=True);

## Analysis

In [17]:
req.fillna("_")

Unnamed: 0,Name,Credits,type,Fall_2020,Spring_2021,Fall_2021,Spring_2022,Fall_2022,Spring_2023,Fall_2023
CS_6260,Visualization,3,required,_,_,_,_,_,_,_
CS_7200,Machine Learning,3,required,_,_,_,_,_,_,_
CS_6500,Big Data Analytics,3,required,_,_,_,A,_,_,_
CS_7300,Unsupervised Feature Learning,3,required,_,_,_,_,A,_,_
MATH_7550,Statistical Learning I,3,required,_,_,_,_,_,_,_
MATH_7560,Statistical Learning II,3,required,_,_,_,_,_,_,_
MATH_7570,Linear Stat Inference,3,required,_,_,_,_,_,_,IP
MATH_7590,Gen Linear Models and Ext,3,required,_,_,_,A,_,_,_
DATA_7770,Data Science Exploration,1,required,_,_,_,_,_,_,IP
DATA_7780,Data Science Communication,1,required,_,_,_,_,_,_,_


In [18]:
# my MS
taken_ms = req[ms_grade_columns].dropna(axis=0, how="all").index
print("Total hours:", req.loc[taken_ms].Credits.sum())
req.loc[taken_ms].fillna("_").iloc[:,:6]

Total hours: 27.0


Unnamed: 0,Name,Credits,type,Fall_2020,Spring_2021,Fall_2021
CS_5620,Database Management Systems,3,electives,B,_,_
CS_6010,Data Science Programming,3,electives,A,_,_
MATH_6410,Probability Theory I,3,electives,A,_,_
MATH_6420,Mathematical Statistics II,3,electives,_,A,_
MATH_6570,Statistical Computing (new),3,electives,_,_,A
OR_6610,Linear and Integer Programming,3,electives,A,_,_
STAT_5020,Regression Analysis,3,electives,_,_,A
STAT_5160,Time Series Analysis,3,electives,_,A,_
STAT_6440,Data Mining,3,electives,_,A,_
DATA_6910,_,_,_,_,_,S


In [19]:
# required, that I have taken
print("Required Courses")
taken_required = req[req.type == "required"][phd_grade_columns].dropna(axis=0, how="all").index
print("Total hours:",int(req.loc[taken_required].Credits.sum())," c.h.")
req.loc[taken_required].fillna("-").iloc[:, np.r_[0, 1, 2, 6, 7, 8, 9]]

Required Courses
Total hours: 19  c.h.


Unnamed: 0,Name,Credits,type,Spring_2022,Fall_2022,Spring_2023,Fall_2023
CS_6500,Big Data Analytics,3.0,required,A,-,-,-
CS_7300,Unsupervised Feature Learning,3.0,required,-,A,-,-
MATH_7570,Linear Stat Inference,3.0,required,-,-,-,IP
MATH_7590,Gen Linear Models and Ext,3.0,required,A,-,-,-
DATA_7770,Data Science Exploration,1.0,required,-,-,-,IP
PHIL_6400,Ethical Issues in Data Science,3.0,required,-,-,A,-
STAT_7440,Advanced Data Mining,3.0,required,-,-,A,-


In [20]:
# selectives, that I have taken
print("Selective Courses")
taken_eletives = req[req.type == "electives"][phd_grade_columns].dropna(axis=0, how="all").index
print("Total hours:",int(req.loc[taken_eletives].Credits.sum())," c.h.")
req.loc[taken_eletives].fillna("-").iloc[:, np.r_[0, 1, 2, 6, 7, 8, 9]]

Selective Courses
Total hours: 18  c.h.


Unnamed: 0,Name,Credits,type,Spring_2022,Fall_2022,Spring_2023,Fall_2023
CS_5170,Introduction to Parallel Computing,3.0,electives,-,A,-,-
CS_5630,Python for Computational and Data Sciences,3.0,electives,-,-,A,-
MATH_6440,Stochastic Processes,3.0,electives,-,-,-,IP
MATH_6480,Bayesian Statistical Inference,3.0,electives,-,-,-,IP
MATH_6490,Statistical Graphics (new),3.0,electives,-,A,-,-
MATH_6820,"Topics in Mathematics or Statistics, with advi...",3.0,electives,A,-,-,-


In [21]:
16+18

34

In [22]:
print("Not Passed")
passed = req[grade_columns].dropna(axis=0, how="all").index
req.loc[~req.index.isin(passed)].iloc[:,:2]

Not Passed


Unnamed: 0,Name,Credits
CS_6260,Visualization,3.0
CS_7200,Machine Learning,3.0
MATH_7550,Statistical Learning I,3.0
MATH_7560,Statistical Learning II,3.0
DATA_7780,Data Science Communication,1.0
XXXX_XXXX,Applied Data Science Experience,3.0
DATA_7890,Internship,1.0
DATA_7930,Directed Reading,3.0
CS_5200,Artificial Intelligence Methods,3.0
CS_6200,Advanced Topics in Artificial Intelligence,3.0


## Export

In [177]:
long_req = pd.melt(req.reset_index().rename(columns={"index":"Code"}), 
        id_vars=["Code", "Name","Credits","type"], value_vars=grade_columns, var_name="season",value_name="grade")
long_req["year_half"] = long_req.season.str.split("_").str.get(0)
long_req["year"] = long_req.season.str.split("_").str.get(1).astype("int")
grade_dict = {"A":4,"B":3,"C":2, "D":1,"F":0,np.nan:0}
long_req["number_grade"] = long_req["grade"].map(grade_dict)

long_req.to_excel("./excel/long_list_raw.xlsx")
long_req.to_csv("./csv/long_list_raw.csv")

NameError: name 'grade_columns' is not defined

In [178]:
long_req = pd.read_csv("./csv/long_list_raw.csv")
long_req = long_req[~long_req["grade"].isna()]
# long_req = long_req.reset_index().drop(columns = ["Unnamed: 0.1", "Unnamed: 0", "index"])

In [179]:
long_req["Lector"] = np.nan
long_req.loc[long_req["Code"]=="OR_6610", "Lector"] = "Christopher Rump"
long_req.loc[long_req["Code"]=="STAT_6440", "Lector"] = "Shuchismita Sarkar"
long_req.loc[long_req["Code"]=="CS_6010", "Lector"] = "Robert C. Green II"
long_req.loc[long_req["Code"]=="CS_7300", "Lector"] = "Qing Tian"
long_req.loc[long_req["Code"]=="MATH_6490", "Lector"] = "Junfeng Shang"
long_req.loc[long_req["Code"]=="CS_5630", "Lector"] = "Qing Tian"
long_req.loc[long_req["Code"]=="STAT_7440", "Lector"] = "Trent D. Buskirk"
long_req.loc[long_req["Code"]=="PHIL_6400", "Lector"] = "Justin Donhauser"
long_req.loc[long_req["Code"]=="CS_5620", "Lector"] = "Ray Kresman"
long_req.loc[long_req["Code"]=="MATH_6410", "Lector"] = "Wei Ning"
long_req.loc[long_req["Code"]=="MATH_6420", "Lector"] = "Wei Ning"
long_req.loc[long_req["Code"]=="STAT_5160", "Lector"] = "Yuhang Xu"
long_req.loc[long_req["Code"]=="MATH_6570", "Lector"] = "Maria Rizzo"
long_req.loc[long_req["Code"]=="STAT_5020", "Lector"] = "Shuchismita Sarkar"
long_req.loc[long_req["Code"]=="DATA_6910", "Lector"] = "Yuhang Xu"
long_req.loc[long_req["Code"]=="CS_6500", "Lector"] = "Shuteng Niu"
long_req.loc[long_req["Code"]=="MATH_7590", "Lector"] = "Hanfeng Chen"
long_req.loc[long_req["Code"]=="MATH_6820", "Lector"] = "Craig Zirbel"
long_req.loc[long_req["Code"]=="CS_5170", "Lector"] = "Robert C. Green II"
long_req.loc[long_req["Code"]=="MATH_6440", "Lector"] = "Craig Zirbel"
long_req.loc[long_req["Code"]=="MATH_6480", "Lector"] = "Wei Ning"
long_req.loc[long_req["Code"]=="MATH_7570", "Lector"] = "Junfeng Shang"
long_req.loc[long_req["Code"]=="DATA_6910", "Name"] = "MS Thesis"
long_req.loc[long_req["Code"]=="DATA_6910", "Credits"] = 3
long_req.loc[long_req["Code"]=="DATA_6910", "type"] = "required"
#long_req.loc[long_req["Code"]=="???", "Lector"] = "???"
print(f'{long_req["Lector"].isna().sum()} Nans')
long_req[long_req["Lector"].isna()]

1 Nans


Unnamed: 0,Code,Name,Credits,type,season,grade,year_half,year,number_grade,Lector,Subject
20,DATA_7770,Data Science Exploration,1.0,required,Fall_2023,IP,Fall,2023,,,DATA


In [180]:
long_req["Subject"] = long_req["Code"].str.split("_").str.get(0)

In [181]:
# long_req.to_excel("./excel/long_list_raw.xlsx")
long_req.to_csv("./csv/long_list_raw.csv", index=False)

## Analysis

In [135]:
long_req = pd.read_csv("csv/long_list_raw.csv")
len(long_req)

23

In [136]:
type_cp = ["required"]
int(long_req[(long_req["type"].isin(type_cp)) & (~long_req["grade"].isna())]["Credits"].sum())

19

In [137]:
subj = long_req["Code"].str.split("_").str.get(0).unique()
subj = ["CS"]

In [138]:
dff = long_req
dff[dff["Code"].str.split("_").str.get(0).isin(subj)]

Unnamed: 0.1,Unnamed: 0,Code,Name,Credits,type,season,grade,year_half,year,number_grade,Lector
0,0,CS_5620,Database Management Systems,3.0,electives,Fall_2020,B,Fall,2020,3.0,Ray Kresman
1,1,CS_6010,Data Science Programming,3.0,electives,Fall_2020,A,Fall,2020,4.0,Robert C. Green II
10,10,CS_6500,Big Data Analytics,3.0,required,Spring_2022,A,Spring,2022,4.0,Shuteng Niu
13,13,CS_7300,Unsupervised Feature Learning,3.0,required,Fall_2022,A,Fall,2022,4.0,Qing Tian
14,14,CS_5170,Introduction to Parallel Computing,3.0,electives,Fall_2022,A,Fall,2022,4.0,Robert C. Green II
18,18,CS_5630,Python for Computational and Data Sciences,3.0,electives,Spring_2023,A,Spring,2023,4.0,Qing Tian


In [26]:
dff = long_req
dff = dff[(dff["number_grade"]>0) & (~dff["Credits"].isna())]
# len(dff)
a


3.9444444444444446

In [None]:
long_req["a"]

In [14]:
long_req["season"].unique()[3:]

array(['Spring_2022', 'Fall_2022', 'Spring_2023', 'Fall_2023'],
      dtype=object)

In [36]:
grade_columns = np.array([c for c in req.columns if "_" in c])
ms_grade_columns = grade_columns[:3]
phd_grade_columns = grade_columns[3:]

In [51]:
ms_courses = long_req[
    (long_req["season"].isin(ms_grade_columns)) &
    (~long_req["grade"].isna())
].fillna(0)
ms_gpa = sum(ms_courses["Credits"]*ms_courses["number_grade"])/sum(ms_courses["Credits"])
print("MS Courses: GPS=", np.round(ms_gpa, 3))
ms_courses

MS Courses: GPS= 3.889


Unnamed: 0,Code,Name,Credits,type,season,grade,year_half,year,number_grade
17,CS_5620,Database Management Systems,3.0,electives,Fall_2020,B,Fall,2020,3.0
18,CS_6010,Data Science Programming,3.0,electives,Fall_2020,A,Fall,2020,4.0
25,MATH_6410,Probability Theory I,3.0,electives,Fall_2020,A,Fall,2020,4.0
39,OR_6610,Linear and Integer Programming,3.0,electives,Fall_2020,A,Fall,2020,4.0
74,MATH_6420,Mathematical Statistics II,3.0,electives,Spring_2021,A,Spring,2021,4.0
90,STAT_5160,Time Series Analysis,3.0,electives,Spring_2021,A,Spring,2021,4.0
93,STAT_6440,Data Mining,3.0,electives,Spring_2021,A,Spring,2021,4.0
128,MATH_6570,Statistical Computing (new),3.0,electives,Fall_2021,A,Fall,2021,4.0
137,STAT_5020,Regression Analysis,3.0,electives,Fall_2021,A,Fall,2021,4.0
143,DATA_6910,0,0.0,0,Fall_2021,S,Fall,2021,0.0


In [56]:
phd_courses = long_req[
    (long_req["season"].isin(phd_grade_columns)) &
    (~long_req["grade"].isna())
].dropna()
phd_gpa = sum(phd_courses["Credits"]*phd_courses["number_grade"])/sum(phd_courses["Credits"])
print("MS Courses: GPA=", np.round(phd_gpa, 3))
phd_courses

MS Courses: GPA= 4.0


Unnamed: 0,Code,Name,Credits,type,season,grade,year_half,year,number_grade
146,CS_6500,Big Data Analytics,3.0,required,Spring_2022,A,Spring,2022,4.0
151,MATH_7590,Gen Linear Models and Ext,3.0,required,Spring_2022,A,Spring,2022,4.0
179,MATH_6820,"Topics in Mathematics or Statistics, with advi...",3.0,electives,Spring_2022,A,Spring,2022,4.0
195,CS_7300,Unsupervised Feature Learning,3.0,required,Fall_2022,A,Fall,2022,4.0
207,CS_5170,Introduction to Parallel Computing,3.0,electives,Fall_2022,A,Fall,2022,4.0
223,MATH_6490,Statistical Graphics (new),3.0,electives,Fall_2022,A,Fall,2022,4.0
250,PHIL_6400,Ethical Issues in Data Science,3.0,required,Spring_2023,A,Spring,2023,4.0
251,STAT_7440,Advanced Data Mining,3.0,required,Spring_2023,A,Spring,2023,4.0
263,CS_5630,Python for Computational and Data Sciences,3.0,electives,Spring_2023,A,Spring,2023,4.0


In [57]:
phd_required = long_req[
     (long_req["season"].isin(phd_grade_columns)) &
     (long_req["type"] == "required") &
     (~long_req["grade"].isna())
]
print("Required:", phd_required["Credits"].sum())
phd_required

Required: 19.0


Unnamed: 0,Code,Name,Credits,type,season,grade,year_half,year,number_grade
146,CS_6500,Big Data Analytics,3.0,required,Spring_2022,A,Spring,2022,4.0
151,MATH_7590,Gen Linear Models and Ext,3.0,required,Spring_2022,A,Spring,2022,4.0
195,CS_7300,Unsupervised Feature Learning,3.0,required,Fall_2022,A,Fall,2022,4.0
250,PHIL_6400,Ethical Issues in Data Science,3.0,required,Spring_2023,A,Spring,2023,4.0
251,STAT_7440,Advanced Data Mining,3.0,required,Spring_2023,A,Spring,2023,4.0
294,MATH_7570,Linear Stat Inference,3.0,required,Fall_2023,IP,Fall,2023,
296,DATA_7770,Data Science Exploration,1.0,required,Fall_2023,IP,Fall,2023,


In [27]:
phd_electives = long_req[
     (long_req["season"].isin(phd_grade_columns)) &
     (long_req["type"] == "electives") &
     (~long_req["grade"].isna())
]
print("Electives:", phd_electives["Credits"].sum())
phd_electives

Electives: 18.0


Unnamed: 0,Code,Name,Credits,type,season,grade,year_half,year,number_grade
179,MATH_6820,"Topics in Mathematics or Statistics, with advi...",3.0,electives,Spring_2022,A,Spring,2022,4.0
207,CS_5170,Introduction to Parallel Computing,3.0,electives,Fall_2022,A,Fall,2022,4.0
223,MATH_6490,Statistical Graphics (new),3.0,electives,Fall_2022,A,Fall,2022,4.0
263,CS_5630,Python for Computational and Data Sciences,3.0,electives,Spring_2023,A,Spring,2023,4.0
315,MATH_6440,Stochastic Processes,3.0,electives,Fall_2023,IP,Fall,2023,
318,MATH_6480,Bayesian Statistical Inference,3.0,electives,Fall_2023,IP,Fall,2023,


In [58]:
long_req[(long_req["season"]=="Fall_2023") & (~long_req["grade"].isna()) ]

Unnamed: 0,Code,Name,Credits,type,season,grade,year_half,year,number_grade
294,MATH_7570,Linear Stat Inference,3.0,required,Fall_2023,IP,Fall,2023,
296,DATA_7770,Data Science Exploration,1.0,required,Fall_2023,IP,Fall,2023,
315,MATH_6440,Stochastic Processes,3.0,electives,Fall_2023,IP,Fall,2023,
318,MATH_6480,Bayesian Statistical Inference,3.0,electives,Fall_2023,IP,Fall,2023,
