In [150]:
import pandas as pd
import glob

# Get a list of all CSV file paths in the data folder
csv_files = glob.glob('data/*.csv')

# Load each CSV file as a dataframe
dataframes = {}
for file in csv_files:
    # Extract the filename without extension as the dataframe name
    dataframe_name = file.split('/')[-1].split('\\')[-1].split('.')[0]
    # Load the CSV file as a dataframe
    dataframes[dataframe_name] = pd.read_csv(file)

# Access the dataframes using their names
enlistment_facts_df = dataframes['enlistment_facts']
classes_df = dataframes['classes']
departments_df = dataframes['departments']
schools_df = dataframes['schools']
students_df = dataframes['students']
payments_df = dataframes['payments']


In [151]:
enlistment_facts_df.head()

Unnamed: 0,student_id,class_id,payment_id
0,1,21,1
1,1,32,2
2,1,35,3
3,1,44,4
4,1,11,5


In [152]:
classes_df.head()

Unnamed: 0,class_id,department_id,section_label,units,rate_per_unit,revenue_per_student,class_year,class_semester,class_type,class_cost
0,1,16,V,3,5000,15000,2023,2nd,Lecture,6031
1,2,17,Q,2,5000,10000,2023,IS,Lab,12229
2,3,8,A,3,5000,15000,2024,2nd,Lecture,9410
3,4,1,Z,3,4500,13500,2024,1st,Lecture,9030
4,5,12,Y,3,5000,15000,2023,1st,Lecture,5517


In [153]:
departments_df.head()

Unnamed: 0,department_id,department_name,school_id
0,1,Department of Computer Science,1
1,2,Department of Information Systems and Computer...,1
2,3,Department of Mathematics,1
3,4,Department of Physics,1
4,5,Department of Chemistry,1


In [154]:
schools_df.head()

Unnamed: 0,school_id,school_name
0,1,School of Science and Engineering
1,2,John Gokongwei School of Management
2,3,School of Humanities
3,4,School of Social Sciences
4,5,Gokongwei Brothers School of Education and Lea...


In [155]:
students_df.head()

Unnamed: 0,student_id,student_name,year_level
0,1,William Chung,4
1,2,Rachel Golden,2
2,3,Mark Wright,2
3,4,Jessica Castro,4
4,5,Laura Wilcox,2


In [156]:
# Merge the Enlistment Facts with Classes, Departments, and Schools
merged_df = pd.merge(enlistment_facts_df, classes_df, how='left', on='class_id')
merged_df = pd.merge(merged_df, departments_df, how='left', on='department_id')
merged_df = pd.merge(merged_df, schools_df, how='left', on='school_id')
merged_df = pd.merge(merged_df, payments_df, how='left', on='payment_id')
merged_df = pd.merge(merged_df, students_df, how='left', on='student_id')

merged_df


Unnamed: 0,student_id,class_id,payment_id,department_id,section_label,units,rate_per_unit,revenue_per_student,class_year,class_semester,class_type,class_cost,department_name,school_id,school_name,payment_status,student_name,year_level
0,1,21,1,7,O,3,4500,13500,2024,IS,Lecture,5692,Department of Psychology,4,School of Social Sciences,On Time,William Chung,4
1,1,32,2,6,R,3,4500,13500,2024,2nd,Lecture,7595,Department of Biology,1,School of Science and Engineering,On Time,William Chung,4
2,1,35,3,2,Q,3,5000,15000,2024,2nd,Lecture,8054,Department of Information Systems and Computer...,1,School of Science and Engineering,On Time,William Chung,4
3,1,44,4,7,F,3,4500,13500,2024,1st,Lecture,7557,Department of Psychology,4,School of Social Sciences,On Time,William Chung,4
4,1,11,5,9,K,3,5000,15000,2023,IS,Lecture,8125,Department of Philosophy,3,School of Humanities,On Time,William Chung,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,100,25,496,7,Y,3,5000,15000,2024,1st,Lecture,5652,Department of Psychology,4,School of Social Sciences,On Time,Ryan Huffman,4
496,100,18,497,7,V,3,4500,13500,2023,IS,Lecture,5142,Department of Psychology,4,School of Social Sciences,On Time,Ryan Huffman,4
497,100,32,498,6,R,3,4500,13500,2024,2nd,Lecture,7595,Department of Biology,1,School of Science and Engineering,On Time,Ryan Huffman,4
498,100,8,499,1,Z,3,5000,15000,2023,1st,Lecture,9121,Department of Computer Science,1,School of Science and Engineering,On Time,Ryan Huffman,4


## Extracting Metrics

In [157]:
# Revenue per school and department for the current semester vs the last semester of the same school year

current_sem_2024_df = merged_df.loc[(merged_df['class_semester'] == '2nd') & (merged_df['class_year'] == 2024)]
current_sem_2024_df = current_sem_2024_df[["revenue_per_student", "department_id"]]
current_sem_2024_df = current_sem_2024_df.groupby(['department_id']).sum()
current_sem_2024_df = current_sem_2024_df.rename(columns={"revenue_per_student": "revenue"})
current_sem_2024_df

# incomplete; data is skewed; not all depts making revenue in a sem

Unnamed: 0_level_0,revenue
department_id,Unnamed: 1_level_1
2,120000
3,175500
6,175500
8,315000
9,105000
12,105000
14,240000


In [158]:
last_sem_2024_df = merged_df.loc[(merged_df['class_semester'] == '1st') & (merged_df['class_year'] == 2024)]
last_sem_2024_df = last_sem_2024_df[["revenue_per_student", "department_id"]]
last_sem_2024_df = last_sem_2024_df.groupby(['department_id']).sum()
last_sem_2024_df = last_sem_2024_df.rename(columns={"revenue_per_student": "revenue"})
last_sem_2024_df

# incomplete; data is skewed; not all depts making revenue in a sem

Unnamed: 0_level_0,revenue
department_id,Unnamed: 1_level_1
1,267000
7,259500
15,75000
16,162000
17,165000


In [159]:
# Revenue per school and department for the current semester of the current year vs the same semester of the last school year

current_sem_2023_df = merged_df.loc[(merged_df['class_semester'] == '2nd') & (merged_df['class_year'] == 2023)]
current_sem_2023_df = current_sem_2023_df[["revenue_per_student", "department_id"]]
current_sem_2023_df = current_sem_2023_df.groupby(['department_id']).sum()
current_sem_2023_df = current_sem_2023_df.rename(columns={"revenue_per_student": "revenue"})
current_sem_2023_df

# incomplete; data is skewed; not all depts making revenue in a sem

Unnamed: 0_level_0,revenue
department_id,Unnamed: 1_level_1
5,148500
8,80000
9,108000
10,148500
13,165000
16,210000
17,310500
18,94500


In [161]:
# Which school offers the most profitable classes?

school_profit_df = merged_df[["school_id", "units", "rate_per_unit", "class_cost"]]
school_profit_df['revenue'] = school_profit_df.apply(lambda row: row.units * row.rate_per_unit, axis=1)
school_profit_df['profit'] = school_profit_df.apply(lambda row: row.revenue - row.class_cost, axis=1)
school_profit_df = school_profit_df.groupby(['school_id']).sum()
school_profit_df = pd.merge(school_profit_df, schools_df, how='inner', on='school_id')
school_profit_df = school_profit_df[["school_id", "school_name", "profit"]]
school_profit_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  school_profit_df['revenue'] = school_profit_df.apply(lambda row: row.units * row.rate_per_unit, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  school_profit_df['profit'] = school_profit_df.apply(lambda row: row.revenue - row.class_cost, axis=1)


Unnamed: 0,school_id,school_name,profit
0,1,School of Science and Engineering,640212
1,2,John Gokongwei School of Management,636284
2,3,School of Humanities,369868
3,4,School of Social Sciences,1187814


In [162]:
# Which department offers the most profitable classes?

dept_profit_df = merged_df[["department_id", "units", "rate_per_unit", "class_cost"]]
dept_profit_df['revenue'] = dept_profit_df.apply(lambda row: row.units * row.rate_per_unit, axis=1)
dept_profit_df['profit'] = dept_profit_df.apply(lambda row: row.revenue - row.class_cost, axis=1)
dept_profit_df = dept_profit_df.groupby(['department_id']).sum()
dept_profit_df = pd.merge(dept_profit_df, departments_df, how='inner', on='department_id')
dept_profit_df = dept_profit_df[["department_id", "department_name", "profit"]]
dept_profit_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dept_profit_df['revenue'] = dept_profit_df.apply(lambda row: row.units * row.rate_per_unit, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dept_profit_df['profit'] = dept_profit_df.apply(lambda row: row.revenue - row.class_cost, axis=1)


Unnamed: 0,department_id,department_name,profit
0,1,Department of Computer Science,186624
1,2,Department of Information Systems and Computer...,62762
2,3,Department of Mathematics,75101
3,4,Department of Physics,147800
4,5,Department of Chemistry,91160
5,6,Department of Biology,76765
6,7,Department of Psychology,408167
7,8,Department of History,195033
8,9,Department of Philosophy,232012
9,10,Department of English,83666


In [166]:
# Which type of class offers the most profitable classes?

class_type_profit_df = merged_df[["class_type", "units", "rate_per_unit", "class_cost"]]
class_type_profit_df['revenue'] = class_type_profit_df.apply(lambda row: row.units * row.rate_per_unit, axis=1)
class_type_profit_df['profit'] = class_type_profit_df.apply(lambda row: row.revenue - row.class_cost, axis=1)
class_type_profit_df = class_type_profit_df.groupby(['class_type']).sum()
class_type_profit_df = class_type_profit_df[["profit"]]
class_type_profit_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  class_type_profit_df['revenue'] = class_type_profit_df.apply(lambda row: row.units * row.rate_per_unit, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  class_type_profit_df['profit'] = class_type_profit_df.apply(lambda row: row.revenue - row.class_cost, axis=1)


Unnamed: 0_level_0,profit
class_type,Unnamed: 1_level_1
Lab,-100051
Lecture,2934229
