In [1]:
import pandas as pd
from scipy.stats import ttest_ind
from scipy.stats import skew

In [2]:
# Load the dataframe.
# To analyse a different dataset, change DATA_FILE to one of:
#   'Evaluation-Final.csv', 'Performance-Final.csv',
#   'Evaluation-Mid.csv',   'Performance-Mid.csv'
# The cells below read the semester columns F20, F21, F22, F23, S21, S22, S23, S24.
DATA_FILE = 'Performance-Mid.csv'
df = pd.read_csv(DATA_FILE)

In [3]:
# Descriptive statistics for the 8 semester columns, shown to 2 decimals.
summary_short = df.describe()
summary_short_df = pd.DataFrame(summary_short).round(2)
summary_short_df

Unnamed: 0,F20,F21,F22,F23,S21,S22,S23,S24
count,127.0,130.0,128.0,130.0,111.0,116.0,114.0,118.0
mean,75.31,75.64,76.03,78.64,72.81,68.92,67.08,68.96
std,19.79,17.16,16.41,13.82,18.64,18.32,18.07,16.12
min,0.0,0.0,0.0,21.57,0.0,0.0,0.0,0.0
25%,68.47,67.54,70.84,71.88,63.59,56.85,58.41,55.25
50%,78.87,76.76,79.04,80.56,75.05,70.26,68.98,70.78
75%,87.76,85.98,84.74,87.73,85.66,83.52,79.65,81.91
max,96.89,99.92,94.44,96.95,98.78,94.61,89.84,96.94


##### I will create merged variables from the original data:
- 2 "megavariables" that combine the 4 fall semesters and the 4 spring semesters, respectively.
- 4 variables that each combine the fall and spring semesters of the same academic year (e.g. F20 + S21).
- 1 variable that pools all 8 semesters (F20-S24).

In [4]:
def pool_columns(frame, columns, label):
    """Pool the non-missing values of several columns into a single column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table that holds the columns to combine.
    columns : list of str
        Names of the columns in ``frame`` to pool.
    label : str
        Name given to the single column of the result.

    Returns
    -------
    pandas.DataFrame
        One column called ``label`` with a fresh 0..n-1 index. Values are
        taken row by row across ``columns``; missing values are removed.
    """
    # The explicit dropna() keeps the result free of NaN whether or not
    # stack() itself discards missing values.
    pooled = frame[columns].stack().dropna().reset_index(drop=True)
    return pooled.to_frame(name=label)


# Fall: Harkness and Interviews teaching method
F20_23 = pool_columns(df, ['F20', 'F21', 'F22', 'F23'], 'F20-23')

# Spring: Traditional teaching method
S21_24 = pool_columns(df, ['S21', 'S22', 'S23', 'S24'], 'S21-24')

# The entire dataset (all 8 semesters); the column is labelled like the variable
F20_S24 = pool_columns(
    df, ['F20', 'S21', 'F21', 'S22', 'F22', 'S23', 'F23', 'S24'], 'F20-S24'
)

# Longest of the three pooled variables (the all-semesters one, since it
# contains every value of the other two)
max_length = max(len(F20_23), len(S21_24), len(F20_S24))
# Pad df with NaN rows so the pooled columns fit next to the originals.
# This relies on df having the default 0..n-1 row index.
df = df.reindex(range(max_length))

# Academic years: the fall semester together with the following spring semester
F20_S21 = pool_columns(df, ['F20', 'S21'], 'F20-S21')
F21_S22 = pool_columns(df, ['F21', 'S22'], 'F21-S22')
F22_S23 = pool_columns(df, ['F22', 'S23'], 'F22-S23')
F23_S24 = pool_columns(df, ['F23', 'S24'], 'F23-S24')

# Attach every pooled variable to df as an explicit Series; rows align on the
# 0..n-1 index and shorter variables are NaN-padded at the bottom.
for pooled in (F20_S21, F21_S22, F22_S23, F23_S24, F20_23, S21_24, F20_S24):
    label = pooled.columns[0]
    df[label] = pooled[label]

df
# Save DataFrame to CSV
#df.to_csv('output.csv', index=False)

Unnamed: 0,F20,F21,F22,F23,S21,S22,S23,S24,F20-S21,F21-S22,F22-S23,F23-S24,F20-23,S21-24,F20-S24
0,96.41,94.45,91.78,96.67,66.41,61.45,71.76,61.32,96.41,94.45,91.78,96.67,96.41,66.41,96.41
1,80.75,73.38,90.85,73.21,62.90,90.53,87.82,84.42,66.41,61.45,71.76,61.32,94.45,61.45,66.41
2,89.85,68.68,69.30,91.08,53.98,87.02,83.53,85.10,80.75,73.38,90.85,73.21,91.78,71.76,94.45
3,90.21,65.98,69.22,81.25,75.05,54.11,69.14,54.99,62.90,90.53,87.82,84.42,96.67,61.32,61.45
4,89.44,67.63,75.53,83.09,77.02,66.23,78.99,48.21,89.85,68.68,69.30,91.08,80.75,62.90,91.78
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
969,,,,,,,,,,,,,,,22.01
970,,,,,,,,,,,,,,,0.00
971,,,,,,,,,,,,,,,21.57
972,,,,,,,,,,,,,,,0.00


In [5]:
# Summary statistics for the semester columns and the merged variables,
# with the sample skewness added as an extra statistic.
skewness = df.apply(lambda column: skew(column.dropna()))

summary_all = df.describe()
summary_all.loc['skew'] = skewness

# Row (statistic) order and column (variable) order used in the report
order = ['count', 'mean', 'std', 'skew', 'min', '25%', '50%', '75%', 'max']
column_order = [
    'F20', 'F21', 'F22', 'F23', 'F20-23',
    'S21', 'S22', 'S23', 'S24', 'S21-24',
    'F20-S21', 'F21-S22', 'F22-S23', 'F23-S24', 'F20-S24',
]
summary_all = summary_all.loc[order, column_order]

# One row per variable, one column per statistic, rounded to 2 decimals
summary_all_df = pd.DataFrame(summary_all.T).round(2)
summary_all_df

Unnamed: 0,count,mean,std,skew,min,25%,50%,75%,max
F20,127.0,75.31,19.79,-2.41,0.0,68.47,78.87,87.76,96.89
F21,130.0,75.64,17.16,-1.65,0.0,67.54,76.76,85.98,99.92
F22,128.0,76.03,16.41,-2.64,0.0,70.84,79.04,84.74,94.44
F23,130.0,78.64,13.82,-1.6,21.57,71.88,80.56,87.73,96.95
F20-23,515.0,76.41,16.92,-2.24,0.0,69.44,78.72,86.7,99.92
S21,111.0,72.81,18.64,-1.52,0.0,63.59,75.05,85.66,98.78
S22,116.0,68.92,18.32,-1.28,0.0,56.85,70.26,83.52,94.61
S23,114.0,67.08,18.07,-1.7,0.0,58.41,68.98,79.65,89.84
S24,118.0,68.96,16.12,-0.65,0.0,55.25,70.78,81.91,96.94
S21-24,459.0,69.41,17.86,-1.3,0.0,57.94,71.11,82.74,98.78


##### Series of two-sample t-tests for the difference in means

In [6]:
# Start with the pooled comparison: all Fall scores against all Spring scores.
print("Two samples t-test for the difference in means between \n Fall (20,21,22,23) and Spring (21,22,23,24) pooled semesters")

# Independent two-sample t-test with scipy's default settings
fall_pooled = df['F20-23'].dropna()
spring_pooled = df['S21-24'].dropna()
t_statistic, p_value = ttest_ind(fall_pooled, spring_pooled)

print(f"t-statistic: {t_statistic:.3f}")
print(f"p-value: {p_value}")

# Verdict at the 5% significance level
print(
    "There is a significant difference between the two groups."
    if p_value < 0.05
    else "There is no significant difference between the two groups."
)

Two samples t-test for the difference in means between 
 Fall (20,21,22,23) and Spring (21,22,23,24) pooled semesters
t-statistic: 6.279
p-value: 5.138247087277026e-10
There is a significant difference between the two groups.


In [7]:
# The remaining tests compare Fall with Spring within each academic year.

# Academic year 2020/21: Fall 2020 vs Spring 2021
print("Two samples t-test for the difference in means between \n Fall'2020 vs Spring'2021 semesters")

# Independent two-sample t-test with scipy's default settings
fall_2020 = df['F20'].dropna()
spring_2021 = df['S21'].dropna()
t_statistic, p_value = ttest_ind(fall_2020, spring_2021)

print(f"t-statistic: {t_statistic:.3f}")
print(f"p-value: {p_value}")

# Verdict at the 5% significance level
print(
    "There is a significant difference between the two groups."
    if p_value < 0.05
    else "There is no significant difference between the two groups."
)

Two samples t-test for the difference in means between 
 Fall'2020 vs Spring'2021 semesters
t-statistic: 1.002
p-value: 0.31746969308656964
There is no significant difference between the two groups.


In [8]:
# Academic year 2021/22: Fall 2021 vs Spring 2022
print("Two samples t-test for the difference in means between \n Fall'2021 vs Spring'2022 semesters")

# Independent two-sample t-test with scipy's default settings
fall_2021 = df['F21'].dropna()
spring_2022 = df['S22'].dropna()
t_statistic, p_value = ttest_ind(fall_2021, spring_2022)

print(f"t-statistic: {t_statistic:.3f}")
print(f"p-value: {p_value}")

# Verdict at the 5% significance level
print(
    "There is a significant difference between the two groups."
    if p_value < 0.05
    else "There is no significant difference between the two groups."
)

Two samples t-test for the difference in means between 
 Fall'2021 vs Spring'2022 semesters
t-statistic: 2.969
p-value: 0.0032813878632524684
There is a significant difference between the two groups.


In [9]:
# Academic year 2022/23: Fall 2022 vs Spring 2023
print("Two samples t-test for the difference in means between \n Fall'2022 vs Spring'2023 semesters")

# Independent two-sample t-test with scipy's default settings
fall_2022 = df['F22'].dropna()
spring_2023 = df['S23'].dropna()
t_statistic, p_value = ttest_ind(fall_2022, spring_2023)

print(f"t-statistic: {t_statistic:.3f}")
print(f"p-value: {p_value}")

# Verdict at the 5% significance level
print(
    "There is a significant difference between the two groups."
    if p_value < 0.05
    else "There is no significant difference between the two groups."
)

Two samples t-test for the difference in means between 
 Fall'2022 vs Spring'2023 semesters
t-statistic: 4.036
p-value: 7.32183301317173e-05
There is a significant difference between the two groups.


In [10]:
# Academic year 2023/24: Fall 2023 vs Spring 2024
print("Two samples t-test for the difference in means between \n Fall'2023 vs Spring'2024 semesters")

# Independent two-sample t-test with scipy's default settings
fall_2023 = df['F23'].dropna()
spring_2024 = df['S24'].dropna()
t_statistic, p_value = ttest_ind(fall_2023, spring_2024)

print(f"t-statistic: {t_statistic:.3f}")
print(f"p-value: {p_value}")

# Verdict at the 5% significance level
print(
    "There is a significant difference between the two groups."
    if p_value < 0.05
    else "There is no significant difference between the two groups."
)

Two samples t-test for the difference in means between 
 Fall'2023 vs Spring'2024 semesters
t-statistic: 5.090
p-value: 7.1307097574539e-07
There is a significant difference between the two groups.
