In [2]:
import numpy as np
import pandas as pd
import statistics as st
from scipy import stats
import random
import math

# 近年碩士人數變化

load dataset

In [74]:
# GitHub 檔案路徑
url = "https://raw.githubusercontent.com/YiHsiu7893/Statistics_Final/refs/heads/main/data/graduates.csv"

df = pd.read_csv(url)
df.head()

Unnamed: 0,Year,School,gradTotal,Type,Ownership
0,106,世新大學,202,General,Private
1,106,中信金融管理學院,14,General,Private
2,106,中原大學,610,General,Private
3,106,中國文化大學,298,General,Private
4,106,中國科技大學,47,Tech,Private


split into four categories

(general, public), (general, private), (tech, public), (tech, private)

In [75]:
general_public = df[(df['Type'] == 'General') & (df['Ownership'] == 'Public')]
general_private = df[(df['Type'] == 'General') & (df['Ownership'] == 'Private')]
tech_public = df[(df['Type'] == 'Tech') & (df['Ownership'] == 'Public')]
tech_private = df[(df['Type'] == 'Tech') & (df['Ownership'] == 'Private')]

print("number of schools in each category")
print("general, public：", len(general_public['School'].unique()))
print("general, private：", len(general_private['School'].unique()))
print("tech, public：", len(tech_public['School'].unique()))
print("tech, private：", len(tech_private['School'].unique()))

number of schools in each category
general, public： 31
general, private： 37
tech, public： 12
tech, private： 55


### if n >= 30 -> perform 1-sided, 2-sample z-test

In [76]:
def z_test (data):
  df_106 = data[data['Year'] == 106]
  df_111 = data[data['Year'] == 111]
  df_n = len(data['School'].unique())

  print("1. Analyze Statistical Measures")
  print("Calculate the average total number of master's graduates per school for the 106th and 111th academic years, µ_106 & µ_111")

  df_106_mean = df_106['gradTotal'].mean()
  df_111_mean = df_111['gradTotal'].mean()

  df_106_std = df_106['gradTotal'].std()
  df_111_std = df_111['gradTotal'].std()

  print("number of schools:", df_n)
  print('mean number of master\'s graduates in year 106: ', df_106_mean)
  print('mean number of master\'s graduates in year 111: ', df_111_mean)
  print('std of master\'s graduates in year 106: ', df_106_std)
  print('std of master\'s graduates in year 111: ', df_111_std)


  print("\n2. Hypothesis")
  print("H0: µ106 - µ111 ≧ 0")
  print("Ha: µ106 - µ111 < 0")


  print("\n3. Perform 1-sided, 2-sample z-test")
  z = (df_106_mean - df_111_mean) / math.sqrt(df_106_std**2 / df_n + df_111_std**2 / df_n)
  print("the test statistic is ", z)

  z_alpha = -1.65
  if z < z_alpha:
    print(f"Since the observed value of the test statistic falls in the rejection region (z < -1.65), H0 is rejected.")
    print("There is sufficient evidence to indicate that there is a difference in the mean number of students in year 106 and year 111 at alpha = 0.05.")
  else:
    print(f"Since the observed value of the test statistic does not fall in the rejection region (z >= -1.65), H0 is not rejected.")
    print("There is insufficient evidence to indicate that there is a difference in the mean number of students in year 106 and year 111 at alpha = 0.05.")

### if n < 30, variance unknown -> perform 1-sided, 2-sample t-test

1. use F-test to check if the sample variance are the same

In [77]:
def F_test(std1, std2, n):
  print("H0: the variances are the same, Ha: the variances are different")
  F = std1**2 / std2**2
  p_value = stats.f.cdf(F, n-1, n-1)

  if p_value < 0.025:
    print(f"Since the p-value({p_value}) is less than 0.025, H0 is rejected.")
    print("There is sufficient evidence to indicate the master\'s graduates variances for year 106 and year 111 differ at alpha = 0.05")
    return True
  else:
    print(f"Since the p-value({p_value}) is greater than 0.025, H0 is not rejected.")
    print("There is insufficient evidence to indicate the master\'s graduates variances for year 106 and year 111 differ at alpha = 0.05")
    return False

2. perform perform 1-sided, 2-sample

In [78]:
def t_test (data):
  df_106 = data[data['Year'] == 106]
  df_111 = data[data['Year'] == 111]
  df_n = len(data['School'].unique())

  print("1. Analyze Statistical Measures")
  print("Calculate the average total number of master's graduates per school for the 106th and 111th academic years, µ_106 & µ_111")

  df_106_mean = df_106['gradTotal'].mean()
  df_111_mean = df_111['gradTotal'].mean()

  df_106_std = df_106['gradTotal'].std()
  df_111_std = df_111['gradTotal'].std()

  print("number of schools:", df_n)
  print('mean number of master\'s graduates in year 106: ', df_106_mean)
  print('mean number of master\'s graduates in year 111: ', df_111_mean)
  print('std of master\'s graduates in year 106: ', df_106_std)
  print('std of master\'s graduates in year 111: ', df_111_std)


  print("\n2. Hypothesis")
  print("H0: µ106 - µ111 ≧ 0")
  print("Ha: µ106 - µ111 < 0")


  print("\n3. Perform F test to check if the sample variance are the same")
  if F_test(df_106_std, df_111_std, df_n) == True:
    t, p_value = stats.ttest_ind(df_106['gradTotal'], df_111['gradTotal'], equal_var=False, alternative="less")
  else:
    t, p_value = stats.ttest_ind(df_106['gradTotal'], df_111['gradTotal'], equal_var=True, alternative="less")


  print("\n4. Perform 1-sided, 2-sample t-test")
  print(f"the test statistic is {t}, p-value is {p_value}")

  alpha = 0.05
  if p_value < alpha:
    print(f"Since the p-value is less than 0.05, H0 is rejected.")
    print("There is sufficient evidence to indicate that there is a difference in the mean number of master\'s graduates in year 106 and year 111 at alpha = 0.05.")
  else:
    print(f"Since the p-value is greater than 0.05, H0 is not rejected.")
    print("There is insufficient evidence to indicate that there is a difference in the mean number of master\'s graduates in year 106 and year 111 at alpha = 0.05.")

### statistical testing of each category

1. general, public

In [79]:
z_test(general_public)

1. Analyze Statistical Measures
Calculate the average total number of master's graduates per school for the 106th and 111th academic years, µ_106 & µ_111
number of schools: 31
mean number of master's graduates in year 106:  703.5806451612904
mean number of master's graduates in year 111:  720.1290322580645
std of master's graduates in year 106:  826.32571359376
std of master's graduates in year 111:  864.0040023802159

2. Hypothesis
H0: µ106 - µ111 ≧ 0
Ha: µ106 - µ111 < 0

3. Perform 1-sided, 2-sample z-test
the test statistic is  -0.07706766896695796
Since the observed value of the test statistic does not fall in the rejection region (z >= -1.65), H0 is not rejected.
There is insufficient evidence to indicate that there is a difference in the mean number of students in year 106 and year 111 at alpha = 0.05.


2. general, private

In [80]:
z_test(general_private)

1. Analyze Statistical Measures
Calculate the average total number of master's graduates per school for the 106th and 111th academic years, µ_106 & µ_111
number of schools: 37
mean number of master's graduates in year 106:  216.0
mean number of master's graduates in year 111:  196.22222222222223
std of master's graduates in year 106:  181.5986111057999
std of master's graduates in year 111:  172.59285055729296

2. Hypothesis
H0: µ106 - µ111 ≧ 0
Ha: µ106 - µ111 < 0

3. Perform 1-sided, 2-sample z-test
the test statistic is  0.4801920061905414
Since the observed value of the test statistic does not fall in the rejection region (z >= -1.65), H0 is not rejected.
There is insufficient evidence to indicate that there is a difference in the mean number of students in year 106 and year 111 at alpha = 0.05.


3. tech, public

In [81]:
t_test(tech_public)

1. Analyze Statistical Measures
Calculate the average total number of master's graduates per school for the 106th and 111th academic years, µ_106 & µ_111
number of schools: 12
mean number of master's graduates in year 106:  474.0833333333333
mean number of master's graduates in year 111:  511.6666666666667
std of master's graduates in year 106:  471.479963583402
std of master's graduates in year 111:  516.6362063357249

2. Hypothesis
H0: µ106 - µ111 ≧ 0
Ha: µ106 - µ111 < 0

3. Perform F test to check if the sample variance are the same
H0: the variances are the same, Ha: the variances are different
Since the p-value(0.3834861411965651) is greater than 0.025, H0 is not rejected.
There is insufficient evidence to indicate the master's graduates variances for year 106 and year 111 differ at alpha = 0.05

4. Perform 1-sided, 2-sample t-test
the test statistic is -0.1861400759576133, p-value is 0.42702029127984
Since the p-value is greater than 0.05, H0 is not rejected.
There is insufficien

4. tech, private

In [82]:
z_test(tech_private)

1. Analyze Statistical Measures
Calculate the average total number of master's graduates per school for the 106th and 111th academic years, µ_106 & µ_111
number of schools: 55
mean number of master's graduates in year 106:  55.96296296296296
mean number of master's graduates in year 111:  50.89795918367347
std of master's graduates in year 106:  59.123140750363405
std of master's graduates in year 111:  51.61001392573893

2. Hypothesis
H0: µ106 - µ111 ≧ 0
Ha: µ106 - µ111 < 0

3. Perform 1-sided, 2-sample z-test
the test statistic is  0.4786313099990561
Since the observed value of the test statistic does not fall in the rejection region (z >= -1.65), H0 is not rejected.
There is insufficient evidence to indicate that there is a difference in the mean number of students in year 106 and year 111 at alpha = 0.05.


# 近年資訊工程女性學生比例變化

In [83]:
# GitHub 檔案路徑
url = "https://raw.githubusercontent.com/YiHsiu7893/Statistics_Final/refs/heads/main/data/cs_students.csv"

df = pd.read_csv(url)
df.head()

Unnamed: 0,Year,School,Total,Male,Female,Type,Ownership
0,107,國立清華大學,584,445,139,General,Public
1,107,國立臺灣大學,554,485,69,General,Public
2,107,國立臺灣師範大學,201,161,40,General,Public
3,107,國立成功大學,478,364,114,General,Public
4,107,國立交通大學,781,629,152,General,Public


split into four categories

(general, public), (general, private), (tech, public), (tech, private)

In [84]:
general_public = df[(df['Type'] == 'General') & (df['Ownership'] == 'Public')]
general_private = df[(df['Type'] == 'General') & (df['Ownership'] == 'Private')]
tech_public = df[(df['Type'] == 'Tech') & (df['Ownership'] == 'Public')]
tech_private = df[(df['Type'] == 'Tech') & (df['Ownership'] == 'Private')]

print("每個類別的學校數")
print("普，公：", len(general_public['School'].unique()))
print("普，私：", len(general_private['School'].unique()))
print("科，公：", len(tech_public['School'].unique()))
print("科，私：", len(tech_private['School'].unique()))

每個類別的學校數
普，公： 23
普，私： 18
科，公： 8
科，私： 20


### if n < 30, variance unknown -> perform 1-sided, 2-sample t-test

1. use F-test to check if the sample variance are the same

In [102]:
def F_test(std1, std2, n):
  print("H0: the variances are the same, Ha: the variances are different")
  F = std1**2 / std2**2
  p_value = stats.f.cdf(F, n-1, n-1)

  if p_value < 0.025:
    print(f"Since the p-value({p_value}) is less than 0.025, H0 is rejected.")
    print("There is sufficient evidence to indicate the variances of female student ratio for year 107 and year 112 differ at alpha = 0.05")
    return True
  else:
    print(f"Since the p-value({p_value}) is greater than 0.025, H0 is not rejected.")
    print("There is insufficient evidence to indicate the variances of female student ratio for year 107 and year 112 differ at alpha = 0.05")
    return False

2. perform perform 1-sided, 2-sample

In [107]:
def t_test (data):
  df_107 = data[data['Year'] == 107]
  df_112 = data[data['Year'] == 112]
  df_n = len(data['School'].unique())

  print("Analyze Statistical Measures")
  print("Calculate the average female student ratio (µ107 & µ112) for each school in the academic years 107 and 112")

  df_107_mean = (df_107['Female'] / df_107['Total']).mean()
  df_112_mean = (df_112['Female'] / df_112['Total']).mean()

  df_107_std = (df_107['Female'] / df_107['Total']).std()
  df_112_std = (df_112['Female'] / df_112['Total']).std()

  print("number of schools:", df_n)
  print('mean number of master\'s graduates in year 107: ', df_107_mean)
  print('mean number of master\'s graduates in year 112: ', df_112_mean)
  print('std of master\'s graduates in year 107: ', df_107_std)
  print('std of master\'s graduates in year 112: ', df_112_std)

  print("\n2. Hypothesis")
  print("H0: µ107 - µ112 ≧ 0")
  print("Ha: µ107 - µ112 < 0")


  print("\n3. Perform F test to check if the sample variance are the same")
  if F_test(df_107_std, df_112_std, df_n) == True:
    t, p_value = stats.ttest_ind(df_107['Total'], df_112['Total'], equal_var=False, alternative="less")
  else:
    t, p_value = stats.ttest_ind(df_107['Total'], df_112['Total'], equal_var=True, alternative="less")


  print("\n4. Perform 1-sided, 2-sample t-test")
  print(f"the test statistic is {t}, p-value is {p_value}")

  alpha = 0.05
  if p_value < alpha:
    print(f"Since the p-value is less than 0.05, H0 is rejected.")
    print("There is sufficient evidence to indicate that there is a difference in the mean number of female student ratio in year 107 and year 112 at alpha = 0.05.")
  else:
    print(f"Since the p-value is greater than 0.05, H0 is not rejected.")
    print("There is insufficient evidence to indicate that there is a difference in the mean number of female student ratio in year 107 and year 112 at alpha = 0.05.")

### statistical testing of each category

1. general, public

In [108]:
t_test(general_public)

Analyze Statistical Measures
Calculate the average female student ratio (µ107 & µ112) for each school in the academic years 107 and 112
number of schools: 23
mean number of master's graduates in year 107:  0.18806324212459577
mean number of master's graduates in year 112:  0.22024074892477066
std of master's graduates in year 107:  0.03854424961559331
std of master's graduates in year 112:  0.04599962261965814

2. Hypothesis
H0: µ107 - µ112 ≧ 0
Ha: µ107 - µ112 < 0

3. Perform F test to check if the sample variance are the same
H0: the variances are the same, Ha: the variances are different
Since the p-value(0.20670050418011424) is greater than 0.025, H0 is not rejected.
There is insufficient evidence to indicate the variances of female student ratio for year 107 and year 112 differ at alpha = 0.05

4. Perform 1-sided, 2-sample t-test
the test statistic is -0.5840523289455661, p-value is 0.2811183693684919
Since the p-value is greater than 0.05, H0 is not rejected.
There is insufficient

2. general, private

In [109]:
t_test(general_private)

Analyze Statistical Measures
Calculate the average female student ratio (µ107 & µ112) for each school in the academic years 107 and 112
number of schools: 18
mean number of master's graduates in year 107:  0.16819386337942302
mean number of master's graduates in year 112:  0.21379035340886318
std of master's graduates in year 107:  0.05409858482773544
std of master's graduates in year 112:  0.06390164683957127

2. Hypothesis
H0: µ107 - µ112 ≧ 0
Ha: µ107 - µ112 < 0

3. Perform F test to check if the sample variance are the same
H0: the variances are the same, Ha: the variances are different
Since the p-value(0.24980573357970332) is greater than 0.025, H0 is not rejected.
There is insufficient evidence to indicate the variances of female student ratio for year 107 and year 112 differ at alpha = 0.05

4. Perform 1-sided, 2-sample t-test
the test statistic is -0.5895213050029624, p-value is 0.27970449333992153
Since the p-value is greater than 0.05, H0 is not rejected.
There is insufficien

3. tech, public

In [110]:
t_test(tech_public)

Analyze Statistical Measures
Calculate the average female student ratio (µ107 & µ112) for each school in the academic years 107 and 112
number of schools: 8
mean number of master's graduates in year 107:  0.09430542486580698
mean number of master's graduates in year 112:  0.11693152361531407
std of master's graduates in year 107:  0.026743150704289505
std of master's graduates in year 112:  0.032488648970467

2. Hypothesis
H0: µ107 - µ112 ≧ 0
Ha: µ107 - µ112 < 0

3. Perform F test to check if the sample variance are the same
H0: the variances are the same, Ha: the variances are different
Since the p-value(0.3101609452715127) is greater than 0.025, H0 is not rejected.
There is insufficient evidence to indicate the variances of female student ratio for year 107 and year 112 differ at alpha = 0.05

4. Perform 1-sided, 2-sample t-test
the test statistic is -0.7112397782512805, p-value is 0.24475079549365297
Since the p-value is greater than 0.05, H0 is not rejected.
There is insufficient e

4. tech, private

In [111]:
t_test(tech_private)

Analyze Statistical Measures
Calculate the average female student ratio (µ107 & µ112) for each school in the academic years 107 and 112
number of schools: 20
mean number of master's graduates in year 107:  0.06025355802108652
mean number of master's graduates in year 112:  0.091619651103692
std of master's graduates in year 107:  0.032292814336185714
std of master's graduates in year 112:  0.0348183301509321

2. Hypothesis
H0: µ107 - µ112 ≧ 0
Ha: µ107 - µ112 < 0

3. Perform F test to check if the sample variance are the same
H0: the variances are the same, Ha: the variances are different
Since the p-value(0.37305135000264417) is greater than 0.025, H0 is not rejected.
There is insufficient evidence to indicate the variances of female student ratio for year 107 and year 112 differ at alpha = 0.05

4. Perform 1-sided, 2-sample t-test
the test statistic is 0.6727284335979001, p-value is 0.7471665575728981
Since the p-value is greater than 0.05, H0 is not rejected.
There is insufficient ev