In [518]:
import pandas as pd
import numpy as np

df = pd.read_csv('dataset/student_scores_final_clean.csv')

In [519]:
behavior_cols = ['part_time_job', 'absence_days', 'extracurricular_activities', 'weekly_self_study_hours']
# behavior_cols = ['weekly_self_study_hours']

career_cols = [col for col in df.columns if 'career_' in col]

score_cols = ['math_score', 'history_score', 'physics_score', 'chemistry_score', 'biology_score', 'english_score', 'geography_score']

df['average_score'] = df[score_cols].mean(axis=1)

df

Unnamed: 0,id,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score,career_Accountant,career_Artist,career_Banker,career_Business Owner,career_Construction Engineer,career_Designer,career_Doctor,career_Game Developer,career_Government Officer,career_Lawyer,career_Real Estate Developer,career_Software Engineer,career_Stock Investor,career_Teacher,career_Unknown,average_score
0,1,1,0,0.3,0,0.54,0.550000,0.62,0.86,0.94,0.471429,0.612245,0.675,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.675525
1,2,0,0,0.2,0,0.94,0.833333,0.72,0.92,1.00,0.857143,0.775510,0.750,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.836569
2,3,0,0,0.9,1,0.26,0.683333,0.94,0.90,0.92,0.500000,0.551020,0.850,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0.763479
3,4,0,0,0.5,0,0.06,0.516667,0.48,0.76,0.60,0.842857,0.265306,0.650,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.587833
4,5,1,0,0.5,0,0.20,0.733333,0.54,0.30,0.30,0.714286,0.489796,0.400,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.496774
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1834,1995,1,0,0.1,0,0.38,0.600000,0.24,0.80,0.64,0.900000,0.428571,0.025,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.519082
1835,1996,1,0,0.2,0,0.60,0.716667,0.54,0.68,0.46,0.642857,0.693878,0.550,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.611914
1836,1997,1,0,0.2,0,0.40,0.816667,0.30,0.46,0.60,0.814286,0.346939,0.325,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.523270
1837,1998,0,0,0.5,0,0.28,0.950000,0.70,0.26,0.86,0.542857,0.897959,0.450,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.665831


In [520]:
import statsmodels.api as sm

X = sm.add_constant(df[behavior_cols + career_cols])

y = df['average_score']

model = sm.OLS(y, X).fit()
results_df = model.summary2().tables[1]

print("【全部學生 OLS 結果】")
print(results_df)

most_influential = results_df.drop('const')['Coef.'].abs().idxmax()
print(f"\n>> 全部學生中，影響最大且顯著的行為是：{most_influential}")


【全部學生 OLS 結果】
                                 Coef.  Std.Err.  ...    [0.025    0.975]
const                         0.638947  0.014655  ...  0.610204  0.667690
part_time_job                -0.006193  0.006755  ... -0.019442  0.007056
absence_days                  0.000700  0.010472  ... -0.019839  0.021240
extracurricular_activities   -0.000496  0.005363  ... -0.011014  0.010022
weekly_self_study_hours       0.138266  0.015970  ...  0.106946  0.169587
career_Accountant            -0.082238  0.014107  ... -0.109906 -0.054569
career_Artist                -0.020796  0.017725  ... -0.055559  0.013968
career_Banker                -0.050876  0.013167  ... -0.076700 -0.025053
career_Business Owner        -0.115426  0.015893  ... -0.146597 -0.084256
career_Construction Engineer -0.023658  0.015701  ... -0.054453  0.007136
career_Designer              -0.023044  0.016849  ... -0.056090  0.010001
career_Doctor                 0.059101  0.013904  ...  0.031831  0.086371
career_Game Developer   

In [521]:
X_male = sm.add_constant(df[df['gender'] == 1][behavior_cols + career_cols])

y_male = df[df['gender'] == 1]['average_score']

model_male = sm.OLS(y_male, X_male).fit()
results_df = model_male.summary2().tables[1]

print("【男生 OLS 結果】")
print(results_df)

most_influential_male = results_df.drop('const')['Coef.'].abs().idxmax()
print(f"\n>> 男生中，影響最大且顯著的行為是：{most_influential_male}")

【男生 OLS 結果】
                                 Coef.  Std.Err.  ...    [0.025    0.975]
const                         0.617783  0.019898  ...  0.578731  0.656835
part_time_job                -0.012453  0.009417  ... -0.030935  0.006029
absence_days                 -0.028429  0.014326  ... -0.056546 -0.000313
extracurricular_activities   -0.002394  0.007276  ... -0.016673  0.011886
weekly_self_study_hours       0.143575  0.021169  ...  0.102029  0.185121
career_Accountant            -0.055192  0.019265  ... -0.093001 -0.017383
career_Artist                 0.002087  0.025427  ... -0.047816  0.051990
career_Banker                -0.008684  0.018732  ... -0.045447  0.028079
career_Business Owner        -0.079563  0.021703  ... -0.122158 -0.036969
career_Construction Engineer  0.004698  0.018495  ... -0.031601  0.040997
career_Designer              -0.000549  0.023912  ... -0.047479  0.046381
career_Doctor                 0.089423  0.018702  ...  0.052718  0.126128
career_Game Developer     

  return np.sqrt(eigvals[0]/eigvals[-1])


In [None]:
X_female = sm.add_constant(df[df['gender'] == 0][behavior_cols + career_cols])

y_female = df[df['gender'] == 0]['average_score']

model_female = sm.OLS(y_female, X_female).fit()
results_df = model_female.summary2().tables[1]

print("【女生 OLS 結果】")
print(results_df)

most_influential_female = results_df.drop('const')['Coef.'].abs().idxmax()
print(f"\n>> 女生中，影響最大且顯著的行為是：{most_influential_female}")

【女生 OLS 結果】
                                     Coef.  ...        0.975]
const                         6.607836e-01  ...  7.028923e-01
part_time_job                -2.895126e-03  ...  1.611117e-02
absence_days                  3.068924e-02  ...  6.067664e-02
extracurricular_activities    1.873567e-03  ...  1.733608e-02
weekly_self_study_hours       1.295112e-01  ...  1.766979e-01
career_Accountant            -1.096406e-01  ... -6.942662e-02
career_Artist                -4.481303e-02  ...  4.183972e-03
career_Banker                -8.829589e-02  ... -5.190654e-02
career_Business Owner        -1.535322e-01  ... -1.081072e-01
career_Construction Engineer -5.370141e-17  ... -2.685451e-17
career_Designer              -4.861454e-02  ... -2.103051e-03
career_Doctor                 2.853289e-02  ...  6.893201e-02
career_Game Developer        -9.105665e-03  ...  4.249768e-02
career_Government Officer    -7.413147e-02  ... -2.471666e-02
career_Lawyer                -5.072088e-02  ... -1.228896e

XGBoost

In [523]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
X = df[behavior_cols + career_cols]
X = df[df['gender'] == 1][behavior_cols + career_cols] # male
X = df[df['gender'] == 0][behavior_cols + career_cols] # female

y = df['average_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)


In [525]:
from xgboost import XGBRegressor

model = XGBRegressor(objective='reg:squarederror', random_state=42)

model.fit(X_train, y_train)


In [526]:
y_pred = model.predict(X_test)

accuracy = np.mean((np.abs(y_pred - y_test)) <= 5)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'accuracy: {accuracy:.2f}')
print(f'MSE: {mse:.2f}')
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f'R-squared: {r2:.2f}')


accuracy: 1.00
MSE: 0.01
RMSE: 0.10
MAE: 0.08
R-squared: 0.16


In [527]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=100, random_state=42)

model.fit(X_train, y_train)

In [528]:
y_pred = model.predict(X_test)

accuracy = np.mean((np.abs(y_pred - y_test)) <= 5)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'accuracy: {accuracy:.2f}')
print(f'MSE: {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'MAE: {mae:.2f}')
print(f'R-squared: {r2:.2f}')

accuracy: 1.00
MSE: 0.01
RMSE: 0.10
MAE: 0.07
R-squared: 0.25


In [529]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

model.fit(X_train, y_train)

In [530]:
y_pred = model.predict(X_test)

accuracy = np.mean((np.abs(y_pred - y_test)) <= 5)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'accuracy: {accuracy:.2f}')
print(f'MSE: {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'MAE: {mae:.2f}')
print(f'R-squared: {r2:.2f}')

accuracy: 1.00
MSE: 0.01
RMSE: 0.09
MAE: 0.07
R-squared: 0.29


In [532]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

df = pd.read_csv('dataset/student_scores_clean.csv')

behavior_cols = ['absence_days', 'weekly_self_study_hours']
score_cols = ['math_score', 'history_score', 'physics_score', 'chemistry_score', 'biology_score', 'english_score', 'geography_score']
career_cols = [col for col in df.columns if 'career_' in col]

# # 正規化成績欄位
# scaler = MinMaxScaler()
# df[score_cols] = scaler.fit_transform(df[score_cols])

# # 正規化行為欄位
# df[behavior_cols] = scaler.fit_transform(df[behavior_cols])

# 移除個人成績標準差 < 20 的人
df['score_std_per_student'] = df[score_cols].std(axis=1)
df = df[df['score_std_per_student'] <= 20].copy()
df.drop(columns=['score_std_per_student'], inplace=True)

# 移除人數 < 10 的職業志向
career_counts = df[career_cols].sum()
valid_career_cols = career_counts[career_counts >= 50].index.tolist()
cols_to_drop = list(set(career_cols) - set(valid_career_cols))
df.drop(columns=cols_to_drop, inplace=True)

# 移除成績離群值
outlier_indices = set()
for col in score_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)].index
    outlier_indices.update(outliers)

df = df.drop(index=outlier_indices)

# 移除行為離群值
outlier_indices = set()
for col in behavior_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)].index
    outlier_indices.update(outliers)

df = df.drop(index=outlier_indices)

df.to_csv('dataset/student_scores_final_clean.csv', index=False)

print(f"原始筆數: {len(df)}, 篩選後筆數: {len(df)}")

df


原始筆數: 1832, 篩選後筆數: 1832


Unnamed: 0,id,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score,career_Accountant,career_Artist,career_Banker,career_Business Owner,career_Construction Engineer,career_Designer,career_Doctor,career_Game Developer,career_Government Officer,career_Lawyer,career_Real Estate Developer,career_Software Engineer,career_Stock Investor,career_Teacher,career_Unknown
0,1,1,0,3,0,27,73,81,93,97,63,80,87,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,2,0,0,2,0,47,90,86,96,100,90,88,90,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,3,0,0,9,1,13,81,97,95,96,65,77,94,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,4,0,0,5,0,3,71,74,88,80,89,63,86,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,1,0,5,0,10,84,77,65,65,80,74,76,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1994,1995,1,0,1,0,19,76,62,90,82,93,71,61,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1995,1996,1,0,2,0,30,83,77,84,73,75,84,82,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1996,1997,1,0,2,0,20,89,65,73,80,87,67,73,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1997,1998,0,0,5,0,14,97,85,63,93,68,94,78,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
