## Modelling

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Directory holding the prepared CSV files. The default is the location the
# notebook was originally run from; set the DTRA_DATA_DIR environment variable
# to point at the data on another machine instead of editing the paths below.
DATA_DIR = Path(os.environ.get('DTRA_DATA_DIR', '/Users/okuran/Desktop/ML/DTRA'))

# Training rows (these include the 'attendance' label used later) and the
# test rows that the submission files are generated for.
train_df = pd.read_csv(DATA_DIR / 'train_df.csv')
test_df = pd.read_csv(DATA_DIR / 'test_df.csv')

In [2]:
# Feature columns fed to the models, and the name of the regression target
features = [
    'home_team', 'away_team', 'year', 'month', 'day_of_week',
    'weather_simple', 'kick_off_hour', 'capacity', 'is_holiday',
    'temperature', 'humidity', 'broadcast_num', 'section_encoded',
    'round_encoded', 'pop_match', 'pop_home', 'derby_match',
    'is_weekend', 'is_night_game',
]
label = 'attendance'

# One-hot encode the object/string/category-typed feature columns
# (pd.get_dummies leaves the other columns untouched)
train_df_encoded = pd.get_dummies(train_df[features])
test_df_encoded = pd.get_dummies(test_df[features])

# Give the test frame exactly the training columns: with join='left', dummy
# columns that only occur in the test frame are dropped, and training columns
# missing from the test frame are added and filled with 0
train_df_encoded, test_df_encoded = train_df_encoded.align(test_df_encoded, join='left', axis=1, fill_value=0)

# Persist the encoded frames; index=False is the documented way to omit the
# row index (the previous index=None only worked because None is falsy)
train_df_encoded.to_csv("train_df_encoded.csv", index=False)
test_df_encoded.to_csv("test_df_encoded.csv", index=False)

# Missing-value check: both totals are expected to be 0. They are still
# printed, and now also enforced so a violation stops the run here instead of
# surfacing later inside a model fit.
n_missing_train = train_df_encoded.isnull().sum().sum()
n_missing_test = test_df_encoded.isnull().sum().sum()
print(n_missing_train)
print(n_missing_test)
assert n_missing_train == 0, "encoded training features contain missing values"
assert n_missing_test == 0, "encoded test features contain missing values"


0
0


In [3]:
from sklearn.preprocessing import StandardScaler

# Target column for the regression
y = train_df[label]

# Hold out 20% of the encoded training rows as a validation split
X_train, X_val, y_train, y_val = train_test_split(
    train_df_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to the training, validation and test matrices
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_encoded_scaled = scaler.transform(test_df_encoded)



### Linear Regression

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Linear regression baseline, fitted on the scaled training split
model = LinearRegression().fit(X_train_scaled, y_train)

# Predictions for the held-out validation rows
y_pred = model.predict(X_val_scaled)

# Validation error: square root of the mean squared error
val_mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(val_mse)
print("Validation RMSE:", rmse)

Validation RMSE: 4648.653541015154


### Random Forest

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Random forest with 100 trees and a fixed seed, fitted on the scaled
# training split
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
rf_model.fit(X_train_scaled, y_train)

# Predictions for the held-out validation rows
y_pred_rf = rf_model.predict(X_val_scaled)

# Validation error: square root of the mean squared error
val_mse_rf = mean_squared_error(y_val, y_pred_rf)
rmse_rf = np.sqrt(val_mse_rf)
print("Random Forest Validation RMSE:", rmse_rf)


Random Forest Validation RMSE: 4004.7497155047326


In [6]:
# Score the scaled test matrix with the fitted random forest
test_predictions_rf = rf_model.predict(test_encoded_scaled)

# Two-column submission table: the test rows' id next to the prediction
submission_rf = test_df[['id']].assign(attendance=test_predictions_rf)

# Write the table without the row index and without a header row
submission_rf.to_csv('submission_rf.csv', header=False, index=False)

### Gradient Boosting

In [7]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with 100 boosting stages and a fixed seed, fitted on the
# scaled training split
gb_model = GradientBoostingRegressor(
    n_estimators=100,
    random_state=42,
)
gb_model.fit(X_train_scaled, y_train)

# Predictions for the held-out validation rows
y_pred_gb = gb_model.predict(X_val_scaled)

# Validation error: square root of the mean squared error
val_mse_gb = mean_squared_error(y_val, y_pred_gb)
rmse_gb = np.sqrt(val_mse_gb)
print("Gradient Boosting Validation RMSE:", rmse_gb)


Gradient Boosting Validation RMSE: 4025.49415532306


### GridSearch

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

# Hyper-parameter grid: 3 x 3 x 3 = 27 candidate settings
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

# Base estimator handed to the search.
# NOTE: this rebinds gb_model, replacing the fitted model from the
# gradient-boosting cell above with an unfitted one.
gb_model = GradientBoostingRegressor(random_state=42)

# Exhaustive search over the grid with 5-fold cross-validation on the
# training split, scored by negative MSE (higher is better), using all cores
grid_search = GridSearchCV(gb_model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Best parameter combination found by the search
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# With the default refit=True, GridSearchCV has already refitted
# best_estimator_ on the whole of (X_train_scaled, y_train) using the best
# parameters, so it is used as-is; fitting it a second time only repeats
# that work.
best_model = grid_search.best_estimator_

# Evaluate the tuned model on the held-out validation rows.
# NOTE: y_pred and rmse overwrite the values from the linear-regression cell.
y_pred = best_model.predict(X_val_scaled)

rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print("Validation RMSE:", rmse)

In [None]:
# Score the scaled test matrix with the tuned gradient-boosting model
test_predictions = best_model.predict(test_encoded_scaled)

# Two-column submission table: the test rows' id next to the prediction
submission_gs = test_df[['id']].assign(attendance=test_predictions)

# Write the table without the row index and without a header row
submission_gs.to_csv('submission_gs.csv', header=False, index=False)

### XGBoost

In [6]:
from xgboost import XGBRegressor

# XGBoost regressor: 100 trees of depth 5, learning rate 0.1, fixed seed
xgb_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)

# Fit on the scaled training split
xgb_model.fit(X_train_scaled, y_train)

# Predictions for the held-out validation rows
y_pred_xgb = xgb_model.predict(X_val_scaled)

# Validation error: square root of the mean squared error
val_mse_xgb = mean_squared_error(y_val, y_pred_xgb)
rmse_xgb = np.sqrt(val_mse_xgb)
print("XGBoost Validation RMSE:", rmse_xgb)

XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Users/okuran/Library/Python/3.8/lib/python/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib\n  Referenced from: <A860C87F-F0F6-32B7-824B-DC2534A91D6C> /Users/okuran/Library/Python/3.8/lib/python/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/usr/local/opt/libomp/lib/libomp.dylib' (no such file)"]


In [None]:
# Score the scaled test matrix with the fitted XGBoost model
test_predictions_xgb = xgb_model.predict(test_encoded_scaled)

# Two-column submission table: the test rows' id next to the prediction
submission_xgb = test_df[['id']].assign(attendance=test_predictions_xgb)

# Write the table without the row index and without a header row
submission_xgb.to_csv('submission_xgb.csv', header=False, index=False)
