### This notebook uses preprocessed training data to train different models in order
### to compare their performance.

In [1]:
import pandas as pd
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit
import time
from tqdm import tqdm
import xgboost

In [25]:
period_features = pd.read_csv("period_features_glove.csv") # preprocessing using Glove and dropping certain types of tweets

In [26]:
# We drop the non-numerical features and keep the embeddings values for each period
X = period_features.drop(columns=["EventType", "MatchID", "PeriodID", "ID"]).values
# We extract the labels of our training samples
y = period_features["EventType"].values

In [27]:
###### Evaluating on a test set:

# We split our data into a training and test set that we can use to train our classifier without fine-tuning into the
# validation set and without submitting too many times into Kaggle
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

### Logistic regression

In [5]:
period_features_test = pd.read_csv("period_features_test_glove.csv")

# We set up a basic classifier that we train and then calculate the accuracy on our test set
clf = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Test set: ", accuracy_score(y_test, y_pred))

# This time we train our classifier on the full dataset that it is available to us.
clf = LogisticRegression(random_state=42, max_iter=1000).fit(X, y)
# We add a dummy classifier for sanity purposes
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X, y)

X_eval = period_features_test.drop(columns=["MatchID", "PeriodID", "ID"]).values

# Predict using the trained classifiers
preds = clf.predict(X_eval)

# Add predictions to the dataframe
period_features_test["EventType"] = preds

# Prepare the final prediction dataframes
predictions = period_features_test[["ID", "EventType"]]

pred_df = predictions
pred_df.to_csv("logistic_better_preprocessing_predictions.csv", index=False)

Test set:  0.7383177570093458


### Random Forest

In [6]:
period_features_test = pd.read_csv("period_features_test_glove.csv")

# We set up a basic classifier that we train and then calculate the accuracy on our test set
clf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Test set: ", accuracy_score(y_test, y_pred))

# This time we train our classifier on the full dataset that it is available to us.
clf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X, y)

X_eval = period_features_test.drop(columns=["MatchID", "PeriodID", "ID"]).values

# Predict using the trained classifiers
preds = clf.predict(X_eval)

# Add predictions to the dataframe
period_features_test["EventType"] = preds

# Prepare the final prediction dataframes
predictions = period_features_test[["ID", "EventType"]]

pred_df = predictions
pred_df.to_csv("rf_better_preprocessing_predictions.csv", index=False)

Test set:  0.764797507788162


### SVM

In [28]:
# Load test dataset
period_features_test = pd.read_csv("period_features_test_glove.csv")

# Train the SVM classifier on the train set
svm_clf = SVC(random_state=42, kernel='rbf', probability=True).fit(X_train, y_train)

# Test set prediction and evaluation
y_pred = svm_clf.predict(X_test)
print("Test set accuracy (SVM):", accuracy_score(y_test, y_pred))

from sklearn.model_selection import cross_val_score

scores = cross_val_score(svm_clf, X_train, y_train, cv=5)
print("Cross-validation scores:", scores)
print("Mean CV score:", scores.mean())

from sklearn.metrics import confusion_matrix, classification_report

y_pred = svm_clf.predict(X_test)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Train the SVM classifier on the full dataset
svm_clf = SVC(random_state=42, kernel='rbf', probability=True).fit(X, y)

# Add a dummy classifier for sanity purposes
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X, y)

# Prepare the evaluation dataset
X_eval = period_features_test.drop(columns=["MatchID", "PeriodID", "ID"]).values

# Predict using the trained SVM classifier
svm_preds = svm_clf.predict(X_eval)

# Add predictions to the dataframe
period_features_test["EventType"] = svm_preds

# Prepare the final prediction dataframe
predictions = period_features_test[["ID", "EventType"]]

# Save predictions to a CSV file
pred_df = predictions
pred_df.to_csv("svm_bert_predictions_glove.csv", index=False)

Test set accuracy (SVM): 0.6931464174454829
Cross-validation scores: [0.66889632 0.62876254 0.66555184 0.65886288 0.62541806]
Mean CV score: 0.6494983277591974
Confusion Matrix:
[[181 119]
 [ 78 264]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.70      0.60      0.65       300
         1.0       0.69      0.77      0.73       342

    accuracy                           0.69       642
   macro avg       0.69      0.69      0.69       642
weighted avg       0.69      0.69      0.69       642



In [37]:
# Import BaggingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC

# Load test dataset
period_features_test = pd.read_csv("period_features_test_glove.csv")

# Create base SVM classifier
base_svm = SVC(random_state=42, kernel='rbf', probability=True)

# Create bagged SVM classifier
bagged_svm = BaggingClassifier(
    estimator=base_svm,
    n_estimators=70,  # you can adjust this number
    max_samples=0.8,  # you can adjust this fraction
    random_state=42
)

# Train the bagged SVM classifier
bagged_svm.fit(X_train, y_train)

# Test set prediction and evaluation
y_pred = bagged_svm.predict(X_test)
print("Test set accuracy (Bagged SVM):", accuracy_score(y_test, y_pred))

# Cross-validation
scores = cross_val_score(bagged_svm, X_train, y_train, cv=5)
print("Cross-validation scores:", scores)
print("Mean CV score:", scores.mean())

# Confusion matrix and classification report
y_pred = bagged_svm.predict(X_test)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Train the bagged SVM classifier on the full dataset
bagged_svm = BaggingClassifier(
    estimator=base_svm,
    n_estimators=70,
    max_samples=0.8,
    random_state=42
).fit(X, y)

# Add a dummy classifier for sanity purposes
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X, y)

# Prepare the evaluation dataset
X_eval = period_features_test.drop(columns=["MatchID", "PeriodID", "ID"]).values

# Predict using the trained bagged SVM classifier
svm_preds = bagged_svm.predict(X_eval)

# Add predictions to the dataframe
period_features_test["EventType"] = svm_preds

# Prepare the final prediction dataframe
predictions = period_features_test[["ID", "EventType"]]

# Save predictions to a CSV file
pred_df = predictions
pred_df.to_csv("bagged_svm_rbf_predictions.csv", index=False)

Test set accuracy (Bagged SVM): 0.6791277258566978
Cross-validation scores: [0.6722408  0.6187291  0.65217391 0.63879599 0.61538462]
Mean CV score: 0.6394648829431439
Confusion Matrix:
[[181 119]
 [ 87 255]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.68      0.60      0.64       300
         1.0       0.68      0.75      0.71       342

    accuracy                           0.68       642
   macro avg       0.68      0.67      0.67       642
weighted avg       0.68      0.68      0.68       642



### XGboost

In [20]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.dummy import DummyClassifier

# Load test dataset
period_features_test = pd.read_csv("period_features_test_glove.csv")

# Train the XGBoost classifier on the train set
xgb_clf = XGBClassifier(
    random_state=42,
    learning_rate=0.05,  # Reduced
    n_estimators=200,    # Increased
    max_depth=3,         # Reduced to prevent overfitting
    min_child_weight=3,  # Helps with overfitting
    subsample=0.8,       # Use 80% of data per tree
    colsample_bytree=0.8 # Use 80% of features per tree
).fit(X_train, y_train)

# Test set prediction and evaluation
y_pred = xgb_clf.predict(X_test)
print("Test set accuracy (XGBoost):", accuracy_score(y_test, y_pred))

from sklearn.model_selection import cross_val_score

scores = cross_val_score(xgb_clf, X_train, y_train, cv=5)
print("Cross-validation scores:", scores)
print("Mean CV score:", scores.mean())

from sklearn.metrics import confusion_matrix, classification_report

y_pred = xgb_clf.predict(X_test)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Prepare the evaluation dataset
X_eval = period_features_test.drop(columns=["MatchID", "PeriodID", "ID"]).values

# Predict using the trained XGBoost classifier
xgb_preds = xgb_clf.predict(X_eval).astype(float)

# Add predictions to the dataframe
period_features_test["EventType"] = xgb_preds

# Prepare the final prediction dataframe
predictions = period_features_test[["ID", "EventType"]]

# Save predictions to a CSV file
pred_df = predictions
pred_df.to_csv("xgboost_predictions.csv", index=False)

# Optional: Print feature importance
feature_importance = pd.DataFrame({
    'feature': range(X.shape[1]),
    'importance': xgb_clf.feature_importances_
})
print("\nTop 10 most important features:")
print(feature_importance.sort_values('importance', ascending=False).head(10))

Test set accuracy (XGBoost): 0.7757009345794392
Cross-validation scores: [0.74916388 0.71571906 0.74916388 0.74247492 0.7458194 ]
Mean CV score: 0.7404682274247492
Confusion Matrix:
[[226  74]
 [ 70 272]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.76      0.75      0.76       300
         1.0       0.79      0.80      0.79       342

    accuracy                           0.78       642
   macro avg       0.77      0.77      0.77       642
weighted avg       0.78      0.78      0.78       642



ValueError: Feature shape mismatch, expected: 768, got 200