### This notebook uses preprocessed training data to train different models in order
### to compare their performance.

In [4]:
import time

import numpy as np
import pandas as pd
import xgboost
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from tqdm import tqdm
from xgboost import XGBClassifier

In [17]:
def analyze_match_tweets(df):
    """
    Analyze tweets for each match with normalized metrics.

    For every match, tweets are bucketed into whole minutes elapsed since the
    match's earliest timestamp. A minute without any tweet produces no row.

    Args:
        df: DataFrame with (at least) the columns ['MatchID', 'Timestamp', 'Tweet'].
            'Timestamp' must be datetime-like, because elapsed time is read
            through ``.dt.total_seconds()``. 'MatchID' values must be convertible
            with ``float()``. The input frame is not modified.

    Returns:
        DataFrame with one row per (match, minute) and the columns
        ['MatchID', 'PeriodID', 'TweetCount', 'EventProportion',
        'tweet_percentage', 'event_percentage']. The two ``*_percentage``
        columns are min-max scaled to [0, 1] within each match, and are 0 when
        every minute of the match has the same value. An input without rows
        yields an empty frame with these columns.
    """
    output_columns = ['MatchID', 'PeriodID', 'TweetCount', 'EventProportion',
                      'tweet_percentage', 'event_percentage']

    # List of key events to track
    key_events = ['goal', 'own goal', 'red card', 'yellow card', 'penalty',
                  'match start', 'match end', 'half time']

    def _mentions_key_event(tweet):
        # Pad with spaces so an event only matches as a space-delimited phrase
        # (case-sensitive, exactly as written in key_events).
        padded = f' {tweet} '
        return any(f' {event} ' in padded for event in key_events)

    def _min_max_scale(values):
        # Scale a Series to [0, 1]; a constant Series maps to the scalar 0.
        low, high = values.min(), values.max()
        return (values - low) / (high - low) if high > low else 0

    results = []

    for match_id, match_data in df.groupby('MatchID'):
        # Whole minutes elapsed since the first tweet of this match
        start_time = match_data['Timestamp'].min()
        elapsed_minutes = (match_data['Timestamp'] - start_time).dt.total_seconds() // 60

        # assign() returns a new frame; writing the column straight into the
        # groupby slice can trigger pandas' SettingWithCopyWarning.
        data_per_minute = match_data.assign(Minute=elapsed_minutes).groupby('Minute')

        minute_data = []
        for minute, group in data_per_minute:
            # Groups yielded by groupby are never empty, so the division is safe.
            event_tweets = sum(_mentions_key_event(tweet) for tweet in group['Tweet'])
            minute_data.append({
                'MatchID': float(match_id),
                'PeriodID': minute,
                'TweetCount': len(group),
                'EventProportion': event_tweets / len(group)
            })

        if not minute_data:
            # No usable timestamp for this match (groupby drops missing keys),
            # so there is nothing to normalize.
            continue

        minute_df = pd.DataFrame(minute_data)

        # Normalize tweet counts and event proportions to [0,1] range
        minute_df['tweet_percentage'] = _min_max_scale(minute_df['TweetCount'])
        minute_df['event_percentage'] = _min_max_scale(minute_df['EventProportion'])

        results.append(minute_df)

    if not results:
        # pd.concat raises on an empty list; return a well-formed empty frame.
        return pd.DataFrame(columns=output_columns)

    # Combine results from all matches
    return pd.concat(results, ignore_index=True)

In [19]:
# Load the preprocessed tweets and parse the millisecond timestamps into datetimes
df = pd.read_csv('preprocessed_tweets.csv').assign(
    Timestamp=lambda tweets: pd.to_datetime(tweets['Timestamp'], unit='ms')
)
# One row per (match, minute) with raw and normalized tweet statistics
final_df = analyze_match_tweets(df)

In [20]:
# Inspect the per-minute statistics returned by analyze_match_tweets
final_df

Unnamed: 0,MatchID,PeriodID,TweetCount,EventProportion,tweet_percentage,event_percentage
0,0.0,0.0,63,0.063492,0.117460,0.150567
1,0.0,1.0,63,0.047619,0.117460,0.112925
2,0.0,2.0,75,0.053333,0.155556,0.126476
3,0.0,3.0,90,0.033333,0.203175,0.079048
4,0.0,4.0,130,0.023077,0.330159,0.054725
...,...,...,...,...,...,...
2144,19.0,125.0,560,0.019643,0.451751,0.052778
2145,19.0,126.0,508,0.027559,0.407344,0.074048
2146,19.0,127.0,465,0.040860,0.370623,0.109786
2147,19.0,128.0,406,0.051724,0.320239,0.138976


In [8]:
# Per-period training features. Per the original note, this file was produced by a
# GloVe-based preprocessing step that also drops certain types of tweets; that step
# is not part of this notebook.
period_features = pd.read_csv("period_features_glove.csv")

In [9]:
# Inspect the loaded per-period features
period_features

Unnamed: 0,MatchID,PeriodID,ID,EventType,0,1,2,3,4,5,...,190,191,192,193,194,195,196,197,198,199
0,0.0,0.0,0_0,0.0,0.091802,0.214838,0.033723,-0.108459,-0.106916,0.133919,...,-0.031619,-0.075651,0.031717,0.022333,-0.094636,0.037390,0.245412,0.092223,-0.007181,0.137786
1,0.0,1.0,0_1,0.0,0.114414,0.210811,0.023566,-0.127374,-0.112024,0.133017,...,-0.024327,-0.037781,0.024221,-0.004945,-0.124407,0.053369,0.249249,0.114808,0.011257,0.118851
2,0.0,2.0,0_2,0.0,0.084405,0.195237,0.064077,-0.152090,-0.065317,0.114473,...,-0.037781,-0.051512,0.030181,0.024942,-0.131769,0.070388,0.238555,0.086125,-0.009607,0.120924
3,0.0,3.0,0_3,0.0,0.085226,0.205044,0.018470,-0.124588,-0.083020,0.140426,...,-0.034552,-0.058735,0.046085,0.009542,-0.107682,0.032268,0.242973,0.096682,-0.000472,0.116347
4,0.0,4.0,0_4,0.0,0.094227,0.212355,-0.029741,-0.111316,-0.118624,0.135703,...,-0.045568,-0.054026,0.025960,0.033239,-0.139287,0.057312,0.235727,0.131300,-0.038934,0.158211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2132,19.0,125.0,19_125,1.0,0.008027,0.273417,-0.008235,-0.195834,0.082377,0.101006,...,-0.046492,-0.037399,0.013002,-0.019155,0.016608,0.013361,0.226146,0.096884,0.004263,0.164308
2133,19.0,126.0,19_126,1.0,-0.011890,0.258186,-0.023810,-0.179704,0.079112,0.107770,...,-0.047332,-0.041637,0.026681,-0.011228,0.020459,0.005865,0.219020,0.106579,0.012994,0.163897
2134,19.0,127.0,19_127,1.0,-0.003920,0.270321,-0.013028,-0.183066,0.076836,0.099952,...,-0.041568,-0.040465,0.031711,-0.012235,0.011195,0.006392,0.228404,0.101799,0.006633,0.158170
2135,19.0,128.0,19_128,1.0,0.017887,0.271453,-0.014517,-0.195398,0.070879,0.104112,...,-0.053858,-0.048433,0.041437,-0.021536,0.023279,0.014279,0.220829,0.103918,0.003919,0.162534


In [21]:
# Left join: keep every row of period_features and attach the per-minute tweet
# statistics sharing its (MatchID, PeriodID) key. Periods with no counterpart in
# final_df get NaN in the added columns.
merged_df = pd.merge(
    period_features,
    final_df,
    on=['MatchID', 'PeriodID'],
    how='left',
    # Raise instead of silently duplicating training rows if final_df ever holds
    # more than one row per (MatchID, PeriodID).
    validate='many_to_one',
)

In [29]:
# Inspect the merged feature table
merged_df

Unnamed: 0,MatchID,PeriodID,ID,EventType,0,1,2,3,4,5,...,194,195,196,197,198,199,TweetCount,EventProportion,tweet_percentage,event_percentage
0,0.0,0.0,0_0,0.0,0.091802,0.214838,0.033723,-0.108459,-0.106916,0.133919,...,-0.094636,0.037390,0.245412,0.092223,-0.007181,0.137786,63,0.063492,0.117460,0.150567
1,0.0,1.0,0_1,0.0,0.114414,0.210811,0.023566,-0.127374,-0.112024,0.133017,...,-0.124407,0.053369,0.249249,0.114808,0.011257,0.118851,63,0.047619,0.117460,0.112925
2,0.0,2.0,0_2,0.0,0.084405,0.195237,0.064077,-0.152090,-0.065317,0.114473,...,-0.131769,0.070388,0.238555,0.086125,-0.009607,0.120924,75,0.053333,0.155556,0.126476
3,0.0,3.0,0_3,0.0,0.085226,0.205044,0.018470,-0.124588,-0.083020,0.140426,...,-0.107682,0.032268,0.242973,0.096682,-0.000472,0.116347,90,0.033333,0.203175,0.079048
4,0.0,4.0,0_4,0.0,0.094227,0.212355,-0.029741,-0.111316,-0.118624,0.135703,...,-0.139287,0.057312,0.235727,0.131300,-0.038934,0.158211,130,0.023077,0.330159,0.054725
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2132,19.0,125.0,19_125,1.0,0.008027,0.273417,-0.008235,-0.195834,0.082377,0.101006,...,0.016608,0.013361,0.226146,0.096884,0.004263,0.164308,560,0.019643,0.451751,0.052778
2133,19.0,126.0,19_126,1.0,-0.011890,0.258186,-0.023810,-0.179704,0.079112,0.107770,...,0.020459,0.005865,0.219020,0.106579,0.012994,0.163897,508,0.027559,0.407344,0.074048
2134,19.0,127.0,19_127,1.0,-0.003920,0.270321,-0.013028,-0.183066,0.076836,0.099952,...,0.011195,0.006392,0.228404,0.101799,0.006633,0.158170,465,0.040860,0.370623,0.109786
2135,19.0,128.0,19_128,1.0,0.017887,0.271453,-0.014517,-0.195398,0.070879,0.104112,...,0.023279,0.014279,0.220829,0.103918,0.003919,0.162534,406,0.051724,0.320239,0.138976


In [30]:
# Columns that are not model inputs: the label, the identifier columns and the
# raw (un-normalized) tweet statistics. Every remaining column -- the embedding
# values plus tweet_percentage and event_percentage -- goes into the feature matrix.
non_feature_columns = ["EventType", "MatchID", "PeriodID", "ID", "TweetCount", "EventProportion"]
X = merged_df.drop(columns=non_feature_columns).values
# Labels of the training samples
y = merged_df["EventType"].values

In [31]:
###### Evaluating on a test set:

# Hold out 30% of the rows so classifiers can be compared locally, without tuning
# against the validation set and without submitting too many times to Kaggle.
# NOTE(review): this is a row-level random split; GroupShuffleSplit is imported but
# unused -- confirm whether periods of one match may land on both sides of the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

### Logistic regression

In [5]:
period_features_test = pd.read_csv("period_features_test_glove.csv")

# Fit on the training split and report accuracy on the held-out split
clf = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Test set: ", accuracy_score(y_test, y_pred))

# Feature matrix of the evaluation file (identifier columns removed)
X_eval = period_features_test.drop(columns=["MatchID", "PeriodID", "ID"]).values

# X is built from merged_df while X_eval comes straight from the test file, so
# their widths can differ. Check before the full refit rather than failing
# inside predict() afterwards.
if X_eval.shape[1] != X.shape[1]:
    raise ValueError(
        f"Feature mismatch: X has {X.shape[1]} columns but "
        f"period_features_test_glove.csv provides {X_eval.shape[1]}; "
        "build both from the same preprocessing before predicting."
    )

# This time we train our classifier on the full dataset that is available to us.
clf = LogisticRegression(random_state=42, max_iter=1000).fit(X, y)
# Majority-class baseline, fitted for sanity purposes (not scored in this cell)
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X, y)

# Predict using the classifier trained on the full dataset
preds = clf.predict(X_eval)

# Add predictions to the dataframe
period_features_test["EventType"] = preds

# Keep only the columns written to the predictions file
predictions = period_features_test[["ID", "EventType"]]

pred_df = predictions
pred_df.to_csv("logistic_better_preprocessing_predictions.csv", index=False)

Test set:  0.7383177570093458


### Random Forest

In [6]:
period_features_test = pd.read_csv("period_features_test_glove.csv")

# Fit on the training split and report accuracy on the held-out split
clf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Test set: ", accuracy_score(y_test, y_pred))

# Feature matrix of the evaluation file (identifier columns removed)
X_eval = period_features_test.drop(columns=["MatchID", "PeriodID", "ID"]).values

# X is built from merged_df while X_eval comes straight from the test file, so
# their widths can differ. Check before the full refit rather than failing
# inside predict() afterwards.
if X_eval.shape[1] != X.shape[1]:
    raise ValueError(
        f"Feature mismatch: X has {X.shape[1]} columns but "
        f"period_features_test_glove.csv provides {X_eval.shape[1]}; "
        "build both from the same preprocessing before predicting."
    )

# This time we train our classifier on the full dataset that is available to us.
clf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X, y)

# Predict using the classifier trained on the full dataset
preds = clf.predict(X_eval)

# Add predictions to the dataframe
period_features_test["EventType"] = preds

# Keep only the columns written to the predictions file
predictions = period_features_test[["ID", "EventType"]]

pred_df = predictions
pred_df.to_csv("rf_better_preprocessing_predictions.csv", index=False)

Test set:  0.764797507788162


### SVM

In [28]:
# Load test dataset
period_features_test = pd.read_csv("period_features_test_glove.csv")

# Train the SVM classifier on the train set
svm_clf = SVC(random_state=42, kernel='rbf', probability=True).fit(X_train, y_train)

# Held-out predictions, computed once and reused for every metric below
y_pred = svm_clf.predict(X_test)
print("Test set accuracy (SVM):", accuracy_score(y_test, y_pred))

# 5-fold cross-validation on the training split (works on clones of svm_clf)
scores = cross_val_score(svm_clf, X_train, y_train, cv=5)
print("Cross-validation scores:", scores)
print("Mean CV score:", scores.mean())

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Prepare the evaluation dataset (identifier columns removed)
X_eval = period_features_test.drop(columns=["MatchID", "PeriodID", "ID"]).values

# X is built from merged_df while X_eval comes straight from the test file, so
# their widths can differ. Check before the full refit rather than failing
# inside predict() afterwards.
if X_eval.shape[1] != X.shape[1]:
    raise ValueError(
        f"Feature mismatch: X has {X.shape[1]} columns but "
        f"period_features_test_glove.csv provides {X_eval.shape[1]}; "
        "build both from the same preprocessing before predicting."
    )

# Train the SVM classifier on the full dataset
svm_clf = SVC(random_state=42, kernel='rbf', probability=True).fit(X, y)

# Majority-class baseline, fitted for sanity purposes (not scored in this cell)
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X, y)

# Predict using the trained SVM classifier
svm_preds = svm_clf.predict(X_eval)

# Add predictions to the dataframe
period_features_test["EventType"] = svm_preds

# Prepare the final prediction dataframe
predictions = period_features_test[["ID", "EventType"]]

# Save predictions to a CSV file
pred_df = predictions
pred_df.to_csv("svm_bert_predictions_glove.csv", index=False)

Test set accuracy (SVM): 0.6931464174454829
Cross-validation scores: [0.66889632 0.62876254 0.66555184 0.65886288 0.62541806]
Mean CV score: 0.6494983277591974
Confusion Matrix:
[[181 119]
 [ 78 264]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.70      0.60      0.65       300
         1.0       0.69      0.77      0.73       342

    accuracy                           0.69       642
   macro avg       0.69      0.69      0.69       642
weighted avg       0.69      0.69      0.69       642



In [37]:
# Load test dataset
period_features_test = pd.read_csv("period_features_test_glove.csv")

# Create base SVM classifier
base_svm = SVC(random_state=42, kernel='rbf', probability=True)

# Create bagged SVM classifier
bagged_svm = BaggingClassifier(
    estimator=base_svm,
    n_estimators=70,  # you can adjust this number
    max_samples=0.8,  # you can adjust this fraction
    random_state=42
)

# Train the bagged SVM classifier
bagged_svm.fit(X_train, y_train)

# Held-out predictions, computed once and reused for every metric below
y_pred = bagged_svm.predict(X_test)
print("Test set accuracy (Bagged SVM):", accuracy_score(y_test, y_pred))

# Cross-validation on the training split (works on clones of bagged_svm)
scores = cross_val_score(bagged_svm, X_train, y_train, cv=5)
print("Cross-validation scores:", scores)
print("Mean CV score:", scores.mean())

# Confusion matrix and classification report
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Prepare the evaluation dataset (identifier columns removed)
X_eval = period_features_test.drop(columns=["MatchID", "PeriodID", "ID"]).values

# X is built from merged_df while X_eval comes straight from the test file, so
# their widths can differ. Check before the (expensive) full refit rather than
# failing inside predict() afterwards.
if X_eval.shape[1] != X.shape[1]:
    raise ValueError(
        f"Feature mismatch: X has {X.shape[1]} columns but "
        f"period_features_test_glove.csv provides {X_eval.shape[1]}; "
        "build both from the same preprocessing before predicting."
    )

# Train the bagged SVM classifier on the full dataset
bagged_svm = BaggingClassifier(
    estimator=base_svm,
    n_estimators=70,
    max_samples=0.8,
    random_state=42
).fit(X, y)

# Majority-class baseline, fitted for sanity purposes (not scored in this cell)
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X, y)

# Predict using the trained bagged SVM classifier
svm_preds = bagged_svm.predict(X_eval)

# Add predictions to the dataframe
period_features_test["EventType"] = svm_preds

# Prepare the final prediction dataframe
predictions = period_features_test[["ID", "EventType"]]

# Save predictions to a CSV file
pred_df = predictions
pred_df.to_csv("bagged_svm_rbf_predictions.csv", index=False)

Test set accuracy (Bagged SVM): 0.6791277258566978
Cross-validation scores: [0.6722408  0.6187291  0.65217391 0.63879599 0.61538462]
Mean CV score: 0.6394648829431439
Confusion Matrix:
[[181 119]
 [ 87 255]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.68      0.60      0.64       300
         1.0       0.68      0.75      0.71       342

    accuracy                           0.68       642
   macro avg       0.68      0.67      0.67       642
weighted avg       0.68      0.68      0.68       642



### XGBoost

In [20]:
# Load test dataset
period_features_test = pd.read_csv("period_features_test_glove.csv")

# Train the XGBoost classifier on the train set
xgb_clf = XGBClassifier(
    random_state=42,
    learning_rate=0.05,  # Reduced
    n_estimators=200,    # Increased
    max_depth=3,         # Reduced to prevent overfitting
    min_child_weight=3,  # Helps with overfitting
    subsample=0.8,       # Use 80% of data per tree
    colsample_bytree=0.8 # Use 80% of features per tree
).fit(X_train, y_train)

# Held-out predictions, computed once and reused for every metric below
y_pred = xgb_clf.predict(X_test)
print("Test set accuracy (XGBoost):", accuracy_score(y_test, y_pred))

# 5-fold cross-validation on the training split (works on clones of xgb_clf)
scores = cross_val_score(xgb_clf, X_train, y_train, cv=5)
print("Cross-validation scores:", scores)
print("Mean CV score:", scores.mean())

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Prepare the evaluation dataset (identifier columns removed)
X_eval = period_features_test.drop(columns=["MatchID", "PeriodID", "ID"]).values

# The model was fitted on X_train, whose width can differ from the test file's.
# Check explicitly so a mismatch is reported with an actionable message rather
# than failing inside predict().
if X_eval.shape[1] != X_train.shape[1]:
    raise ValueError(
        f"Feature mismatch: the model was trained on {X_train.shape[1]} columns but "
        f"period_features_test_glove.csv provides {X_eval.shape[1]}; "
        "build both from the same preprocessing before predicting."
    )

# Predict using the trained XGBoost classifier. The model used here is the one
# fitted on the training split; this cell does not refit on the full dataset.
xgb_preds = xgb_clf.predict(X_eval).astype(float)

# Add predictions to the dataframe
period_features_test["EventType"] = xgb_preds

# Prepare the final prediction dataframe
predictions = period_features_test[["ID", "EventType"]]

# Save predictions to a CSV file
pred_df = predictions
pred_df.to_csv("xgboost_predictions.csv", index=False)

# Optional: Print feature importance (features are identified by column position in X)
feature_importance = pd.DataFrame({
    'feature': range(X.shape[1]),
    'importance': xgb_clf.feature_importances_
})
print("\nTop 10 most important features:")
print(feature_importance.sort_values('importance', ascending=False).head(10))

Test set accuracy (XGBoost): 0.7757009345794392
Cross-validation scores: [0.74916388 0.71571906 0.74916388 0.74247492 0.7458194 ]
Mean CV score: 0.7404682274247492
Confusion Matrix:
[[226  74]
 [ 70 272]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.76      0.75      0.76       300
         1.0       0.79      0.80      0.79       342

    accuracy                           0.78       642
   macro avg       0.77      0.77      0.77       642
weighted avg       0.78      0.78      0.78       642



ValueError: Feature shape mismatch, expected: 768, got 200