In [12]:
### This notebook uses preprocessed training data to train different models in order
### to compare their performance.
import pandas as pd
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import time
from tqdm import tqdm

In [None]:
# Training features from the GloVe-based preprocessing step (certain types of tweets were dropped there).
period_features = pd.read_csv(filepath_or_buffer="period_features_glove.csv")


In [5]:
# Labels: the event type recorded for each training sample.
y = period_features.loc[:, "EventType"].values
# Features: every column except the label and the identifier columns,
# i.e. the embedding values kept for each period.
X = period_features.drop(["EventType", "MatchID", "PeriodID", "ID"], axis=1).values


In [8]:
###### Evaluating on a held-out test set:

# Hold out 30% of the labelled data as a local test set. This lets us compare classifiers
# without tuning against the validation set and without spending too many Kaggle submissions.
# NOTE(review): this is a plain random split over rows and does not take MatchID into
# account — confirm that is acceptable for the reported accuracies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

In [None]:
# Load the evaluation features (re-read in this cell because the cell later adds a column to the frame).
period_features_test = pd.read_csv("period_features_test_glove.csv")

# Sanity baseline: a classifier that always predicts the most frequent training label.
# Scoring it on the same held-out split gives the accuracy the real classifier has to beat.
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
print("Test set (dummy baseline): ", accuracy_score(y_test, dummy_clf.predict(X_test)))

# Train a basic classifier on the training split and measure its accuracy on the held-out split.
clf = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Test set: ", accuracy_score(y_test, y_pred))

# For the submission, refit the classifier on all the labelled data available to us.
clf = LogisticRegression(random_state=42, max_iter=1000).fit(X, y)
# Refit the baseline the same way so `dummy_clf` ends the cell fitted on the full dataset.
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X, y)

# Evaluation features: drop the identifier columns, keeping the same kind of input the model was fitted on.
X_eval = period_features_test.drop(columns=["MatchID", "PeriodID", "ID"]).values

# Predict using the classifier trained on the full dataset
preds = clf.predict(X_eval)

# Add predictions to the dataframe
period_features_test["EventType"] = preds

# Keep only the columns written to the prediction file
predictions = period_features_test[["ID", "EventType"]]

pred_df = predictions
pred_df.to_csv("logistic_better_preprocessing_predictions.csv", index=False)

In [None]:
# Evaluation features, re-read from disk for this cell.
period_features_test = pd.read_csv(filepath_or_buffer="period_features_test_glove.csv")

# Step 1: fit on the training split only and report accuracy on the held-out split.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Test set: ", accuracy_score(y_test, y_pred))

# Step 2: build a fresh forest and fit it on every labelled sample we have.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X, y)

# Step 3: strip the identifier columns from the evaluation frame and predict.
X_eval = period_features_test.drop(["MatchID", "PeriodID", "ID"], axis=1).values
preds = clf.predict(X_eval)

# Step 4: attach the predictions and write out the ID / EventType pairs.
period_features_test["EventType"] = preds
pred_df = predictions = period_features_test[["ID", "EventType"]]
pred_df.to_csv("rf_better_preprocessing_predictions.csv", index=False)

Test set:  0.764797507788162


In [13]:
# Load test dataset (re-read in this cell because the cell later adds a column to the frame)
period_features_test = pd.read_csv("period_features_test_glove.csv")

# Sanity baseline: a classifier that always predicts the most frequent training label.
# Scoring it on the same held-out split gives the accuracy the SVM has to beat.
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
print("Test set accuracy (dummy baseline):", accuracy_score(y_test, dummy_clf.predict(X_test)))

# Train the SVM classifier on the train set.
# `probability=True` is intentionally not passed: this cell only calls `predict`, and enabling
# probability estimates makes every `fit` run an extra internal cross-validation.
# Re-enable it if `predict_proba` is needed on `svm_clf`.
svm_clf = SVC(random_state=42, kernel='linear').fit(X_train, y_train)

# Test set prediction and evaluation
y_pred = svm_clf.predict(X_test)
print("Test set accuracy (SVM):", accuracy_score(y_test, y_pred))

# Train the SVM classifier on the full dataset
svm_clf = SVC(random_state=42, kernel='linear').fit(X, y)

# Refit the baseline the same way so `dummy_clf` ends the cell fitted on the full dataset.
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X, y)

# Prepare the evaluation dataset by dropping the identifier columns
X_eval = period_features_test.drop(columns=["MatchID", "PeriodID", "ID"]).values

# Predict using the SVM classifier trained on the full dataset
svm_preds = svm_clf.predict(X_eval)

# Add predictions to the dataframe
period_features_test["EventType"] = svm_preds

# Keep only the columns written to the prediction file
predictions = period_features_test[["ID", "EventType"]]

# Save predictions to a CSV file
pred_df = predictions
pred_df.to_csv("svm_better_preprocessing_predictions.csv", index=False)

Test set accuracy (SVM): 0.7289719626168224


In [1]:
# Smoke-test cell: prints a fixed string and does not use any notebook state.
print("test")

test
