In [None]:
import sys
print(sys.executable)
import os
import re
import gensim.downloader as api
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import time
from tqdm import tqdm

### Using BERT
1) Simpler text preprocessing, because BERT has its own (WordPiece) tokenizer and needs no separate lemmatization step

In [None]:
def batch_preprocess_tweets(df, batch_size=1000):
    """Filter a tweet DataFrame, then clean the text of the surviving tweets.

    Reference: https://www.lix.polytechnique.fr/~nikolentzos/files/meladianos_ecir18

    Filters are applied in this order:

        1) Drop retweets (text starting with "RT ")
        2) Drop duplicated tweet texts (the first occurrence is kept)
        3) Drop every tweet whose text contains an "@" character

    Each remaining tweet is then passed through ``preprocess_text``.

    Args:
        df: DataFrame with a "Tweet" column. It is not modified.
        batch_size: Number of tweets cleaned between progress-bar updates.
            Must be at least 1.

    Returns:
        A filtered copy of ``df`` whose "Tweet" column holds the cleaned
        text. Index labels of the surviving rows are preserved.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    print("Starting tweet preprocessing...")
    total_start = time.time()

    # Initial data filtering
    print("\nFiltering tweets...")
    initial_count = len(df)

    # 1. Remove retweets
    without_retweets = df[~df["Tweet"].str.startswith("RT ", na=False)]
    retweets_removed = initial_count - len(without_retweets)

    # 2. Remove duplicates
    without_duplicates = without_retweets.drop_duplicates(subset=["Tweet"])
    duplicates_removed = len(without_retweets) - len(without_duplicates)

    # 3. Remove tweets with @-mentions
    has_mention = without_duplicates["Tweet"].str.contains("@", na=False)
    # Copy once, after filtering, so the column assignment below writes to an
    # independent frame instead of a slice of the caller's data.
    processed_df = without_duplicates[~has_mention].copy()
    mentions_removed = len(without_duplicates) - len(processed_df)

    # Print filtering statistics
    n_tweets = len(processed_df)
    print(f"Removed {retweets_removed} retweets")
    print(f"Removed {duplicates_removed} duplicates")
    print(f"Removed {mentions_removed} tweets with @-mentions")
    print(f"Remaining tweets: {n_tweets}")

    # Process in batches with progress bar
    processed_tweets = []
    with tqdm(total=n_tweets, desc="Processing tweets") as pbar:
        for start_idx in range(0, n_tweets, batch_size):
            batch = processed_df["Tweet"].iloc[start_idx:start_idx + batch_size]
            processed_tweets.extend(preprocess_text(tweet) for tweet in batch)
            pbar.update(len(batch))

    # Add processed tweets to DataFrame
    processed_df["Tweet"] = processed_tweets

    # Print timing statistics
    total_time = time.time() - total_start
    print("\nPreprocessing complete!")
    print(f"Total processing time: {total_time:.2f} seconds")
    # Skip the per-tweet average when nothing survived the filters; dividing
    # by zero here used to abort the whole run.
    if n_tweets:
        print(f"Average time per tweet: {total_time / n_tweets:.4f} seconds")

    return processed_df


def preprocess_text(text):
    """
    Limited preprocessing for BERT: strip URLs and normalise whitespace.

    Args:
        text: String containing the tweet text. A missing value (None or a
            float NaN) is treated as an empty tweet.
    Returns:
        Preprocessed text string
    """

    # Missing cells would otherwise make re.sub raise TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Remove URLs. "www" only counts as a URL when it starts a word and is
    # followed by a dot, so ordinary words such as "awwww" are left intact.
    text = re.sub(r"http\S+|\bwww\.\S+", "", text)

    # Collapse runs of whitespace (including the gaps left by removed URLs)
    # into single spaces and trim both ends.
    return " ".join(text.split())

In [None]:
# Read all training files and concatenate them into one dataframe.
# The listing is sorted so the row order (and therefore which duplicate
# survives preprocessing) is the same on every run and machine. Hidden
# entries such as .ipynb_checkpoints or .DS_Store are skipped because
# pd.read_csv cannot parse them.
train_dir = "../challenge_data/train_tweets"
li = []
for filename in sorted(os.listdir(train_dir)):
    train_path = os.path.join(train_dir, filename)
    if filename.startswith(".") or not os.path.isfile(train_path):
        continue
    li.append(pd.read_csv(train_path))
df = pd.concat(li, ignore_index=True)

print(f"Number of tweets: {len(df)}")

# Apply preprocessing to each tweet
print("Preprocessing tweets...")
tweet_processing_start = time.time()
df = batch_preprocess_tweets(df)
print(f"Preprocessing took {time.time() - tweet_processing_start:.2f} seconds")
# A short preview is enough to sanity-check the cleaning.
print(df.head(10))
df.to_csv("bert_preprocessed_tweets.csv", index=False)

### Load the preprocessed CSV and drop rows containing NaN values, just in case

In [None]:
# Reload the preprocessed tweets and drop incomplete rows (for example, a
# tweet whose text was emptied by preprocessing is read back as NaN).
df = pd.read_csv("bert_preprocessed_tweets.csv")
original_count = len(df)
# Reset the index after dropping rows: the BERT embeddings are produced in
# positional order, so the frame needs a gap-free 0..n-1 index for a later
# column-wise concat to pair each tweet with its own vector.
df = df.dropna().reset_index(drop=True)
rows_dropped = original_count - len(df)
print(f"Rows dropped {rows_dropped}")

### Load the BERT tokenizer and model, then compute tweet embeddings

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

In [None]:
# 1. Load the pretrained BERT tokenizer and model from the same checkpoint
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = AutoModel.from_pretrained(BERT_CHECKPOINT)

In [None]:
# 2. Function to get BERT embeddings for a batch of tweets
def get_bert_embeddings(tweets, tokenizer, model, max_length=128):
    """Encode one batch of tweets and return the first-token hidden states.

    Args:
        tweets: Batch of tweet texts, passed to ``tokenizer`` unchanged.
        tokenizer: Callable that returns a mapping with 'input_ids' and
            'attention_mask' tensors.
        model: Torch module whose output exposes ``last_hidden_state``.
        max_length: Token limit handed to the tokenizer; truncation is on.

    Returns:
        NumPy array with, for each tweet, the last hidden state at token
        position 0 (the [CLS] token).
    """
    # Run on the GPU when one is present, otherwise on the CPU
    target = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(target)
    model.eval()

    # Tokenize the whole batch at once, padded to a common length
    batch = tokenizer(
        tweets,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )

    # Inputs must live on the same device as the model
    ids = batch['input_ids'].to(target)
    mask = batch['attention_mask'].to(target)

    # Inference only: no gradients are tracked
    with torch.no_grad():
        hidden_states = model(ids, attention_mask=mask).last_hidden_state
        # Position 0 holds the [CLS] token, used as the sentence vector
        cls_vectors = hidden_states[:, 0, :].cpu().numpy()

    return cls_vectors

# 3. Process your tweets in batches
def process_tweets_with_bert(df, batch_size=1000):
    """Embed every tweet in ``df['Tweet']`` with BERT, one batch at a time.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        df: DataFrame with a 'Tweet' column.
        batch_size: Number of tweets sent to ``get_bert_embeddings`` per call.

    Returns:
        float16 NumPy array built from the per-tweet vectors, in the
        positional order of ``df``.
    """
    vectors = []

    batch_starts = range(0, len(df), batch_size)
    for start in tqdm(batch_starts):
        stop = start + batch_size
        texts = df['Tweet'].iloc[start:stop].tolist()
        vectors.extend(get_bert_embeddings(texts, tokenizer, model))

    # Stored as float16 (presumably to keep memory and disk usage down)
    return np.array(vectors, dtype=np.float16)

In [None]:
# Embed every training tweet and persist the array so it can be reloaded
# without running BERT again.
embeddings_path = 'embeddings.npz'
embeddings = process_tweets_with_bert(df)
np.savez_compressed(embeddings_path, embeddings=embeddings)

# One row per tweet; bert-base-uncased gives 768 columns,
# so the shape is expected to be (n_tweets, 768).
embeddings_shape = embeddings.shape
print(f"Generated embeddings shape: {embeddings_shape}")

In [None]:
# Reload the cached embeddings. np.load returns an NpzFile that keeps the
# archive open, so the array is read inside a context manager and the file
# handle is released as soon as the data is in memory.
with np.load('embeddings.npz') as loaded:
    embeddings = loaded['embeddings']
tweet_vectors = pd.DataFrame(embeddings)
tweet_vectors.head(10)

In [None]:
# Attach the vectors to the tweet dataframe.
# The embeddings were produced in the positional order of `df`, while
# pd.concat(axis=1) aligns on index labels. `df` can have gaps in its index
# after rows were dropped, so it is re-indexed to 0..n-1 first; otherwise
# tweets would be paired with the wrong vectors and NaN rows would appear.
if len(df) != len(tweet_vectors):
    raise ValueError(
        f"Tweet/embedding count mismatch: {len(df)} tweets vs "
        f"{len(tweet_vectors)} embedding rows; recompute the embeddings"
    )
period_features = pd.concat([df.reset_index(drop=True), tweet_vectors], axis=1)
# Drop the columns that are not useful anymore
period_features = period_features.drop(columns=["Timestamp", "Tweet"])
# Group the tweets into their corresponding periods. This way we generate an average embedding vector for each period
period_features = (
    period_features.groupby(["MatchID", "PeriodID", "ID"]).mean().reset_index()
)
period_features.to_csv("period_features_bert.csv", index=False)

In [None]:
# Feature matrix: every column except the label and the period identifiers,
# i.e. the averaged embedding values of each period
non_feature_cols = ["EventType", "MatchID", "PeriodID", "ID"]
X = period_features.drop(columns=non_feature_cols).values
# Label vector of the training samples
y = period_features["EventType"].values

In [None]:
###### Evaluating on a test set:

# Hold out 30% of the periods as a local test set. This lets us assess the
# classifier without tuning against the validation set and without
# submitting to Kaggle too many times.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [None]:
# Fit a basic logistic-regression classifier on the training split and
# report its accuracy on the held-out split
clf = LogisticRegression(random_state=42, max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
holdout_accuracy = accuracy_score(y_test, y_pred)
print("Test set: ", holdout_accuracy)

In [None]:
# Train the final classifiers on the full training set; the dummy classifier
# always predicts the most frequent label and serves as a baseline.
clf = LogisticRegression(random_state=42, max_iter=1000).fit(X, y)
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X, y)

eval_dir = "../challenge_data/eval_tweets"
key_cols = ["MatchID", "PeriodID", "ID"]

predictions = []
dummy_predictions = []

# Process each file in the evaluation set. The listing is sorted so the
# output row order is reproducible, and hidden entries (.ipynb_checkpoints,
# .DS_Store, ...) are skipped because pd.read_csv cannot parse them.
for fname in sorted(os.listdir(eval_dir)):
    eval_path = os.path.join(eval_dir, fname)
    if fname.startswith(".") or not os.path.isfile(eval_path):
        continue

    # Read and preprocess the file
    val_df = pd.read_csv(eval_path)
    if val_df.empty:
        # Nothing to embed or predict for a file with no tweets
        continue
    val_df["Tweet"] = val_df["Tweet"].apply(preprocess_text)

    # Get BERT embeddings for this file. A dedicated name is used so the
    # training `embeddings` array is not overwritten.
    eval_embeddings = process_tweets_with_bert(val_df, batch_size=1000)
    embedding_df = pd.DataFrame(
        eval_embeddings,
        columns=[f"embed_{i}" for i in range(eval_embeddings.shape[1])],
    )

    # Combine with the original columns; val_df keeps the default 0..n-1
    # index from read_csv, so it lines up with the positional embeddings.
    period_features_test = pd.concat([val_df, embedding_df], axis=1)
    period_features_test = period_features_test.dropna()

    # Drop non-feature columns
    period_features_test = period_features_test.drop(columns=["Timestamp", "Tweet"])

    # Average the tweet embeddings of each period
    period_features_test = (
        period_features_test.groupby(key_cols).mean().reset_index()
    )

    # Remove periods with NaN features from the frame itself, so the feature
    # matrix and the rows receiving predictions always have the same length.
    n_periods = len(period_features_test)
    period_features_test = period_features_test.dropna().reset_index(drop=True)
    n_dropped = n_periods - len(period_features_test)
    if n_dropped:
        print(f"{fname}: dropped {n_dropped} periods with NaN features")
    if period_features_test.empty:
        continue

    # Prepare features for prediction. Named X_eval so the hold-out X_test
    # from the train/test split is not clobbered.
    X_eval = period_features_test.drop(columns=key_cols).values
    print(f"{fname}: feature matrix shape {X_eval.shape}")

    # Make predictions and attach them to their periods
    period_features_test["EventType"] = clf.predict(X_eval)
    period_features_test["DummyEventType"] = dummy_clf.predict(X_eval)

    # Append results
    predictions.append(period_features_test[["ID", "EventType"]])
    dummy_predictions.append(period_features_test[["ID", "DummyEventType"]])

# Save predictions
pred_df = pd.concat(predictions)
pred_df.to_csv("bert_logistic_predictions.csv", index=False)

dummy_pred_df = pd.concat(dummy_predictions)
dummy_pred_df.to_csv("bert_dummy_predictions.csv", index=False)