In [2]:
import sys
print(sys.executable)
import os
import re
import gensim.downloader as api
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import time
from tqdm import tqdm

/Data/adrien.goldszal/data_challenge/bin/python


### Using BERT
1) Simpler text preprocessing, because BERT has its own (subword) tokenizer, so no manual tokenization or lemmatization is needed

In [17]:
def batch_preprocess_tweets(df, batch_size=1000):
    """Filter a tweet DataFrame and clean the text of the remaining tweets.

    Filtering follows
    https://www.lix.polytechnique.fr/~nikolentzos/files/meladianos_ecir18
    and is applied in this order:

        1) Drop retweets (tweets whose text starts with "RT ")
        2) Drop rows whose tweet text duplicates an earlier row
        3) Drop every tweet whose text contains an "@" character

    The surviving tweets are then passed through ``preprocess_text`` in
    chunks of ``batch_size`` so that a progress bar can be updated.

    Args:
        df: DataFrame with a "Tweet" column. It is copied, not modified.
        batch_size: Number of tweets cleaned between two progress-bar
            updates. Must be at least 1.

    Returns:
        The filtered copy of ``df`` with its "Tweet" column replaced by the
        cleaned text. Index labels of the kept rows are left untouched.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    print("Starting tweet preprocessing...")
    total_start = time.time()

    # Create a copy to avoid modifying original
    processed_df = df.copy()

    # Initial data filtering
    print("\nFiltering tweets...")
    initial_count = len(processed_df)

    # 1. Remove retweets (na=False keeps rows with missing text at this stage)
    processed_df = processed_df[~processed_df["Tweet"].str.startswith("RT ", na=False)]
    retweets_removed = initial_count - len(processed_df)

    # 2. Remove duplicates
    processed_df = processed_df.drop_duplicates(subset=["Tweet"])
    duplicates_removed = initial_count - retweets_removed - len(processed_df)

    # 3. Remove tweets with @-mentions
    processed_df = processed_df[~processed_df["Tweet"].str.contains("@", na=False)]
    mentions_removed = (
        initial_count - retweets_removed - duplicates_removed - len(processed_df)
    )

    # Print filtering statistics
    n_tweets = len(processed_df)
    print(f"Removed {retweets_removed} retweets")
    print(f"Removed {duplicates_removed} duplicates")
    print(f"Removed {mentions_removed} tweets with @-mentions")
    print(f"Remaining tweets: {n_tweets}")

    # Process in batches with progress bar; stepping the range by batch_size
    # replaces the manual start/end index arithmetic.
    tweets = processed_df["Tweet"]
    processed_tweets = []
    with tqdm(total=n_tweets, desc="Processing tweets") as pbar:
        for start_idx in range(0, n_tweets, batch_size):
            batch = tweets.iloc[start_idx:start_idx + batch_size]
            processed_tweets.extend(preprocess_text(tweet) for tweet in batch)
            pbar.update(len(batch))

    # Add processed tweets to DataFrame
    processed_df["Tweet"] = processed_tweets

    # Print timing statistics
    total_time = time.time() - total_start
    # Guard the average: every tweet may have been filtered out above.
    average_time = total_time / n_tweets if n_tweets else 0.0
    print("\nPreprocessing complete!")
    print(f"Total processing time: {total_time:.2f} seconds")
    print(f"Average time per tweet: {average_time:.4f} seconds")

    return processed_df


def preprocess_text(text):
    """
    Limited preprocessing for BERT: strip URLs and normalise whitespace.

    Args:
        text: String containing the tweet text. ``None`` and float NaN
            (what pandas uses for a missing CSV cell) are treated as an
            empty tweet instead of raising a TypeError.
    Returns:
        Preprocessed text string with URL-like tokens removed and every run
        of whitespace collapsed to a single space (no leading/trailing space).
    """
    # `text != text` is only true for NaN; missing text becomes an empty tweet.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)

    # Collapse whitespace (also removes the gaps left by deleted URLs)
    return " ".join(text.split())

In [17]:
# Read all training files and concatenate them into one dataframe.
# os.listdir returns names in arbitrary order, so they are sorted to make the
# row order (and with it drop_duplicates and the saved CSV) reproducible.
train_dir = "../challenge_data/train_tweets"
li = []
for filename in sorted(os.listdir(train_dir)):
    li.append(pd.read_csv(os.path.join(train_dir, filename)))
df = pd.concat(li, ignore_index=True)

print(f"Number of tweets: {len(df)}")

# Apply preprocessing to each tweet
print("Preprocessing tweets...")
tweet_processing_start = time.time()
df = batch_preprocess_tweets(df)
print(f"Preprocessing took {time.time() - tweet_processing_start:.2f} seconds")
print(df.head(300))
# index=False: the filtered index labels are not written, so reloading the
# CSV yields a fresh 0..n-1 index.
df.to_csv("bert_preprocessed_tweets.csv", index=False)

Number of tweets: 5056050
Preprocessing tweets...
Starting tweet preprocessing...

Filtering tweets...
Removed 2619447 retweets
Removed 120425 duplicates
Removed 464706 tweets with @-mentions
Remaining tweets: 1851472


Processing tweets: 100%|██████████| 1851472/1851472 [00:03<00:00, 611542.79it/s]



Preprocessing complete!
Total processing time: 5.77 seconds
Average time per tweet: 0.0000 seconds
Preprocessing took 5.83 seconds
       ID  MatchID  PeriodID  EventType      Timestamp  \
6    11_0       11         0          0  1404575400000   
7    11_0       11         0          0  1404575400000   
9    11_0       11         0          0  1404575400000   
10   11_0       11         0          0  1404575400000   
12   11_0       11         0          0  1404575400000   
..    ...      ...       ...        ...            ...   
789  11_0       11         0          0  1404575419000   
792  11_0       11         0          0  1404575419000   
794  11_0       11         0          0  1404575419000   
795  11_0       11         0          0  1404575419000   
797  11_0       11         0          0  1404575419000   

                                                 Tweet  
6    I just hope Argentina lose. Would be fun to se...  
7    Watch Argentina vs Belgium 5th July 2014 LIVE ...  


### Load the preprocessed CSV and drop rows containing NaN values, just in case

In [10]:
df = pd.read_csv("bert_preprocessed_tweets.csv")
original_count = len(df)
# reset_index keeps the row labels at 0..n-1 after rows are dropped. The
# embeddings are later attached with pd.concat(axis=1), which aligns on index
# labels, so gaps here would pair tweets with the wrong embedding rows.
df = df.dropna().reset_index(drop=True)
rows_dropped = original_count - len(df)
print(f"Rows dropped {rows_dropped}")

Rows dropped 0


### Load BERT tokenizer and model

In [3]:
from transformers import AutoTokenizer, AutoModel
import torch

In [20]:
# 1. Load BERT tokenizer and model (both from the same pretrained checkpoint)
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = AutoModel.from_pretrained(BERT_CHECKPOINT)

In [None]:
# 2. Function to get BERT embeddings for a batch of tweets
def get_bert_embeddings(tweets, tokenizer, model, max_length=128):
    """Return one embedding vector per tweet for a batch of tweets.

    Args:
        tweets: Batch of tweet texts handed to ``tokenizer`` in one call.
        tokenizer: Callable invoked with padding and truncation enabled and
            ``return_tensors='pt'``; its result must expose 'input_ids' and
            'attention_mask' tensors.
        model: Model that is moved to the GPU when CUDA is available (CPU
            otherwise), switched to eval mode, and called with the input ids
            and attention mask. Its output must have ``last_hidden_state``.
        max_length: Maximum length forwarded to the tokenizer.

    Returns:
        NumPy array holding, for every tweet, the hidden state at position 0
        of ``last_hidden_state`` (the [CLS] token), used as the sentence
        representation.
    """
    # Run on the GPU when one is available, otherwise on the CPU
    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    model = model.to(device)
    model.eval()

    # Tokenize the whole batch in a single call
    tokenized = tokenizer(
        tweets,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )

    # Inputs have to live on the same device as the model
    ids = tokenized['input_ids'].to(device)
    mask = tokenized['attention_mask'].to(device)

    # Forward pass without gradient tracking
    with torch.no_grad():
        hidden_states = model(ids, attention_mask=mask).last_hidden_state
        # First token of each sequence is the [CLS] token
        cls_vectors = hidden_states[:, 0, :]
        result = cls_vectors.cpu().numpy()

    return result

# 3. Process your tweets in batches
def process_tweets_with_bert(df, batch_size=1000):
    """Embed every tweet of ``df`` with BERT, one batch at a time.

    Uses the module-level ``tokenizer`` and ``model`` through
    ``get_bert_embeddings``.

    Args:
        df: DataFrame with a "Tweet" column; rows are processed in their
            positional order.
        batch_size: Number of tweets sent to ``get_bert_embeddings`` per call.

    Returns:
        float16 NumPy array with one row per tweet, in the same order as the
        rows of ``df``. When ``df`` has no rows an empty float16 array is
        returned.
    """
    tweets = df['Tweet']
    batch_arrays = []

    for start in tqdm(range(0, len(df), batch_size)):
        batch = tweets.iloc[start:start + batch_size].tolist()
        batch_embeddings = get_bert_embeddings(batch, tokenizer, model)
        # Cast each batch to float16 right away and keep it as one 2-D block
        # instead of extending a Python list with one row object per tweet;
        # this avoids holding all float32 batches until the final conversion.
        batch_arrays.append(np.asarray(batch_embeddings, dtype=np.float16))

    if not batch_arrays:
        # Nothing to embed: same empty result as converting an empty list
        return np.array([], dtype=np.float16)

    return np.concatenate(batch_arrays, axis=0)

In [None]:
# Embed every tweet of df, then persist the array in compressed .npz form
# under the key "embeddings" so later cells can reload it without recomputing.
embeddings = process_tweets_with_bert(df)
embeddings_path = 'embeddings.npz'
np.savez_compressed(embeddings_path, embeddings=embeddings)

# embeddings now holds one BERT vector per tweet;
# for bert-base-uncased the shape is (n_tweets, 768)
print(f"Generated embeddings shape: {embeddings.shape}")

100%|██████████| 1852/1852 [28:21<00:00,  1.09it/s]


Generated embeddings shape: (1851472, 768)


In [9]:
# np.load on an .npz archive returns an object that keeps the file open;
# the with-block closes it once the array has been read into memory.
with np.load('embeddings.npz') as loaded:
    embeddings = loaded['embeddings']
# One row per tweet, one column per embedding dimension
tweet_vectors = pd.DataFrame(embeddings)
tweet_vectors.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.070923,0.085999,0.172852,-0.235107,-0.509766,-0.553223,0.402344,0.69043,-0.06543,-0.596191,...,-0.298096,-0.130981,0.43457,-0.333008,0.365234,0.096069,-0.083862,-0.495605,0.678711,0.731934
1,-0.031891,-0.261475,0.156982,-0.210327,-0.327881,-0.526855,0.488037,0.681641,-0.308838,-0.07312,...,-0.040131,-0.271729,0.027756,-0.272217,0.067444,0.081604,-0.38916,-0.597656,0.498535,0.389893
2,-0.00346,0.262939,-0.145874,-0.166748,-0.283691,-0.163452,0.488281,0.790527,0.143799,-0.282227,...,-0.552734,-0.106445,0.035675,-0.293945,0.568848,-0.124084,0.099731,0.015854,0.605469,0.257812
3,-0.358154,-0.099182,0.340576,0.011528,-0.898926,-0.537109,0.199585,0.351318,-0.59375,-0.191406,...,0.57373,0.287842,0.367188,-0.207031,0.493896,0.020203,-0.477051,-0.358398,0.668945,0.874023
4,-0.351562,0.234253,-0.26709,-0.241455,-0.561035,-0.253418,0.099854,0.620605,-0.267578,-0.574707,...,-0.111816,0.059631,0.315918,0.168457,-0.021912,0.214111,-0.436279,-0.312012,0.735352,0.274902
5,-0.334717,-0.09668,0.021744,-0.214355,-0.740723,-0.620605,0.373779,0.88623,-0.114014,-0.408203,...,0.075195,0.085754,0.100769,-0.240967,0.623047,-0.327637,-0.142456,-0.615723,0.567383,0.297852
6,-0.235229,-0.693848,0.489746,-0.093689,-0.721191,-0.917969,0.422607,0.592773,-0.021332,-0.706055,...,-0.021744,0.151855,0.147339,-0.305664,0.219971,-0.167725,-0.345459,-0.473389,0.631836,0.691406
7,-0.901367,-0.036652,0.188721,0.037048,-0.65918,-0.245972,0.620605,0.819824,-0.956543,-0.093933,...,0.595703,0.462402,0.388916,-0.541016,-0.010513,-0.411865,-0.225708,-0.522949,0.57959,0.291992
8,-0.599609,-0.134277,0.186646,-0.035126,-0.394043,-0.307373,0.404297,0.479492,-0.507812,-0.466553,...,0.203857,0.006367,0.308105,-0.172241,0.390137,-0.001609,-0.352295,-0.129028,0.179321,0.478271
9,-0.222168,0.256104,0.035034,-0.051971,-0.638184,-0.462646,0.960449,0.396484,-0.138184,-0.177856,...,0.025345,-0.154419,0.229858,0.05957,0.239502,-0.028458,-0.12561,-0.523926,0.149902,0.291992


In [11]:
# Attach the vectors into the original dataframe.
# pd.concat(axis=1) aligns rows on index labels, while the embeddings were
# produced in positional row order. Check the row counts and reset the index
# of df so that row i of df is always paired with embedding i.
if len(df) != len(tweet_vectors):
    raise ValueError(
        f"df has {len(df)} rows but tweet_vectors has {len(tweet_vectors)}; "
        "the embeddings do not belong to this dataframe"
    )
period_features = pd.concat([df.reset_index(drop=True), tweet_vectors], axis=1)
# Drop the columns that are not useful anymore
period_features = period_features.drop(columns=["Timestamp", "Tweet"])
# Group the tweets into their corresponding periods. This way we generate an average embedding vector for each period
period_features = (
    period_features.groupby(["MatchID", "PeriodID", "ID"]).mean().reset_index()
)

In [12]:
# Labels of the training samples: the EventType value of each period
y = period_features["EventType"].values
# Features: every remaining column once the label and the identifier columns
# are removed, i.e. the averaged embedding values of each period
non_feature_columns = ["EventType", "MatchID", "PeriodID", "ID"]
X = period_features.drop(columns=non_feature_columns).values

In [13]:
###### Evaluating on a test set:

# Hold out 30% of the periods as a local test set. This lets us evaluate the
# classifier without tuning against the validation set and without submitting
# too many times to Kaggle. The fixed random_state makes the split repeatable.
split_options = {"test_size": 0.3, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [14]:
# Baseline: logistic regression fitted on the training split ...
clf = LogisticRegression(random_state=42, max_iter=1000)
clf.fit(X_train, y_train)
# ... scored by plain accuracy on the held-out split
y_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test set: ", test_accuracy)

Test set:  0.7725856697819314


In [16]:
# Train classifiers on the full training set (no hold-out)
clf = LogisticRegression(random_state=42, max_iter=1000).fit(X, y)
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X, y)

predictions = []
dummy_predictions = []

eval_dir = "../challenge_data/eval_tweets"

# Process each file in the evaluation set. os.listdir returns names in
# arbitrary order, so sort them to get the same output file on every run.
for fname in sorted(os.listdir(eval_dir)):
    # Read and preprocess test file
    val_df = pd.read_csv(os.path.join(eval_dir, fname))
    val_df['Tweet'] = val_df['Tweet'].apply(preprocess_text)

    # Get BERT embeddings for this file. A dedicated name is used so the
    # training `embeddings` array from the earlier cell is not overwritten.
    val_embeddings = process_tweets_with_bert(val_df, batch_size=1000)

    # Create DataFrame from embeddings
    embedding_df = pd.DataFrame(
        val_embeddings,
        columns=[f'embed_{i}' for i in range(val_embeddings.shape[1])]
    )

    # Combine with original features
    period_features_test = pd.concat([val_df, embedding_df], axis=1)
    period_features_test = period_features_test.dropna()

    # Drop non-feature columns
    period_features_test = period_features_test.drop(columns=["Timestamp", "Tweet"])

    # Group by match and period
    period_features_test = (
        period_features_test.groupby(["MatchID", "PeriodID", "ID"]).mean().reset_index()
    )

    # Prepare features for prediction. Named X_eval so that X_test from the
    # train/test split cell keeps its meaning.
    feature_frame = period_features_test.drop(columns=["MatchID", "PeriodID", "ID"])
    # Dropping NaN rows here would leave fewer predictions than periods and
    # break the column assignment below, so fail with a clear message instead.
    if feature_frame.isna().any().any():
        raise ValueError(f"NaN values in the period features of {fname}")
    X_eval = feature_frame.values
    print(X_eval.shape)

    # Make predictions and add them to the DataFrame
    period_features_test["EventType"] = clf.predict(X_eval)
    period_features_test["DummyEventType"] = dummy_clf.predict(X_eval)

    # Append results
    predictions.append(period_features_test[["ID", "EventType"]])
    dummy_predictions.append(period_features_test[["ID", "DummyEventType"]])

# Save predictions
pred_df = pd.concat(predictions)
pred_df.to_csv("bert_logistic_predictions.csv", index=False)

pred_df = pd.concat(dummy_predictions)
pred_df.to_csv("bert_dummy_predictions.csv", index=False)

NameError: name 'preprocess_text' is not defined