In [1]:
import os
import re
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time
from tqdm import tqdm
import bisect
from cvxpy import Variable, Minimize, norm, Problem
from scipy.sparse import find
import pickle

In [2]:
# Fetch the NLTK resources used by this notebook (reported as already up-to-date
# when they are cached locally).
# "stopwords" backs the stopwords.words("english") call in preprocess_text.
# NOTE(review): "wordnet" and "punkt_tab" are not referenced by the cells visible
# here — confirm they are still needed.
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt_tab")

[nltk_data] Downloading package stopwords to
[nltk_data]     /users/eleves-a/2022/adrien.goldszal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /users/eleves-a/2022/adrien.goldszal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /users/eleves-a/2022/adrien.goldszal/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
def batch_preprocess_tweets(df, batch_size=1000):
    """Filter a tweet DataFrame and normalise the remaining tweet texts.

    Follows the preprocessing described in
    https://www.lix.polytechnique.fr/~nikolentzos/files/meladianos_ecir18

    Filtering steps, applied in this order on a copy of ``df``:
        0) Rows whose "Tweet" value is missing are dropped (they cannot be
           normalised and would crash the text preprocessing)
        1) Retweets (text starting with "RT ") are removed
        2) Duplicate tweet texts are removed
        3) Tweets containing an "@" are removed

    Every surviving tweet is then passed through ``preprocess_text``.

    Args:
        df: DataFrame with a "Tweet" column; it is not modified.
        batch_size: Number of tweets handled between two progress-bar
            updates. Must be at least 1.

    Returns:
        Tuple ``(processed_df, vocabulary)`` where ``processed_df`` is the
        filtered copy whose "Tweet" column holds the normalised texts, and
        ``vocabulary`` is the sorted list of all distinct tokens produced.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    print("Starting tweet preprocessing...")
    total_start = time.time()

    # Create a copy to avoid modifying original
    processed_df = df.copy()

    # 0. Drop rows without text: the na=False filters below would keep them
    #    and the text preprocessing cannot handle a missing value.
    missing_removed = int(processed_df["Tweet"].isna().sum())
    if missing_removed:
        processed_df = processed_df[processed_df["Tweet"].notna()]
        print(f"Dropped {missing_removed} rows with a missing tweet text")

    # Initial data filtering
    print("\nFiltering tweets...")
    initial_count = len(processed_df)

    # 1. Remove retweets
    processed_df = processed_df[~processed_df["Tweet"].str.startswith("RT ", na=False)]
    after_retweets = len(processed_df)
    retweets_removed = initial_count - after_retweets

    # 2. Remove duplicates
    processed_df = processed_df.drop_duplicates(subset=["Tweet"])
    after_duplicates = len(processed_df)
    duplicates_removed = after_retweets - after_duplicates

    # 3. Remove tweets with @-mentions (explicit copy: the column is
    #    reassigned below and must not alias the intermediate frames)
    processed_df = processed_df[
        ~processed_df["Tweet"].str.contains("@", na=False)
    ].copy()
    mentions_removed = after_duplicates - len(processed_df)

    # Print filtering statistics
    print(f"Removed {retweets_removed} retweets")
    print(f"Removed {duplicates_removed} duplicates")
    print(f"Removed {mentions_removed} tweets with @-mentions")
    print(f"Remaining tweets: {len(processed_df)}")

    vocabulary = set()
    processed_tweets = []
    n_tweets = len(processed_df)
    tweet_texts = processed_df["Tweet"]

    # Process in batches so the progress bar is refreshed once per batch
    with tqdm(total=n_tweets, desc="Processing tweets") as pbar:
        for start_idx in range(0, n_tweets, batch_size):
            batch = tweet_texts.iloc[start_idx:start_idx + batch_size]

            for tweet in batch:
                clean_text, tokens = preprocess_text(tweet)
                processed_tweets.append(clean_text)
                vocabulary.update(tokens)

            pbar.update(len(batch))

    vocabulary = sorted(vocabulary)

    # Add processed tweets to DataFrame
    processed_df["Tweet"] = processed_tweets

    # Print timing statistics
    total_time = time.time() - total_start
    print("\nPreprocessing complete!")
    print(f"Total processing time: {total_time:.2f} seconds")
    if n_tweets:
        # Skipped when nothing survived the filters to avoid dividing by zero
        print(f"Average time per tweet: {total_time/n_tweets:.4f} seconds")

    return processed_df, vocabulary


# Lazily-filled cache so the stop-word list and the stemmer are created once
# instead of once per tweet.
_TEXT_TOOLS_CACHE = {}


def _get_text_tools():
    """Return the shared (stop-word set, stemmer) pair, building it on first use."""
    if not _TEXT_TOOLS_CACHE:
        _TEXT_TOOLS_CACHE["stop_words"] = frozenset(stopwords.words("english"))
        _TEXT_TOOLS_CACHE["stemmer"] = PorterStemmer()
    return _TEXT_TOOLS_CACHE["stop_words"], _TEXT_TOOLS_CACHE["stemmer"]


def preprocess_text(text):
    """
    Normalise one tweet.

    Steps, in order:
    1. Lowercasing
    2. URL removal
    3. Punctuation and special character removal
    4. Digit removal
    5. Whitespace tokenization
    6. English stopword removal
    7. Porter stemming

    Args:
        text: String containing the tweet text. Any non-string value
            (for example a missing value) is treated as an empty tweet.
    Returns:
        Tuple ``(clean_text, tokens)``: the stemmed tokens joined by single
        spaces, and the same tokens as a list.
    """
    if not isinstance(text, str):
        return "", []

    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)

    # Remove punctuation and special characters
    text = re.sub(r"[^\w\s]", "", text)

    # Remove numbers
    text = re.sub(r"\d+", "", text)

    stop_words, stemmer = _get_text_tools()

    # Simple whitespace splitting (as done in the paper), then stopword
    # removal and Porter stemming (the paper stems instead of lemmatizing)
    tokens = [
        stemmer.stem(token) for token in text.split() if token not in stop_words
    ]

    return " ".join(tokens), tokens

In [4]:
def preprocess_file(filename,save_dir="preprocessed_data") :
    """Preprocess one match CSV and cache the result as a pickle.

    Args:
        filename: Path of the CSV file to read.
        save_dir: Directory receiving the pickle; created when missing.

    Returns:
        Tuple ``(processed_df, vocabulary)`` as produced by
        ``batch_preprocess_tweets``.
    """
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # Cache file name: the CSV's base name with ".csv" removed
    stem = os.path.basename(filename).replace('.csv', '')
    save_path = os.path.join(save_dir, f"{stem}_preprocessed.pkl")

    # Read the match file and run the tweet preprocessing on it
    raw_df = pd.read_csv(filename)
    processed_df, vocabulary = batch_preprocess_tweets(raw_df)

    payload = {
        'processed_df': processed_df,
        'vocabulary': vocabulary
    }
    with open(save_path, 'wb') as handle:
        pickle.dump(payload, handle)

    print(f"Saved preprocessed data to {save_path}")

    return processed_df, vocabulary

In [None]:
def generate_adjacency_matrix_dense(tweets, vocabulary):
    """Build the dense word co-occurrence matrix of the given tweets.

    Each tweet contributes, for every pair of distinct vocabulary words it
    contains, a weight of ``1 / (number of distinct words in the tweet)`` to
    both symmetric cells of the matrix.

    Args:
        tweets: DataFrame whose "Tweet" column holds space-separated tokens.
        vocabulary: Sorted list of words; a word's position is its row and
            column index. It must be sorted because words are located by
            binary search.

    Returns:
        Tuple ``(adjacency_matrix, tweets_edges)``: a
        ``len(vocabulary) x len(vocabulary)`` float array, and one list per
        tweet holding its edges as ``[word_a, word_b]`` pairs. Words that are
        not in ``vocabulary`` are ignored.
    """
    print(f'Generating adjacency matrix')
    vocab_size = len(vocabulary)
    adjacency_matrix = np.zeros((vocab_size, vocab_size))
    tweets_edges = []

    for _, tweet in tweets['Tweet'].items():
        # Use a set so that a repeated word counts once per tweet
        tweet_words = set(tweet.split())

        # bisect_left only yields an insertion point, so confirm the word is
        # really stored there; otherwise an unknown word would be mapped to
        # a neighbouring word or to an out-of-range index.
        known_indexes = []
        for word in tweet_words:
            position = bisect.bisect_left(vocabulary, word)
            if position < vocab_size and vocabulary[position] == word:
                known_indexes.append(position)
        # Sorting makes the edge order independent of set iteration order
        known_indexes.sort()

        tweet_edges = []
        if len(known_indexes) > 1:
            # Same weight for every pair of this tweet
            weight = 1.0 / len(tweet_words)
            for offset, idx1 in enumerate(known_indexes):
                # Start after idx1 so each unordered pair is visited once
                for idx2 in known_indexes[offset + 1:]:
                    adjacency_matrix[idx1, idx2] += weight
                    adjacency_matrix[idx2, idx1] += weight
                    # idx1 < idx2 and the vocabulary is sorted, so the pair
                    # is already in alphabetical order
                    tweet_edges.append([vocabulary[idx1], vocabulary[idx2]])

        tweets_edges.append(tweet_edges)

    return adjacency_matrix, tweets_edges

def get_edges_weight(adjacency_matrix, vocabulary, edges_list, nodes_list):
    """Look up, in the given adjacency matrix, the weight of every listed edge.

    Args:
        adjacency_matrix: Matrix indexed by vocabulary positions.
        vocabulary: Sorted list of words matching the matrix axes.
        edges_list: Edges as pairs whose first two items are the two words.
        nodes_list: Words that appear in ``edges_list``.

    Returns:
        List with one weight per edge, in the order of ``edges_list``; the
        weight is 0 when either word of the edge is missing from
        ``vocabulary``.
    """
    print(f'Getting edge weights ')

    # Map every node that exists in the vocabulary to its position
    node_positions = {}
    for candidate in nodes_list:
        position = bisect.bisect(vocabulary, candidate) - 1
        if position >= 0 and vocabulary[position] == candidate:
            node_positions[candidate] = position

    weights = []
    for edge in edges_list:
        source, target = edge[0], edge[1]
        if source not in node_positions or target not in node_positions:
            weights.append(0)
            continue
        # Read the cell with the smaller index first
        low, high = sorted((node_positions[source], node_positions[target]))
        weights.append(adjacency_matrix[low, high])
    return weights


def get_nonzero_edges(matrix):
    """Return the non-zero entries of the adjacency matrix.

    Returns:
        List of ``[row, column, value]`` triples, one per non-zero cell, with
        the value converted to a Python float.
    """
    print(f'Getting non zero edges')
    rows, columns, values = find(matrix)
    return [
        [row, column, float(value)]
        for row, column, value in zip(rows, columns, values)
    ]

def generate_vector(adjacency_matrix, vocabulary):
    """Turn the non-zero cells of an adjacency matrix into an edge-weight vector.

    Returns:
        Tuple ``(vector, vector_nodes, vector_edges, weighted_edges)``:
        a column array with one weight per non-zero cell, the set of words
        touched by those cells, the ``[word, word]`` pair of each cell in the
        same order as ``vector``, and a dict mapping the alphabetically
        sorted word pair to its weight.
    """
    print(f'Generating vector')

    edge_triples = get_nonzero_edges(adjacency_matrix)
    vector = np.zeros((len(edge_triples), 1))
    vector_edges, vector_nodes, weighted_edges = [], set(), {}

    for position, (row, column, value) in enumerate(edge_triples):
        endpoints = [vocabulary[row], vocabulary[column]]
        vector[position] = value
        vector_edges.append(endpoints)
        vector_nodes.update(endpoints)
        weighted_edges[tuple(sorted(endpoints))] = value

    return vector, vector_nodes, vector_edges, weighted_edges


In [6]:
def detect_event(current_period_data, previous_periods_data, threshold=0.5):
    """
    Detect if current period contains an event using Least Squares Optimization.

    Args:
        current_period_data: Dict containing current period's data
            (reads 'tweets_edges', 'vector_edges', 'vector_nodes', 'vector')
        previous_periods_data: List of dicts containing previous periods' data
            (reads 'adjacency_matrix' and 'vocabulary' of each)
        threshold: Event detection threshold

    Returns:
        Tuple of (is_event, period_score). The score is -1 when the current
        period has no tweets or when there is no previous period to compare
        against; in both cases it is still compared to ``threshold``.
    """
    # Always return a 2-tuple: callers unpack ``is_event, score``.
    if len(current_period_data['tweets_edges']) == 0:
        return False, -1

    period_score = -1
    if previous_periods_data:
        current_edges = current_period_data['vector_edges']

        if len(current_edges) == 0:
            # No co-occurrence edge in this period: there is nothing to fit,
            # so the residual is empty and its norm is 0. Skip the solver.
            period_score = 0.0
        else:
            # One column per previous period: the weight each current edge
            # had in that period
            weights = np.zeros((len(current_edges), len(previous_periods_data)))

            for i, prev_period in enumerate(previous_periods_data):
                weights[:, i] = np.asarray(
                    get_edges_weight(
                        prev_period['adjacency_matrix'],
                        prev_period['vocabulary'],
                        current_edges,
                        current_period_data['vector_nodes']
                    )
                )

            # Optimize to get period score
            print(f'Optimizing least squares')
            period_score = optimize_least_squares(
                weights, current_period_data['vector']
            )

    # Plain bool so the result does not depend on the score's numeric type
    is_event = bool(period_score >= threshold)

    return is_event, period_score

def optimize_least_squares(A, b):
    """
    Solve the Least Squares optimization problem.

    Finds non-negative coefficients ``x`` that sum to 1 and minimise
    ``||A x - b||``, then scores the period with the norm of the negative
    part of the residual ``A x - b`` (positive components are zeroed).

    Args:
        A: Weight matrix comparing current period to previous periods,
            shape (n_edges, n_previous_periods)
        b: Current period vector with n_edges values; a column of shape
            (n_edges, 1) is accepted and flattened

    Returns:
        Norm of the negative part of the residual at the optimum

    Raises:
        ValueError: If the solver did not produce a solution.
    """
    A = np.asarray(A, dtype=float)
    # Flatten b: subtracting an (n, 1) column from the (n,) product A @ x
    # would broadcast to an (n, n) matrix and give a wrong norm.
    target = np.asarray(b, dtype=float).ravel()

    x = Variable(A.shape[1])
    objective = Minimize(norm(A @ x - target))
    constraints = [0 <= x, sum(x) == 1]

    Problem(objective, constraints).solve()
    if x.value is None:
        raise ValueError("Least squares optimization did not return a solution")

    residual = A @ x.value - target
    # Keep only the components where the current weight exceeds the fit
    residual[residual > 0] = 0

    return np.linalg.norm(residual)

In [7]:
def process_time_window(preprocessed_tweets, vocabulary):
    """
    Build the graph representation of a single time window.

    Args:
        preprocessed_tweets: Tweets of the window, forwarded to
            generate_adjacency_matrix_dense (which reads its 'Tweet' column)
        vocabulary: Sorted list of unique words

    Returns:
        Dictionary with the keys 'vocabulary', 'adjacency_matrix',
        'tweets_edges', 'vector', 'vector_nodes', 'vector_edges' and
        'weighted_edges'
    """
    # Co-occurrence matrix plus the per-tweet edge lists
    adjacency, edges_per_tweet = generate_adjacency_matrix_dense(
        preprocessed_tweets, vocabulary
    )

    # Vector view of the same matrix
    edge_vector, nodes, edges, edge_weights = generate_vector(adjacency, vocabulary)

    period = {}
    period['vocabulary'] = vocabulary
    period['adjacency_matrix'] = adjacency
    period['tweets_edges'] = edges_per_tweet
    period['vector'] = edge_vector
    period['vector_nodes'] = nodes
    period['vector_edges'] = edges
    period['weighted_edges'] = edge_weights
    return period

In [8]:
def process_single_match_file(filename, save_dir = "preprocessed_data",
                              threshold=0.5, history_size=5):
    """Run event detection on every 1-minute window of one match file.

    The preprocessed tweets are loaded from the pickle cache in ``save_dir``
    when it exists; otherwise the CSV is preprocessed (and cached) first.

    Args:
        filename: Path of the match CSV file.
        save_dir: Directory holding the preprocessing cache.
        threshold: Score from which a window is flagged as an event.
        history_size: Maximum number of previous non-empty windows kept for
            the comparison.

    Returns:
        DataFrame with one row per 1-minute window and the columns
        'MatchID', 'Timestamp', 'is_event', 'score' and 'true_event'.
    """
    base_name = os.path.basename(filename).replace('.csv', '')
    save_path = os.path.join(save_dir, f"{base_name}_preprocessed.pkl")

    if os.path.exists(save_path):
        print(f'Loading existing file at {save_path}')
        # SECURITY: pickle.load can execute arbitrary code; only load cache
        # files that this notebook wrote itself.
        with open(save_path, 'rb') as f:
            data = pickle.load(f)
        df = data['processed_df']
        vocabulary = data['vocabulary']
    else:
        print(f'No file found, preprocessing...')
        df, vocabulary = preprocess_file(filename, save_dir)

    # Convert timestamp to datetime on a new frame (leaves the loaded one intact)
    df = df.assign(Timestamp=pd.to_datetime(df['Timestamp'], unit='ms'))

    # Group by 1-minute windows once; reused for labels and for the analysis
    windows = df.groupby(pd.Grouper(key='Timestamp', freq='1min'))

    # Ground truth: a window counts as an event if any of its tweets is
    # labelled as one; windows without tweets count as no event
    ground_truth = (windows['EventType']
                    .max()
                    .fillna(0)
                    .astype(int))

    # The match id is taken from the first row, as for every result row
    match_id = df['MatchID'].iloc[0] if len(df) else None

    # Initialize results storage
    results = []
    previous_periods = []

    print("Analyzing time windows...")
    for window_start, window_df in windows:
        print(f"Analyzing time at {window_start}, window size {len(window_df)}")
        if len(window_df) == 0:
            # Empty windows are reported as non-events and are not added
            # to the comparison history
            print(f'No tweets at time {window_start}')
            results.append({
                'MatchID': match_id,
                'Timestamp': window_start,
                'is_event': 0,
                'score': 0,
                'true_event': ground_truth.get(window_start, 0)
            })
            continue

        # Process current window
        print(f'Processing time window')
        current_period = process_time_window(window_df, vocabulary)

        # Detect events
        print(f'Detecting event')
        is_event, score = detect_event(
            current_period,
            previous_periods,
            threshold=threshold
        )

        # Store results with ground truth
        results.append({
            'MatchID': match_id,
            'Timestamp': window_start,
            'is_event': int(is_event),
            'score': score,
            'true_event': ground_truth.get(window_start, 0)
        })

        # Update previous periods, dropping the oldest beyond history_size
        previous_periods.append(current_period)
        if len(previous_periods) > history_size:
            previous_periods.pop(0)

    return pd.DataFrame(results)

def calculate_metrics(y_true, y_pred):
    """Return accuracy, precision, recall and F1 for the given labels.

    Precision, recall and F1 are computed with ``zero_division=0``.

    Returns:
        Dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1': f1_score(y_true, y_pred, zero_division=0),
    }

In [9]:
def main(data_dir="../challenge_data/train_tweets"):
    """Process all match files in directory

    Args:
        data_dir: Directory containing one CSV file per match.

    Returns:
        DataFrame with the results of all matches, sorted by 'MatchID' and
        'Timestamp'. It is empty when no match produced any result; in that
        case nothing is written to disk.
    """
    results = []

    # Only CSV files are match files; sorting makes the run order stable
    csv_files = sorted(
        name for name in os.listdir(data_dir) if name.endswith('.csv')
    )

    # Process each file
    for filename in csv_files:
        file_path = os.path.join(data_dir, filename)
        match_results = process_single_match_file(file_path)

        if match_results.empty:
            # No row to read the match id from and nothing worth saving
            print(f"No results for {filename}, skipping")
            continue

        results.append(match_results)

        # Optionally save intermediate results
        match_id = match_results['MatchID'].iloc[0]
        match_results.to_csv(f'results_match_{match_id}.csv', index=False)

    if not results:
        # pd.concat raises on an empty list
        print(f"\nNo match results found in {data_dir}")
        return pd.DataFrame()

    # Combine all results
    final_results = pd.concat(results, ignore_index=True)
    final_results = final_results.sort_values(['MatchID', 'Timestamp'])

    # Save final results
    final_results.to_csv('all_match_events.csv', index=False)
    print("\nAll results saved to all_match_events.csv")

    return final_results
    

In [None]:
def main_train(data_dir="../challenge_data/train_tweets") :
    """Process all match files and calculate metrics

    Args:
        data_dir: Directory containing one labelled CSV file per match.

    Returns:
        Tuple ``(overall_metrics, match_metrics)``: the metrics computed over
        all windows of all matches, and a dict mapping each match id to its
        own metrics. ``overall_metrics`` is an empty dict when no match
        produced any result.
    """
    all_true = []
    all_pred = []
    match_metrics = {}

    # Only CSV files are match files; sorting makes the run order stable
    csv_files = sorted(
        name for name in os.listdir(data_dir) if name.endswith('.csv')
    )

    # Process each file
    for filename in csv_files:
        file_path = os.path.join(data_dir, filename)
        match_results = process_single_match_file(file_path)

        if match_results is None or match_results.empty:
            # Nothing to score and no row to read the match id from
            print(f"No results for {filename}, skipping")
            continue

        # Calculate metrics for this match
        metrics = calculate_metrics(
            match_results['true_event'],
            match_results['is_event']
        )

        match_id = match_results['MatchID'].iloc[0]
        match_metrics[match_id] = metrics

        # Add to overall results
        all_true.extend(match_results['true_event'])
        all_pred.extend(match_results['is_event'])

        print(f"\nMetrics for match {match_id}:")
        for metric, value in metrics.items():
            print(f"{metric}: {value:.3f}")

    if not all_true:
        # No labelled window at all: there is nothing to score
        print(f"\nNo match results found in {data_dir}")
        return {}, match_metrics

    # Calculate overall metrics
    overall_metrics = calculate_metrics(all_true, all_pred)

    print("\nOverall metrics:")
    for metric, value in overall_metrics.items():
        print(f"{metric}: {value:.3f}")

    return overall_metrics, match_metrics

: 

In [None]:
# Run the training evaluation; the cell displays the returned
# (overall_metrics, match_metrics) tuple.
main_train()

Loading existing file at preprocessed_data/ArgentinaBelgium72_preprocessed.pkl
Analyzing time windows...
Analyzing time at 2014-07-05 15:50:00, window size 772
Processing time window
Generating adjacency matrix
Generating vocabulary
Getting non zero edges
Detecting event
Analyzing time at 2014-07-05 15:51:00, window size 724
Processing time window
Generating adjacency matrix
Generating vocabulary
Getting non zero edges
