In [7]:
import os
import re
import gensim.downloader as api
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import time
from tqdm import tqdm

In [8]:
# Fetch the NLTK resources (stop words, WordNet, punkt tokenizer tables); optional
# when they are already installed locally.
for nltk_package in ("stopwords", "wordnet", "punkt_tab"):
    nltk.download(nltk_package)

# Load the GloVe vectors through Gensim's downloader API and report how long it took.
embeddings_start_time = time.time()
print(f"Loading embeddings...")
# "glove-twitter-50": 50-dimensional GloVe embeddings
embeddings_model = api.load("glove-twitter-50")
print(f"Loading embeddings took {time.time() - embeddings_start_time :.2f} seconds")

[nltk_data] Downloading package stopwords to
[nltk_data]     /users/eleves-a/2022/adrien.goldszal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /users/eleves-a/2022/adrien.goldszal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /users/eleves-a/2022/adrien.goldszal/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loading embeddings...


KeyboardInterrupt: 

In [None]:
def get_avg_embedding(tweet, model, vector_size=50):
    """Compute the average word vector of a tweet.

    Args:
        tweet: Tweet text. Words are obtained by splitting on whitespace.
            Non-string values (e.g. the NaN pandas produces for an empty CSV
            field, or None) are treated as a tweet with no words.
        model: Mapping-like embedding lookup supporting ``word in model`` and
            ``model[word]``.
        vector_size: Length of the zero vector returned when no word of the
            tweet is found in ``model``.
    Returns:
        The element-wise mean of the vectors of all in-vocabulary words, or a
        zero vector of length ``vector_size`` when there are none.
    """
    # Guard against NaN/None: calling .split() on them would raise AttributeError.
    if not isinstance(tweet, str):
        return np.zeros(vector_size)

    word_vectors = [model[word] for word in tweet.split() if word in model]
    if not word_vectors:
        # No word of the tweet is in the vocabulary.
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)


def batch_preprocess_tweets(df, batch_size=1000):
    """Filter a tweet DataFrame, then run ``preprocess_text`` on every remaining tweet.

    Filtering follows
    https://www.lix.polytechnique.fr/~nikolentzos/files/meladianos_ecir18

        0) Removing rows whose tweet text is missing
        1) Removing retweets (text starting with "RT ")
        2) Removing duplicates (same text; the first occurrence is kept)
        3) Removing tweets containing an @ mention

    Args:
        df: DataFrame with a "Tweet" column holding the raw tweet text.
            The input frame is not modified.
        batch_size: Number of tweets processed between two progress-bar updates.
    Returns:
        A new DataFrame with only the rows that survived filtering (their
        original index labels are kept) and "Tweet" replaced by the
        preprocessed text.
    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    print("Starting tweet preprocessing...")
    total_start = time.time()

    # Initial data filtering
    print("\nFiltering tweets...")

    # 0. Drop rows without text: the na=False filters below would keep them and
    #    preprocess_text would then be handed a NaN instead of a string.
    processed_df = df[df["Tweet"].notna()]
    missing_removed = len(df) - len(processed_df)

    # 1. Remove retweets
    remaining = len(processed_df)
    processed_df = processed_df[~processed_df["Tweet"].str.startswith("RT ", na=False)]
    retweets_removed = remaining - len(processed_df)

    # 2. Remove duplicates
    remaining = len(processed_df)
    processed_df = processed_df.drop_duplicates(subset=["Tweet"])
    duplicates_removed = remaining - len(processed_df)

    # 3. Remove tweets with @-mentions ("@" is matched literally)
    remaining = len(processed_df)
    processed_df = processed_df[
        ~processed_df["Tweet"].str.contains("@", regex=False, na=False)
    ]
    mentions_removed = remaining - len(processed_df)

    # Work on an explicit copy so the column assignment below neither touches the
    # caller's frame nor triggers pandas' SettingWithCopyWarning.
    processed_df = processed_df.copy()

    # Print filtering statistics
    print(f"Removed {missing_removed} tweets with missing text")
    print(f"Removed {retweets_removed} retweets")
    print(f"Removed {duplicates_removed} duplicates")
    print(f"Removed {mentions_removed} tweets with @-mentions")
    print(f"Remaining tweets: {len(processed_df)}")

    # Process in batches; batching only controls how often the progress bar moves.
    tweets = processed_df["Tweet"]
    n_tweets = len(tweets)
    processed_tweets = []
    with tqdm(total=n_tweets, desc="Processing tweets") as pbar:
        for start_idx in range(0, n_tweets, batch_size):
            batch = tweets.iloc[start_idx : start_idx + batch_size]
            processed_tweets.extend(preprocess_text(tweet) for tweet in batch)
            pbar.update(len(batch))

    # Replace the raw text with the processed text
    processed_df["Tweet"] = processed_tweets

    # Print timing statistics
    total_time = time.time() - total_start
    print("\nPreprocessing complete!")
    print(f"Total processing time: {total_time:.2f} seconds")
    if n_tweets:
        # Skipped when every tweet was filtered out, to avoid dividing by zero.
        print(f"Average time per tweet: {total_time/n_tweets:.4f} seconds")

    return processed_df


# Lazily built (stop-word set, lemmatizer) pair shared by all preprocess_text calls.
_TEXT_RESOURCES = None


def _get_text_resources():
    """Return the cached English stop-word set and WordNet lemmatizer, building them on first use."""
    global _TEXT_RESOURCES
    if _TEXT_RESOURCES is None:
        _TEXT_RESOURCES = (set(stopwords.words("english")), WordNetLemmatizer())
    return _TEXT_RESOURCES


def preprocess_text(text):
    """
    Performs standard text preprocessing tasks, in this order:
    1. Lowercasing
    2. URL removal
    3. Punctuation and special character removal
    4. Digit removal
    5. Tokenization (NLTK word_tokenize)
    6. English stopword removal
    7. WordNet lemmatization (used instead of Porter stemming, which is less precise)

    Args:
        text: String containing the tweet text. Non-string values (e.g. NaN
            or None) are treated as empty text.
    Returns:
        Preprocessed text string: the remaining tokens joined by single spaces
        ("" for non-string input).
    """
    # Missing values read from CSV arrive as float NaN; .lower() would fail on them.
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)

    # Remove punctuation and special characters
    text = re.sub(r"[^\w\s]", "", text)

    # Remove numbers
    text = re.sub(r"\d+", "", text)

    # Tokenization : better tokenization through word_tokenize by NLTK
    tokens = word_tokenize(text)

    # The stop-word list and the lemmatizer do not depend on the input, so they
    # are built once and reused instead of being recreated for every tweet.
    stop_words, lemmatizer = _get_text_resources()

    # Remove stopwords, then lemmatize what is left
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    # Join tokens back into text
    return " ".join(tokens)

In [None]:
# Read all training files and concatenate them into one dataframe.
# os.listdir returns entries in arbitrary order and may contain hidden entries or
# sub-directories: sort for a reproducible row order (duplicate removal keeps the
# first occurrence) and keep regular, non-hidden files only.
train_dir = "challenge_data/train_tweets"
li = []
for filename in sorted(os.listdir(train_dir)):
    train_path = os.path.join(train_dir, filename)
    if filename.startswith(".") or not os.path.isfile(train_path):
        continue
    li.append(pd.read_csv(train_path))
df = pd.concat(li, ignore_index=True)

print(f"Number of tweets: {len(df)}")

# Apply preprocessing to each tweet
print("Preprocessing tweets...")
tweet_processing_start = time.time()
df = batch_preprocess_tweets(df)
print(f"Preprocessing took {time.time() - tweet_processing_start:.2f} seconds")
# Show a short preview only; dumping hundreds of rows buries the rest of the output.
print(df.head())
# Written under the name the following cell reads back ("preprocessed_tweets.csv"),
# so a fresh Restart & Run All does not pick up a stale file.
df.to_csv("preprocessed_tweets.csv", index=False)


Number of tweets: 5056050
Preprocessing tweets...
Starting tweet preprocessing...

Filtering tweets...
Removed 2619447 retweets
Removed 120425 duplicates
Removed 464706 tweets with @-mentions
Remaining tweets: 1851472


Processing tweets:  17%|█▋        | 318000/1851472 [01:41<08:15, 3093.65it/s]

Processing tweets:  96%|█████████▌| 1778000/1851472 [09:26<00:23, 3141.18it/s]


KeyboardInterrupt: 

In [9]:
# Reload the preprocessed tweets from disk and drop rows containing missing values.
df = pd.read_csv("preprocessed_tweets.csv")
original_count = len(df)
# reset_index makes the labels contiguous again after dropna(): a later cell
# concatenates df column-wise (axis=1) with a freshly built frame, and pandas
# aligns that concat on index labels, so a gap would shift the rows.
df = df.dropna().reset_index(drop=True)
rows_dropped = original_count - len(df)
print(f"Rows dropped {rows_dropped}")
df

Rows dropped 1


Unnamed: 0,ID,MatchID,PeriodID,EventType,Timestamp,Tweet
0,11_0,11,0,0,1404575400000,hope argentina lose would fun see belgium go f...
1,11_0,11,0,0,1404575400000,watch argentina v belgium th july live go link...
2,11_0,11,0,0,1404575400000,even though hate belgium beating u waffle damn...
3,11_0,11,0,0,1404575400000,lionel messi scored assisted goal world cup ar...
4,11_0,11,0,0,1404575400000,three new player argentina teamone calamitious...
...,...,...,...,...,...,...
1851467,18_129,18,129,0,1276876799000,sound like quite match morning bad know work f...
1851468,18_129,18,129,0,1276876799000,ok england tie score u advance right worldcup
1851469,18_129,18,129,0,1276876799000,real soccer match many boring game worldcup us...
1851470,18_129,18,129,0,1276876799000,woah awesome game soccer played worldcup


In [10]:
vector_size = 50  # Adjust based on the chosen GloVe model
print("Computing tweet embeddings...")
embedding_start = time.time()
tweet_vectors = np.vstack(
    [get_avg_embedding(tweet, embeddings_model, vector_size) for tweet in df["Tweet"]]
)
print(f"Embedding computation took {time.time() - embedding_start:.2f} seconds")
# Row i of tweet_vectors belongs to the i-th row of df. Reuse df's index so the
# axis=1 concat below (which aligns on index labels, not on position) pairs each
# vector with its own tweet even when df's index has gaps (e.g. after dropna()).
tweet_df = pd.DataFrame(tweet_vectors, index=df.index)

# Attach the vectors into the original dataframe
period_features = pd.concat([df, tweet_df], axis=1)
# Drop the columns that are not useful anymore
period_features = period_features.drop(columns=["Timestamp", "Tweet"])
# "Unnamed: 0" is a row index saved by an earlier to_csv call; if present it would
# be averaged per period and end up as a meaningless feature.
period_features = period_features.drop(columns=["Unnamed: 0"], errors="ignore")
# Group the tweets into their corresponding periods. This way we generate an average embedding vector for each period
period_features = (
    period_features.groupby(["MatchID", "PeriodID", "ID"]).mean().reset_index()
)


Computing tweet embeddings...
Embedding computation took 34.80 seconds


In [11]:
# Sanity check before caching: report how many NaN values the feature table holds.
# (As a bare expression in the middle of the cell the count was computed but never shown.)
nan_count = int(period_features.isna().sum().sum())
print(f"NaN values in period features: {nan_count}")
period_features.to_csv("period_features_glove_50.csv", index=False)

In [12]:
# Reload the per-period feature table from the CSV written by the previous cell.
period_features = pd.read_csv(filepath_or_buffer="period_features_glove_50.csv")

In [13]:
vector_size = 50  # Adjust based on the chosen GloVe model
# Concatenate all evaluation data first.
# os.listdir returns entries in arbitrary order and may contain hidden entries or
# sub-directories: iterate in sorted order and keep regular, non-hidden files only.
eval_dir = "challenge_data/eval_tweets"
eval_dfs = []
for fname in sorted(os.listdir(eval_dir)):
    eval_path = os.path.join(eval_dir, fname)
    if fname.startswith(".") or not os.path.isfile(eval_path):
        continue
    val_df = pd.read_csv(eval_path)
    val_df["Tweet"] = val_df["Tweet"].apply(preprocess_text)
    eval_dfs.append(val_df)

eval_df = pd.concat(eval_dfs, ignore_index=True)

# Compute embeddings for the concatenated evaluation data
tweet_vectors_test = np.vstack(
    [get_avg_embedding(tweet, embeddings_model, vector_size) for tweet in eval_df["Tweet"]]
)
# Share eval_df's index so the axis=1 concat below pairs each vector with its own row.
tweet_df_test = pd.DataFrame(tweet_vectors_test, index=eval_df.index)

# Attach the vectors into the original dataframe
period_features_test = pd.concat([eval_df, tweet_df_test], axis=1)
period_features_test = period_features_test.drop(columns=["Timestamp", "Tweet"])

# Group the tweets into their corresponding periods (one mean vector per period)
period_features_test = (
    period_features_test.groupby(["MatchID", "PeriodID", "ID"]).mean().reset_index()
)
print(period_features_test.head())
period_features_test.to_csv("period_features_test_glove_50.csv", index=False)


   MatchID  PeriodID   ID         0         1         2         3         4  \
0        6         0  6_0  0.163735  0.220645  0.072085 -0.371477 -0.026980   
1        6         1  6_1  0.180883  0.237059  0.078532 -0.384263 -0.038826   
2        6         2  6_2  0.201530  0.170647  0.078877 -0.399935  0.052759   
3        6         3  6_3  0.173831  0.247219  0.072275 -0.401311 -0.054211   
4        6         4  6_4  0.184585  0.270589  0.077353 -0.409590 -0.059731   

          5         6  ...        40        41        42        43        44  \
0  0.119863  0.323808  ... -0.804271 -0.164048  0.187336  0.186597  0.122361   
1  0.106418  0.310424  ... -0.816587 -0.171208  0.185524  0.204467  0.108616   
2  0.142922  0.293746  ... -0.761844 -0.150491  0.148775  0.255487  0.171711   
3  0.087987  0.309758  ... -0.796276 -0.160897  0.178360  0.221406  0.104271   
4  0.094002  0.300453  ... -0.814469 -0.145223  0.178493  0.240605  0.096409   

         45        46        47        48   

In [None]:
# NOTE(review): looks like a leftover scratch cell (it only prints a fixed string) — confirm it can be deleted.
print("test")

test
