In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import pandas as pd
import os
import re
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
import random
from tqdm import tqdm
from joblib import Parallel, delayed
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


import warnings

warnings.filterwarnings("ignore")

In [3]:
def mean_main_kernel(word_vectors, axis=0, n_clusters=3):
    "get the vector of the center of the kernel"
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(word_vectors)

    labels = kmeans.labels_

    largest_cluster_label = np.bincount(labels).argmax()
    largest_cluster_indices = np.where(labels == largest_cluster_label)[0]
    largest_cluster_vectors = word_vectors[largest_cluster_indices]
    largest_cluster_centroid = largest_cluster_vectors.mean(axis=0)

    return largest_cluster_centroid


# Function to compute the average word vector for a tweet
def get_avg_embedding(tweet, model, vector_size=200):
    tweet = str(tweet)
    words = tweet.split()  # Tokenize by whitespace
    word_vectors = [model[word] for word in words if word in model]
    if (
        not word_vectors
    ):  # If no words in the tweet are in the vocabulary, return a zero vector
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)


def get_main_kernel_embedding(df, n_clusters=3):
    if "EventType" in df.columns:
        vector_columns = ["EventType"] + [str(i) for i in range(200)]
    else:
        vector_columns = [str(i) for i in range(200)]

    df[vector_columns] = (
        df[vector_columns].apply(pd.to_numeric, errors="coerce").fillna(0)
    )
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(df[vector_columns])

    # Store cluster labels in a separate DataFrame
    clusters_df = pd.DataFrame(kmeans.labels_, columns=["cluster"], index=df.index)

    # Concatenate cluster labels with the original DataFrame
    df = pd.concat([df, clusters_df], axis=1)

    main_kernel_label = df["cluster"].value_counts().idxmax()
    df["is_main_kernel"] = (df["cluster"] == main_kernel_label).astype(int)
    main_kernel_vectors = df[df["cluster"] == main_kernel_label][vector_columns]
    avg_main_kernel_embedding = main_kernel_vectors.mean(axis=0)

    return avg_main_kernel_embedding, df

In [6]:
### DOWNLOAD MODEL

import gensim.downloader as api
import nltk

# Download some NLP models for processing, optional
nltk.download("stopwords")
nltk.download("wordnet")
# Load GloVe model with Gensim's API
embeddings_model = api.load("glove-twitter-200")  # 200-dimensional GloVe embeddings

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Charles/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Charles/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# TRAINING

## DOWNLOAD TRAIN DATASET TO ADD FEATURES

In [5]:
path_data = "../challenge_data/train_tweets"

files = []

for i, filename in enumerate(os.listdir(path_data)):
    if filename == ".ipynb_checkpoints":
        continue
    print(filename)
    df = pd.read_csv(f"{path_data}/" + filename)

    files.append(df)

df = pd.concat(files, ignore_index=True)
print("\n", f"Testing data shape: {df.shape}")

ArgentinaBelgium72.csv
ArgentinaGermanyFinal77.csv
AustraliaNetherlands29.csv
AustraliaSpain34.csv
BelgiumSouthKorea59.csv
CameroonBrazil36.csv
FranceGermany70.csv
FranceNigeria66.csv
GermanyAlgeria67.csv
GermanyBrazil74.csv
GermanyUSA57.csv
HondurasSwitzerland54.csv
MexicoCroatia37.csv
NetherlandsChile35.csv
PortugalGhana58.csv
USASlovenia2010.csv

 Testing data shape: (5056050, 6)


## CREATION AUTRES FEATURES

In [6]:
# Filtrer les tweets avec plus de 3 lettres identiques consécutives
regex = r"(.)\1{2,}"  # Cherche des séquences de 3 caractères identiques ou plus
df["Has_Repeated_Letters"] = df["Tweet"].apply(lambda x: bool(re.search(regex, x)))

# Expression régulière pour détecter les émojis
emoji_regex = r"[\U00010000-\U0010ffff]"
df["Emoji_Count"] = df["Tweet"].apply(lambda x: len(re.findall(emoji_regex, x)))

# Compter le nombre de points d'exclamation et d'interrogation
df["Exclamation_Count"] = df["Tweet"].apply(lambda x: x.count("!"))
df["Question_Count"] = df["Tweet"].apply(lambda x: x.count("?"))

# Intermediate features
df["starts_with_RT"] = df["Tweet"].str.startswith(
    "RT"
)  # ne jamais les mettre après aggrégation
df["isMention"] = df["Tweet"].apply(
    lambda x: "@" in x
)  # ne jamais les metter après aggrégation

### ADD FEATURES

df["nb_tweets_per_minute"] = df.groupby(by=["ID"])["ID"].transform("count")
df["nb_consecutive_letters_per_minute"] = df.groupby(by=["ID"])[
    "Has_Repeated_Letters"
].transform("count")
df["nb_smileys_per_minute"] = df.groupby(by=["ID"])["Emoji_Count"].transform("count")
df["Exclamation_Count_per_minute"] = df.groupby(by=["ID"])[
    "Exclamation_Count"
].transform("count")
df["Question_Count_per_minute"] = df.groupby(by=["ID"])["Question_Count"].transform(
    "count"
)
df["nb_RT_per_min"] = df.groupby("ID")["starts_with_RT"].transform("sum")
df["nb_@_per_min"] = df.groupby("ID")["isMention"].transform("sum")
df["Match_time"] = df["ID"].str.split("_").str[1].astype(int)


df_new_features = df.drop_duplicates(subset="ID")

## READ EMBEDDING + CONCATENATE WITH NEW FEATURES

In [8]:
### READ the already computed embeddings
li = []
for filename in os.listdir("kernel_with_model_teacher/dataframe_with_kernel_embedding"):
    imported_df = pd.read_csv(
        "kernel_with_model_teacher/dataframe_with_kernel_embedding/" + filename
    )
    li.append(imported_df)
imported_df = pd.concat(li, ignore_index=True)

### Merge with new features in df_new_features
imported_df["ID"] = imported_df["ID"].astype(str)
df_new_features["ID"] = df_new_features["ID"].astype(str)

columns_to_keep = [
    "nb_tweets_per_minute",
    "nb_RT_per_min",
    "nb_@_per_min",
    "Match_time",
    "nb_consecutive_letters_per_minute",
    "nb_smileys_per_minute",
    "Exclamation_Count_per_minute",
    "Question_Count_per_minute",
]
period_features = pd.concat(
    [imported_df.set_index("ID"), df_new_features.set_index("ID")[columns_to_keep]],
    axis=1,
).reset_index()

## CREATION df_X and df_y pour le training

In [9]:
liste_features_to_drop = ["MatchID", "ID", "Match_time"]

df_X = period_features.drop(columns=liste_features_to_drop + ["EventType"])
df_y = period_features["EventType"]

# CREATION PCA

In [10]:
# Sélection des colonnes pour la PCA
columns_to_pca = [str(i) for i in range(1, 200)]  # Colonnes '1' à '199'
X_pca_input = df_X[columns_to_pca]

# number of Principal Components
N = 50

pca = PCA(n_components=N)  # Réduction à N features
X_pca = pca.fit_transform(X_pca_input)

pca_columns = [f"PCA_{i+1}" for i in range(N)]
df_pca = pd.DataFrame(X_pca, columns=pca_columns, index=df_X.index)

columns_to_keep = [col for col in df_X.columns if col not in columns_to_pca]
df_final = pd.concat([df_X[columns_to_keep], df_pca], axis=1)

# Résultat
print(f"PCA terminée, DataFrame final créé avec {N} nouvelles features.")

df_X = df_final

PCA terminée, DataFrame final créé avec 50 nouvelles features.


# TESTING MODELS 

## RF

### GRID SEARCH

In [14]:
X = df_X.values
y = df_y.values

rf_clf = RandomForestClassifier(random_state=42)

param_grid = {
    "n_estimators": [100, 200],  # Nombre d'arbres dans la forêt
    "max_depth": [10, 20],  # Profondeur maximale des arbres
    "min_samples_split": [
        2,
        5,
    ],  # Nombre minimal d'échantillons nécessaires pour diviser un nœud
    "min_samples_leaf": [
        2,
        4,
    ],  # Nombre minimal d'échantillons nécessaires dans un feuille
    "max_features": [
        "auto",
        "sqrt",
        "log2",
    ],  # Nombre de caractéristiques à considérer pour la division
}

grid_search = GridSearchCV(
    estimator=rf_clf,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    verbose=2,
)

grid_search.fit(X, y)

print("Meilleurs paramètres trouvés :", grid_search.best_params_)
print("Meilleure précision moyenne (cross-validation) :", grid_search.best_score_)

best_rf_clf = grid_search.best_estimator_

rf_cv_scores = cross_val_score(best_rf_clf, X, y, cv=10, scoring="accuracy")
print(
    "Random Forest - Accuracy moyenne (cross-validation) avec Grid Search :",
    np.mean(rf_cv_scores),
)
print(
    "Random Forest - Ecart-type de l'Accuracy (cross-validation) avec Grid Search :",
    np.std(rf_cv_scores),
)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Meilleurs paramètres trouvés : {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
Meilleure précision moyenne (cross-validation) : 0.569517827048086
Random Forest - Accuracy moyenne (cross-validation) avec Grid Search : 0.5546728971962618
Random Forest - Ecart-type de l'Accuracy (cross-validation) avec Grid Search : 0.11025043947982338


## GRADIENT BOOSTING

### GRID SEARCH

In [None]:
X = df_X.values
y = df_y.values

gb_clf = GradientBoostingClassifier(random_state=50)

param_grid = {
    "n_estimators": [100],
    "learning_rate": [0.01, 0.1],
    "max_depth": [3, 4],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2],
    "subsample": [1.0],
}


grid_search = GridSearchCV(
    estimator=gb_clf,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    verbose=2,
)


grid_search.fit(X, y)


print("Meilleurs paramètres trouvés :", grid_search.best_params_)
print("Meilleure précision moyenne (cross-validation) :", grid_search.best_score_)


best_gb_clf = grid_search.best_estimator_


gb_cv_scores = cross_val_score(best_gb_clf, X, y, cv=10, scoring="accuracy")
print(
    "Gradient Boosting - Accuracy moyenne (cross-validation) avec Grid Search :",
    np.mean(gb_cv_scores),
)
print(
    "Gradient Boosting - Ecart-type de l'Accuracy (cross-validation) avec Grid Search :",
    np.std(gb_cv_scores),
)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Meilleurs paramètres trouvés : {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100, 'subsample': 1.0}
Meilleure précision moyenne (cross-validation) : 0.599463765895511


## LOGISTIC REGRESSION

### GRID SEARCH

In [19]:
X = df_X
y = df_y


param_grid = {
    "penalty": ["l1", "l2", "elasticnet", "none"],
    "C": [0.01, 0.1, 1, 10, 100],
    "solver": ["lbfgs", "liblinear", "saga"],
    "max_iter": [5000],
}


grid_search = GridSearchCV(
    estimator=LogisticRegression(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    verbose=1,
    n_jobs=-1,
)


grid_search.fit(X, y)


print("Meilleurs paramètres :", grid_search.best_params_)
print("Meilleure accuracy (cross-validation) :", grid_search.best_score_)
# Écart-type pour les meilleurs paramètres
best_index = grid_search.best_index_
best_std = grid_search.cv_results_["std_test_score"][best_index]
print("Standard deviation :", best_std)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
Meilleurs paramètres : {'C': 0.1, 'max_iter': 5000, 'penalty': 'l2', 'solver': 'liblinear'}
Meilleure accuracy (cross-validation) : 0.6368830571910088
Standard deviation : 0.052929487676251316


In [16]:
# X = df_X
# y = df_y

# mandatory_columns = ["PCA_" + str(i) for i in range(1, N + 1)]  # PC 1 to N + 1
# optional_columns = [col for col in X.columns if col not in mandatory_columns]

# num_combinations = 10


# def evaluate_combination(subset):
#     selected_columns = list(subset)
#     X_subset = X[mandatory_columns + selected_columns]

#     model = LogisticRegression(max_iter=10_000)

#     scores = cross_val_score(model, X_subset, y, cv=5, scoring="accuracy")
#     return subset, scores.mean()


# random_combinations = []
# for _ in range(num_combinations):
#     subset_size = random.randint(1, len(optional_columns))
#     subset = random.sample(optional_columns, subset_size)
#     random_combinations.append(subset)

# results = Parallel(n_jobs=-1)(
#     delayed(evaluate_combination)(subset) for subset in tqdm(random_combinations)
# )

# results.sort(key=lambda x: x[1], reverse=True)

# print("Top 5 combinaisons de features avec leur précision moyenne :")
# for subset, score in results[:5]:  # Top 5 combinaisons
#     selected_columns = list(subset)
#     print(f"Features: {selected_columns}, Mean Accuracy: {score:.4f}")

In [22]:
# CROSS VALIDATION SUR TOUTE LA DATA + TOUTES LES FEATURES
X = df_X.values
y = df_y.values

clf = LogisticRegression(random_state=42, max_iter=10_000)

cv_scores = cross_val_score(
    clf, X, y, cv=5, scoring="accuracy"
)  # 5-fold cross-validation

print("Accuracy moyenne (cross-validation) :", np.mean(cv_scores))
print("Accuracy std (cross-validation) :", np.std(cv_scores))

Accuracy moyenne (cross-validation) : 0.603142988465495
Accuracy std (cross-validation) : 0.06513938204080033


## XGBOOST

### GRID SEARCH

In [18]:
X = df_X.values
y = df_y.values

xgb_clf = XGBClassifier(
    random_state=42, n_estimators=100, use_label_encoder=False, eval_metric="logloss"
)

param_grid = {
    "n_estimators": [100, 200],  # Nombre d'estimateurs (arbres)
    "learning_rate": [0.1, 0.2],  # Taux d'apprentissage
    "max_depth": [3, 4],  # Profondeur maximale des arbres
    "min_child_weight": [1],  # Poids minimal d'un enfant
    "subsample": [
        1.0,
    ],  # Fraction d'échantillons utilisée pour entraîner chaque arbre
    "colsample_bytree": [
        0.8,
    ],  # Fraction des colonnes utilisées pour chaque arbre
}

grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    verbose=2,
)

grid_search.fit(X, y)

print("Meilleurs paramètres trouvés :", grid_search.best_params_)
print("Meilleure précision moyenne (cross-validation) :", grid_search.best_score_)

best_xgb_clf = grid_search.best_estimator_

xgb_cv_scores = cross_val_score(best_xgb_clf, X, y, cv=10, scoring="accuracy")
print(
    "XGBoost - Accuracy moyenne (cross-validation) avec Grid Search :",
    np.mean(xgb_cv_scores),
)
print(
    "XGBoost - Ecart-type de l'Accuracy (cross-validation) avec Grid Search :",
    np.std(xgb_cv_scores),
)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Meilleurs paramètres trouvés : {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 4, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 1.0}
Meilleure précision moyenne (cross-validation) : 0.5975913239510604
XGBoost - Accuracy moyenne (cross-validation) avec Grid Search : 0.5583980518625774
XGBoost - Ecart-type de l'Accuracy (cross-validation) avec Grid Search : 0.09414449374914556
