## Imports

In [16]:
import re
from collections import Counter
import numpy as np
import pandas as pd
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import spacy


## Pré-traitement

In [17]:
df = pd.read_csv('train_tweets.csv')

On importe le csv d'entrainement.

In [18]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


On affiche les 5 premières lignes pour visualiser le dataset.

In [19]:
nombre_doublons = df['text'].duplicated().sum()
print(f"Doublons dans la colonne 'text' du DataFrame : {nombre_doublons}")

df = df.drop_duplicates(subset='text', keep='first')

Doublons dans la colonne 'text' du DataFrame : 110


On supprime les doublons du dataset en les cherchant dans la colonne *text*.

In [20]:
df_test = pd.read_csv('test_tweets.csv')

On importe le dataset de test.

In [21]:
df_test.head()

Unnamed: 0,id,keyword,location,text,target
0,0,,,Just happened a terrible car crash,
1,2,,,"Heard about #earthquake is different cities, s...",
2,3,,,"there is a forest fire at spot pond, geese are...",
3,9,,,Apocalypse lighting. #Spokane #wildfires,
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,


On affiche encore une fois les 5 premières lignes afin de le visualiser.

In [22]:
nombre_doublons = df_test['text'].duplicated().sum()
print(f"Doublons dans la colonne 'text' du DataFrame : {nombre_doublons}")

df_test = df_test.drop_duplicates(subset='text', keep='first')

Doublons dans la colonne 'text' du DataFrame : 20


On supprime également les doublons.

## Creation de la pipeline de donnée avec SpaCy

### Construction des fonctions / Chargement du modèle

In [23]:
# Charger le modèle
nlp = spacy.load('en_core_web_sm')

custom_stopwords = set(["oh", "please", "help", "#", "@"])
stemmer = SnowballStemmer('english')

def clean_text(text):
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z0-9#@\s]', '', text)
    text = text.strip()
    return text

def spacy_pipeline(text):
    hashtags_mentions = re.findall(r'[#@]\w+', text)
    text = re.sub(r'[#@]\w+', '', text)

    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.text not in custom_stopwords and token.is_alpha]

    # Mettre les hashtags et mentions à la fin du traitement
    tokens.extend(hashtags_mentions)
    return ' '.join(tokens)


# Pipeline de traitement de texte
def text_pipeline(text):
    text = clean_text(text)
    return spacy_pipeline(text)

Ici on construit notre pipeline avec quelques mots custom qui ont été identifiés dans le dataset.

In [24]:
# Apply the optimized spacy_pipeline to the 'text' column
df['processed_text'] = df['text'].apply(spacy_pipeline)
df_test['processed_text'] = df_test['text'].apply(spacy_pipeline)


Ici on applique la pipeline sur notre colonne *text* de nos deux dataset en créeant une nouvelle colonne nommée *processed_text* qui contient les modifications.

In [25]:
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['processed_text']).toarray()
X_test = vectorizer.transform(df_test['processed_text']).toarray()

y = df['target'].values

On vectorize la nouvelle colonne des deux dataset afin d'appliquer l'algorithme.

In [26]:
class Node:
    def __init__(self, feature=None, threshold=None, prediction=None):
        self.feature = feature
        self.threshold = threshold
        self.prediction = prediction
        self.left = None
        self.right = None

class CustomDecisionTree:
    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def fit(self, X, y):
        self.root = self._grow_tree(X, y)

    def predict(self, X):
        return [self._predict(inputs, self.root) for inputs in X]

    def _predict(self, inputs, node):
        while node.left and node.right:
            if inputs[node.feature] < node.threshold:
                node = node.left
            else:
                node = node.right
        return node.prediction

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))

        # Stopping criteria
        if (self.max_depth is not None and depth >= self.max_depth) or n_classes == 1 or n_samples < self.min_samples_split:
            return Node(prediction=self._most_common_label(y))

        # Find best split
        best_gini = 1.0
        best_feature = None
        best_threshold = None

        for feature in range(n_features):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                y_left = y[X[:, feature] < threshold]
                y_right = y[X[:, feature] >= threshold]
                gini = (len(y_left) * self._gini_impurity(y_left) + len(y_right) * self._gini_impurity(y_right)) / n_samples
                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature
                    best_threshold = threshold

        # Create split
        if best_gini < 1.0:
            left_idx = X[:, best_feature] < best_threshold
            X_left, y_left = X[left_idx], y[left_idx]
            X_right, y_right = X[~left_idx], y[~left_idx]
            node = Node(feature=best_feature, threshold=best_threshold)
            node.left = self._grow_tree(X_left, y_left, depth + 1)
            node.right = self._grow_tree(X_right, y_right, depth + 1)
            return node

        return Node(prediction=self._most_common_label(y))

    def _gini_impurity(self, y):
        if len(y) == 0:
            return 0
        class_counts = np.bincount(y)
        class_probs = class_counts / len(y)
        return 1.0 - np.sum(class_probs ** 2)

    def _most_common_label(self, y):
        counter = Counter(y)
        if len(counter) == 0:
            return None
        return counter.most_common(1)[0][0]


Création de notre classe custom de decision tree.

In [27]:
# Split the data for training and evaluation
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.2, stratify=y)

# Train the Custom Decision Tree
custom_tree = CustomDecisionTree(max_depth=5)
custom_tree.fit(X_train, y_train)

# Predict on the evaluation set
y_eval_pred = custom_tree.predict(X_eval)

# Print evaluation metrics
print('Custom Decision Tree - Evaluation Set')
print('Accuracy:', accuracy_score(y_eval, y_eval_pred))
print('Precision:', precision_score(y_eval, y_eval_pred))
print('Recall:', recall_score(y_eval, y_eval_pred))
print('F1 Score:', f1_score(y_eval, y_eval_pred))

# Predict on the test set
y_test_pred = custom_tree.predict(X_test)

# Save predictions to the test dataframe
df_test['target'] = y_test_pred
df_test.to_csv('test_predictions.csv', index=False)

Custom Decision Tree - Evaluation Set
Accuracy: 0.6282478347768155
Precision: 0.847457627118644
Recall: 0.15625
F1 Score: 0.2638522427440633


On peut voir grace à ces métriques que notre algorithme custom a de bons score au niveau de l'accuracy et de la precision mais que le recall est assez bas à 0.19. Cela nous donne un score F1 global de 0.30 ce qui n'est pas un bon score.

In [28]:
# Train the Decision Tree
sklearn_tree = DecisionTreeClassifier(max_depth=5)
sklearn_tree.fit(X_train, y_train)

# Predict on the evaluation set
y_eval_pred = sklearn_tree.predict(X_eval)

# Print evaluation metrics
print('Sklearn Decision Tree - Evaluation Set')
print('Accuracy:', accuracy_score(y_eval, y_eval_pred))
print('Precision:', precision_score(y_eval, y_eval_pred))
print('Recall:', recall_score(y_eval, y_eval_pred))
print('F1 Score:', f1_score(y_eval, y_eval_pred))

# Predict on the test set
y_test_pred = sklearn_tree.predict(X_test)

Sklearn Decision Tree - Evaluation Set
Accuracy: 0.6289140572951366
Precision: 0.8487394957983193
Recall: 0.1578125
F1 Score: 0.26613965744400525


La classe SKLearn nous donne des résultats similaires avec des paramètres identiques. On peut voir cependant que le recall est plus bas ce qui fait que notre score F1 est plus bas aussi.