In [55]:

import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import nltk
import re
import spacy
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from collections import Counter
from sklearn.model_selection import train_test_split

In [56]:
df = pd.read_csv('train_tweets.csv')

In [57]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [58]:
nombre_doublons = df['text'].duplicated().sum()
print(f"Doublons dans la colonne 'text' du DataFrame : {nombre_doublons}")

df = df.drop_duplicates(subset='text', keep='first')

Doublons dans la colonne 'text' du DataFrame : 110


In [59]:
df_test = pd.read_csv('test_tweets.csv')

In [60]:
df_test.head()

Unnamed: 0,id,keyword,location,text,target
0,0,,,Just happened a terrible car crash,
1,2,,,"Heard about #earthquake is different cities, s...",
2,3,,,"there is a forest fire at spot pond, geese are...",
3,9,,,Apocalypse lighting. #Spokane #wildfires,
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,


In [61]:
nombre_doublons = df_test['text'].duplicated().sum()
print(f"Doublons dans la colonne 'text' du DataFrame : {nombre_doublons}")

df_test = df_test.drop_duplicates(subset='text', keep='first')

Doublons dans la colonne 'text' du DataFrame : 20


Creation de la pipeline de donnée avec SpaCy

Construction des fonctions / Chargement du modèle

In [62]:
# Charger le modèle
nlp = spacy.load('en_core_web_sm')

custom_stopwords = set(["oh", "please", "help", "#", "@"])
stemmer = SnowballStemmer('english')

def clean_text(text):
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z0-9#@\s]', '', text)
    text = text.strip()
    return text

def spacy_pipeline(text):
    hashtags_mentions = re.findall(r'[#@]\w+', text)
    text = re.sub(r'[#@]\w+', '', text)

    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.text not in custom_stopwords and token.is_alpha]

    # Mettre les hashtags et mentions à la fin du traitement
    tokens.extend(hashtags_mentions)
    return ' '.join(tokens)


# Pipeline de traitement de texte
def text_pipeline(text):
    text = clean_text(text)
    return spacy_pipeline(text)

In [63]:
# Apply text preprocessing pipeline
df['processed_text'] = df['text'].apply(text_pipeline)
df_test['processed_text'] = df_test['text'].apply(text_pipeline)

In [64]:
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['processed_text']).toarray()
X_test = vectorizer.transform(df_test['processed_text']).toarray()

y = df['target'].values  # Ensure y is a numpy array

In [65]:
# shapes
print(X.shape)
print(y.shape)

(7503, 5000)
(7503,)


In [66]:
class Node:
    def __init__(self, feature=None, threshold=None, prediction=None):
        self.feature = feature        # Index of feature to split on
        self.threshold = threshold    # Threshold value for feature
        self.prediction = prediction  # Prediction value at leaf node
        self.left = None              # Left child Node
        self.right = None             # Right child Node

class CustomDecisionTree:
    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def fit(self, X, y):
        self.root = self._grow_tree(X, y)

    def predict(self, X):
        return [self._predict(inputs, self.root) for inputs in X]

    def _predict(self, inputs, node):
        while node.left and node.right:
            if inputs[node.feature] < node.threshold:
                node = node.left
            else:
                node = node.right
        return node.prediction

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))

        # Stopping criteria
        if (self.max_depth is not None and depth >= self.max_depth) or n_classes == 1 or n_samples < self.min_samples_split:
            return Node(prediction=self._most_common_label(y))

        # Find best split
        best_gini = 1.0
        best_feature = None
        best_threshold = None

        for feature in range(n_features):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                y_left = y[X[:, feature] < threshold]
                y_right = y[X[:, feature] >= threshold]
                gini = (len(y_left) * self._gini_impurity(y_left) + len(y_right) * self._gini_impurity(y_right)) / n_samples
                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature
                    best_threshold = threshold

        # Create split
        if best_gini < 1.0:
            left_idx = X[:, best_feature] < best_threshold
            X_left, y_left = X[left_idx], y[left_idx]
            X_right, y_right = X[~left_idx], y[~left_idx]
            node = Node(feature=best_feature, threshold=best_threshold)
            node.left = self._grow_tree(X_left, y_left, depth + 1)
            node.right = self._grow_tree(X_right, y_right, depth + 1)
            return node

        # Leaf node (no split performed)
        return Node(prediction=self._most_common_label(y))

    def _gini_impurity(self, y):
        if len(y) == 0:
            return 0
        class_counts = np.bincount(y)
        class_probs = class_counts / len(y)
        return 1.0 - np.sum(class_probs ** 2)

    def _most_common_label(self, y):
        counter = Counter(y)
        if len(counter) == 0:
            return None  # Handle the case where Counter is empty
        return counter.most_common(1)[0][0]


In [68]:
# Split the data for training and evaluation
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Custom Decision Tree
custom_tree = CustomDecisionTree(max_depth=10)
custom_tree.fit(X_train, y_train)

# Predict on the evaluation set
y_eval_pred = custom_tree.predict(X_eval)

# Print evaluation metrics
print('Custom Decision Tree - Evaluation Set')
print('Accuracy:', accuracy_score(y_eval, y_eval_pred))
print('Precision:', precision_score(y_eval, y_eval_pred))
print('Recall:', recall_score(y_eval, y_eval_pred))
print('F1 Score:', f1_score(y_eval, y_eval_pred))

# Predict on the test set
y_test_pred = custom_tree.predict(X_test)

# Test scores
print('Custom Decision Tree - Test Set')
print('Accuracy:', accuracy_score(df_test['target'], y_test_pred))
print('Precision:', precision_score(df_test['target'], y_test_pred))
print('Recall:', recall_score(df_test['target'], y_test_pred))
print('F1 Score:', f1_score(df_test['target'], y_test_pred))


# Save predictions to the test dataframe
df_test['target'] = y_test_pred
df_test.to_csv('test_predictions.csv', index=False)

Custom Decision Tree - Evaluation Set
Accuracy: 0.6575616255829447
Precision: 0.8192090395480226
Recall: 0.23125996810207336
F1 Score: 0.36069651741293535
Custom Decision Tree - Test Set
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
