# Import des Data

On importe toutes les bibliothèques.

In [40]:
import pandas            as pd
import numpy             as np
import seaborn           as sns
import matplotlib.pyplot as plt
import spacy
import re
import sklearn
import random

from spacy.vocab               import Vocab
from spacy.language            import Language
from spacy.tokens              import Token
from spacymoji                 import Emoji
from   sklearn.tree            import DecisionTreeClassifier
from   sklearn.model_selection import train_test_split
from   sklearn.metrics         import confusion_matrix






Importation du corpus.

In [8]:
corpus = pd.read_csv("C:/Users/a.tekiouk/Sujet_2/Sujet_2/DATA/chanel_5k.csv", sep=";", parse_dates=["publication_time"])

# on récupére les posts en anglais
mask = corpus["language"] == 'en'
corpus = corpus.loc[mask]
chanel = corpus['text'].tolist()
publication_time_chanel = corpus['publication_time'].tolist()

print(f"# documents in corpus: {len(chanel)}")

# documents in corpus: 2285


On définit les différents éléments de la *pipeline* `spacy` pour détecter les emojis et les hashtags.

In [5]:
@Language.component("hashtag")
def hashtag_pipe(
    doc : spacy.tokens.doc.Doc
) -> spacy.tokens.doc.Doc:
    """
    Spacy pipeline component that detects if a word is a hashtag or not.

    Parameters
    ----------
    doc : spacy.tokens.Doc
        The input document to process.

    Returns
    -------
    spacy.tokens.Doc
        The processed document with updated token attributes.
    """
    merged_hashtag = False
    while True:
        for token in doc:
            if token.text == '#':
                if token.head is not None:
                    start_index = token.i
                    end_index = start_index + 1
                    with doc.retokenize() as retokenizer:
                        retokenizer.merge(doc[start_index:end_index+1])
                        merged_hashtag = True
                        break
        if not merged_hashtag:
            break
        merged_hashtag = False
    return doc

# définition du pipe
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("emoji", first=True)
nlp.add_pipe("hashtag", first=True)
Token.set_extension("is_hashtag", getter=lambda token: token.text[0] in ("#"), force=True)

# Méthodes

In [38]:
def clear_trailing_hash(
    corpus: list[str]
) -> None:
    """
    Clear any trailing '#' character from each string in a list.
    
    Parameters
    ----------
    corpus : list of str
        List of strings to be processed.
    
    Returns
    -------
    None
        The function only modifies the input corpus list in place.
    """
    for i in range(len(corpus)):
        if(corpus[i][-1]=="#"):
            corpus[i] = corpus[i].rstrip(corpus[i][-1])

In [54]:
# TODO: voir si l'on peut passer en paramètre un objet spacy qui identifie le type de token,
# afin de n'écrire qu'une seule méthode (pour les hashtags, les emojis, etc...).
def top_hashtags(
    corpus: list,
    top: int = 5,
    nlp : spacy.lang.en.English
) -> pd.Series:
        """
    Retrieves the most frequent hashtags in the given corpus.
    
    Parameters
    ----------
    corpus : list
        The list of text documents to retrieve hashtags from.
    top : int, optional
        The number of top hashtags to return. Default is 5.
    nlp : spacy.lang.en.English
        A spacy language model with a custom pipe to detect hashtags.
    
    Returns
    -------
    pd.Series
        A pandas Series containing the count of each of the most frequent hashtags found in the corpus,
        sorted in descending order.
    """
    # retrieve all hashtags in corpus
    hashtags = []
    for i in corpus:
        doc = nlp(i)
        for token in doc:
            if token._.is_hashtag:
                hashtags.append(token.text)
    # count hashtags & return most frequents
    return (
        pd
        .Series(hashtags)
        .value_counts()
        .sort_values(ascending=False)
        .head(top)
    )

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 27)

In [52]:
def top_emojis(
    corpus: list,
    top: int = 5
) -> pd.Series:
    """Retrieves the most frequent emojis in the given corpus."""
    # Retrieve emojis in corpus
    emojis = []
    for i in corpus:
        doc = nlp(i)
        for token in doc:
            if token._.is_emoji:
                emojis.append(token.text)

    # count occurrences & return most frequent
    return (
        pd
        .Series(emojis)
        .value_counts()
        .sort_values(ascending=False)
        .head(top)
    )

In [19]:
def create_dummies(corpus: pd.Series,
                   element: str,
                   top: int = 5
) -> pd.DataFrame:
    """
    Create dummy encodings for most frequents text elements in the given corpus.

    Parameters
    ----------
    corpus : pd.Series
        The corpus on which the elements will be searched for.
    element : str, {'hashtag', 'emoji'}
        The text element to look for. Currently, only 'hashtag' and 'emoji' are supported.
    top : int, optional
        The number of top modalities to dummy encode. Default is 5.

    Returns
    -------
    pd.DataFrame
        The dummy encoding corresponding to the most frequent modalities of the specified elements.
    
    Raises
    ------
    ValueError: if ``element`` is not supported.
    """
    def _is_hashtag(token):
        return token._.is_hashtag
    def _is_emoji(token):
        return token._.is_emoji
    if element == 'hashtag':
        detector = _is_hashtag
    elif element == 'emoji':
        detector = _is_emoji
    else:
        raise ValueError("Only 'hashtag' and 'emoji' elements are supported.")

    top_elements = (
        corpus
        .apply(lambda text: [token.text for token in nlp(text) if detector(token)])
        .explode()
        .value_counts()
        .head(top)
        .index
    )

    dummy = pd.DataFrame(index=corpus.index)
    for e in top_elements:
        dummy[e] = corpus.apply(lambda text: 1 if e in [token.text for token in nlp(text) if detector(token)] else 0)

    return dummy

Unnamed: 0,text,#chanel,#fashion,#instagood,#beauty,#style
57,"Hello friend, there are many products you need...",0,0,0,0,0
5,"I couldn’t not kiss you, not sleep with you, n...",1,1,0,0,0
42,"We are a product wholesaler, we sell a lot of ...",0,0,0,0,0
11,zara jillstuart.jp ferragamo chanelofficial be...,1,0,0,0,0
35,"We are a wholesaler, here has the best price, ...",0,0,0,0,0
38,"We are a product wholesaler, we sell a lot of ...",0,0,0,0,0
4,Will be wearing this denim jacket on repeat th...,1,0,0,0,0
50,"We are a wholesaler, here has the best price, ...",0,0,0,0,0
33,"I am a wholesaler of brand products, if you ne...",0,0,0,0,0
25,Yes or No ? 🧡🧡👉👉kimberly.chanel.closet ​\nDoub...,0,0,0,0,0


In [30]:
def get_word_ratio(
    corpus: list, 
    nlp: spacy.lang.en.English
) -> list[float]:
    """
    Computes the ratio of words to hashtags in the text column of the given dataframe.

    Parameters
    ----------
    df: pd.DataFrame
        The dataframe containing the text column.
    nlp: spacy.lang.en.English
        The spacy English pipeline.

    Returns
    -------
    List[float]
        The list of word-to-hashtag ratios computed for each text in the dataframe.
    """
    ratio = []
    for i in corpus:
        doc = nlp(i)
        nb_word = 0
        nb_hash = 0
        for token in doc:
            if(token._.is_hashtag):
                nb_hash+=1
            else:
                nb_word+=1
        if((nb_hash+nb_word)!=0):
            ratio.append(nb_word/(nb_hash+nb_word))
        else:
            ratio.append(0)  
    return ratio

In [31]:
def get_caps_ratio(
    corpus: list,
    nlp: spacy.lang.en.English
) -> list[float]:
    """
    Calculates the ratio of capitalized words in each text of the given DataFrame.
    
    Parameters
    ----------
    corpus: list
        The list containing the text to analyze.
    
    Returns
    -------
    list[float]
        The list containing the ratios of capitalized words for each text.
    """
    ratio = []
    for i in corpus:
        doc = nlp(clean_hashtag(i))
        nb_lower = 0
        nb_caps = 0
        for token in doc:
            if(token.text.isupper()):
                nb_caps+=1
            else:
                nb_lower+=1
        if((nb_caps+nb_lower)!=0):
            ratio.append(nb_caps/(nb_caps+nb_lower))
        else:
            ratio.append(1)
    return ratio

In [None]:
def get_nb_punct(
        corpus: list,
        nlp: spacy.lang.en.English
        ) -> list:
    """
    Counts the number of punctuation symbols in each string of the given corpus.
    
    Parameters
    ----------
    corpus : list
        A list of strings to be processed.
    nlp : spacy.lang.en.English
        A spaCy English language processing pipeline instance.

    Returns
    -------
    list
        A list containing the number of punctuation symbols in each string of the corpus.
    """

    tot = []
    for i in corpus:
        doc = nlp(clean_hashtag(i))
        nb_punct = 0
        for token in doc:
            if(token.is_punct):
                nb_punct+=1
        tot.append(nb_punct)
    return tot

In [34]:
def del_double(
    corpus: list, 
    publication_time: list, 
    limit: float, 
    method: callable) -> list: 
    """
    Remove duplicated elements from a list of strings using Levenshtein distance.

    Parameters
    ----------
    corpus : list of str
        The list of strings to remove duplicates from.
    publication_time : list of timestamp
        The list of publication time for each element in the corpus.
    limit : float
        The distance threshold under which two elements are considered duplicates.
        Must be in the range [0, 1] if using normalized Levenshtein distance, or
        in the range [0, 100] if using classical Levenshtein distance.
    method : function
        The method to use to compute the distance between two strings.

    Returns
    -------
    list of str
        The list of strings with duplicates removed.
    """
    def clean_hashtag(t): #--text
        hashtag_pattern= re.compile("#[A-Za-z0-9_]+")
        return re.sub(hashtag_pattern,"", t) #On supprime tout les types de #
    t = txt.copy()
    distance = method #initialisiation de levenshtein avec la distance normalisée.
    i = 0
    r = len(t)
    while(i<r):
        r = len(t)
        j=i+1
        while(j<r):
            if(distance(clean_hashtag(t[i]).strip(),clean_hashtag(t[j]).strip()) <= s ): # Si la distance entre les deux élemens de la liste inf à seuil
                if(publication_time[i]<publication_time[j]):
                    del t[j] #delete
                    r = len(t) #on actualise la taille de la listes
                else:
                    del t[i]
                    r = len(t) #on actualise la taille de la listes
            else:
                j+=1
        i+=1
    return t

# Creation d'un sample de validation

In [None]:
import random
chanel_junk_valid = random.choices(chanel, k=700)

In [35]:
import textdistance
dist = textdistance.levenshtein.normalized_distance
chanel_junk_valid_dd = del_double(chanel_junk_valid,publication_time_chanel,0.5,dist)

NameError: name 'chanel_junk_valid' is not defined

In [None]:
len(chanel_junk_valid_dd)

In [None]:
chanel_junk_valid_df= pd.DataFrame()
chanel_junk_valid_df['text'] = chanel_junk_valid_dd

In [None]:
chanel_junk_valid_df.to_csv("C:/Users/a.tekiouk/Sujet_2/Sujet_2/DATA/chanel_junk_valid.csv")
# Ajout de la variable is_junk sur excel

### Sample de validation : 

In [None]:
chanel_junk_valid_new = pd.read_excel('C:/Users/a.tekiouk/Sujet_2/Sujet_2/DATA/chanel_junk_valid_new.xlsx')
chanel_junk_valid_new = chanel_junk_valid_new[['text','is_junk']]

In [None]:
chanel_junk_valid_new.head()

In [None]:
clean_right_side(chanel_junk_valid_new)

# Ajout de features

### Ratio de mot

In [None]:
word_ratio(chanel_junk_valid_new)

In [None]:
chanel_junk_valid_new.head()

### Ratio de caps

In [None]:
caps_ratio(chanel_junk_valid_new)

In [None]:
chanel_junk_valid_new.head()

# Nb de ponctuation

In [None]:
nb_punct(chanel_junk_valid_new)

In [None]:
chanel_junk_valid_new.head()

### Top hashtags junk

In [None]:
s = chanel_junk_valid_new[chanel_junk_valid_new['is_junk']==1]['text']

In [None]:
top_hashtags(s,15)

### Top emojis junk

In [None]:
top_emojis(s,15)

# Hashtag/emojis dummy

In [None]:
dummy_emojis(chanel_junk_valid_new,5)
dummy_hashtags(chanel_junk_valid_new,5)

# Correlation, boxplot

In [None]:


corr_df = chanel_junk_valid_new.corr(method='pearson')

plt.figure(figsize=(8, 6))
sns.heatmap(corr_df, annot=True, vmin=-1, vmax=1)
plt.show()

Boxplot word ratio 

In [None]:
sns.boxplot(data=chanel_junk_valid_new, x="is_junk", y="ratio_word",color= 'skyblue')

Boxplot caps ratio

In [None]:
sns.boxplot(data  = chanel_junk_valid_new,
            x     = "is_junk",
            y     = "ratio_caps",
            color = 'skyblue')

Boxplot nb ponctuation

In [None]:
sns.boxplot(data=chanel_junk_valid_new, x="is_junk", y="nb_punct")

# Arbre de decision

In [None]:
junk0 = chanel_junk_valid_new[chanel_junk_valid_new['is_junk']==0].index.values.tolist()
i_0  = random.choices(junk0, k=round(len(junk0)*0.75))

In [None]:
junk1 = chanel_junk_valid_new[chanel_junk_valid_new['is_junk']==1].index.values.tolist()
i_1  = random.choices(junk1, k=round(len(junk1)*0.40))

In [None]:
len(i_0)

In [None]:
len(i_1)

In [None]:
train  = chanel_junk_valid_new.iloc[[*i_0,*i_1]]

In [None]:
test = chanel_junk_valid_new.drop(index = [*i_0,*i_1])

In [None]:
len(train['text'])

In [None]:
len(test['text'])

In [None]:
len(chanel_junk_valid_new['text'])

In [None]:
X = chanel_junk_valid_new.drop(['text', 'is_junk'], axis=1)

In [None]:
y = chanel_junk_valid_new['is_junk']

In [None]:
X_train = chanel_junk_valid_new.drop([1,2,3], axis=0)

In [None]:
y_train = 

In [None]:
tree1 = DecisionTreeClassifier()

In [None]:
tree1.fit(X_train,y_train)

In [None]:
pred = tree1.predict(X_test)

In [None]:
confusion_matrix(y_true = y_test, y_pred = pred)