In [1]:
import numpy as np 
import pandas as pd 
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')


def remove_punct(text):
    """
    Remove all punctuation, special characters including '?', '!', and extra spaces from the text 
    while keeping letters and numbers, and return the cleaned text.
    """
    # Supprimer la ponctuation et les caractères spéciaux
    cleaned_text = re.sub(r'[?|!|\'|"|#]', '', text)
    # Supprimer tout ce qui n'est pas une lettre, un nombre, ou un espace
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', cleaned_text)
    # Remplacer les espaces multiples par un seul espace
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    # Convertir en minuscules pour standardiser le texte
    return cleaned_text.lower().strip()

def get_sentences(df, text_col, target_col):
    sentences = []
    for row in df.itertuples():
        # Tokenisation du texte en phrases et nettoyage préliminaire
        text = nltk.sent_tokenize(getattr(row, text_col).strip().lower())
        level = getattr(row, target_col)
        doc_id = getattr(row, 'Index')
        for sentence in text:
            # Nettoyage approfondi de chaque phrase pour éliminer la ponctuation et les caractères spéciaux
            cleaned_sentence = remove_punct(sentence)
            words = cleaned_sentence.split()
            avg_word_length = np.mean([len(word) for word in words]) if words else 0
            sentences.append((cleaned_sentence, len(words), round(avg_word_length, 2), level, doc_id))
    
    return sentences


train_data = get_sentences(train_data, 'text', 'level')
# Break the Data Frame into pieces sentence-wise
df = pd.DataFrame.from_records(train_data, columns=['text', 'num_of_words', 'avg_len_words', 'level', 'doc_id'])
display(df)

def clean_dataframe(df):
    # Vérification et affichage du nombre de doublons
    duplicate_rows = df.duplicated()
    num_duplicates = duplicate_rows.sum()
    print(f"Nombre de doublons : {num_duplicates}")
    if num_duplicates > 0:
        print("Doublons :")
        display(df[duplicate_rows])
    
    # Suppression des doublons
    df_cleaned = df.drop_duplicates()
    
    # Vérification et affichage du nombre de valeurs manquantes ou de textes vides
    missing_or_empty = df_cleaned.isnull() | (df_cleaned == "")
    num_missing_or_empty = missing_or_empty.sum().sum()
    print(f"Nombre de valeurs manquantes ou de textes vides : {num_missing_or_empty}")
    
    # Suppression des lignes avec des valeurs manquantes ou des textes vides
    # Remarque : cette opération dépend de la structure de votre DataFrame et de quelles colonnes vous souhaitez vérifier
    df_final = df_cleaned.dropna().replace("", np.nan).dropna()
    
    return df_final

# Application de la fonction 
df = clean_dataframe(df)
print("DataFrame après nettoyage :")
display(df.sort_values(by='num_of_words'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yanivbenichou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yanivbenichou/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/yanivbenichou/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,text,num_of_words,avg_len_words,level,doc_id
0,my friend meg was going out to work like every...,11,3.64,3,0
1,when she was approaching the corner at elm and...,28,4.25,3,0
2,meg decided to go over to them to see if she c...,16,3.25,3,0
3,suddenly the young woman came at her waving a gun,10,4.00,3,0
4,she pushed the gun into her stomach and starte...,21,4.62,3,0
...,...,...,...,...,...
114693,i get up at eight,5,2.60,0,20444
114694,at ten oclock i go sport for my childrens,9,3.67,0,20444
114695,at one oclock i have lunch on the fastfood,9,3.78,0,20444
114696,in afternoon i play with my childrens,7,4.43,0,20444


Nombre de doublons : 127
Doublons :


Unnamed: 0,text,num_of_words,avg_len_words,level,doc_id
284,,0,0.0,2,54
3328,,0,0.0,1,579
6451,,0,0.0,1,1142
6626,,0,0.0,2,1172
6628,,0,0.0,2,1172
...,...,...,...,...,...
111928,,0,0.0,3,19954
113394,,0,0.0,0,20221
113756,to from subject networking event,5,5.6,1,20288
114312,,0,0.0,3,20382


Nombre de valeurs manquantes ou de textes vides : 323
DataFrame après nettoyage :


Unnamed: 0,text,num_of_words,avg_len_words,level,doc_id
18482,bye,1,3.00,1,3281
89840,hi,1,2.00,0,16019
101420,hi,1,2.00,0,18091
13210,hi,1,2.00,0,2316
71041,christine,1,9.00,0,12714
...,...,...,...,...,...
38545,mr john smithi write you this letter for sugge...,153,5.13,4,6879
29347,dear sebastianregarding your convincing apply ...,160,4.89,4,5231
28253,carson county need you on monday our county ha...,183,4.72,4,5037
105297,property 1 cottage 200000 wonderful viewnot qu...,199,4.55,3,18777


In [3]:
df["text"][4]

'she pushed the gun into her stomach and started talking very fast about consumerism and the evils of the modern world'

In [2]:
remove_punct("I love this day !")

'i love this day'