# Pré-traitement des données

## Lecture du fichier CSV et création de la DataFrame

In [1]:
import pandas as pd

# The file is tab-separated and has no header row (header=None), so the four
# column names are supplied explicitly; the two label columns are loaded as
# categoricals.
opinion_df = pd.read_csv(
    "./data_website_tab.csv",
    sep="\t",
    header=None,
    names=["titre", "description", "position", "theme"],
    dtype={
        "titre": "object",
        "description": "object",
        "position": "category",
        "theme": "category",
    },
)
opinion_df.head()

Unnamed: 0,titre,description,position,theme
0,Le député Loïc Dombreval demande au gouverneme...,.@LoicDombreval se prononce pour une anticipa...,Agit pour les animaux,elevage
1,Ces maires ont attribué une délégation conditi...,Ces maires ont attribué à un conseiller ou un...,Agit pour les animaux,droit-animal
2,Tribune en faveur de la censure du débat publi...,"Soutien assumé à la convention « Déméter », d...",Agit contre les animaux,elevage
3,Proposition de loi n°1896 visant à abolir l’él...,"En France, il est interdit de fabriquer et ve...",Agit pour les animaux,elevage
4,3 Députés demandent au gouvernement d'intégrer...,Question écrite de M. Dimitri Houbron député ...,Agit pour les animaux,mer-pisciculture


## Vérification lignes vides

In [2]:
# Print the frame summary: number of entries, non-null count and dtype per column.
opinion_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7590 entries, 0 to 7589
Data columns (total 4 columns):
titre          7590 non-null object
description    7590 non-null object
position       7588 non-null category
theme          7590 non-null category
dtypes: category(2), object(2)
memory usage: 134.1+ KB


## Suppression des lignes vides

In [3]:
# Drop every row that has a missing value in at least one column
# (row-wise, "any" is dropna's default behaviour).
opinion_df_clean = opinion_df.dropna()
opinion_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7588 entries, 0 to 7589
Data columns (total 4 columns):
titre          7588 non-null object
description    7588 non-null object
position       7588 non-null category
theme          7588 non-null category
dtypes: category(2), object(2)
memory usage: 193.2+ KB


## Récupération des index des lignes dont la description ne contient que des espaces


In [4]:
# Select the rows whose description carries no text at all: after stripping
# surrounding whitespace nothing is left. This covers the empty string and any
# run of spaces/tabs, not only descriptions that are exactly one or two spaces.
opinion_df_spaces = opinion_df_clean[
    opinion_df_clean["description"].str.strip() == ""
]
opinion_df_spaces.index

Int64Index([  58,   59,   60,   61,  146,  176,  178,  180,  365,  392,  561,
             581,  583,  771,  772,  894,  895, 1038, 1098, 1355, 1356, 1407,
            1521, 1747, 2384, 2711, 2841, 3133, 4134, 5373],
           dtype='int64')

## Suppression des lignes ayant uniquement des espaces

In [5]:
# Remove the blank-description rows, identified by their index labels.
opinion_df_clean2 = opinion_df_clean.drop(labels=opinion_df_spaces.index, axis=0)

## Vérification résultats

In [6]:
# Print the summary of the cleaned frame to check the remaining row count.
opinion_df_clean2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7558 entries, 0 to 7589
Data columns (total 4 columns):
titre          7558 non-null object
description    7558 non-null object
position       7558 non-null category
theme          7558 non-null category
dtypes: category(2), object(2)
memory usage: 192.5+ KB


## Remplacement du jeu de données initial par le jeu de données nettoyé

In [7]:
# From here on `opinion_df` names the cleaned frame (plain rebinding, no copy is made).
opinion_df = opinion_df_clean2

## Nombre de lignes pour chaque position

In [8]:
# Number of rows per value of the `position` label.
opinion_df["position"].value_counts()

Agit pour les animaux        3899
Agit contre les animaux      2768
Penche pour les animaux       565
Penche contre les animaux     326
Name: position, dtype: int64

## Nombre de lignes pour chaque thème

In [9]:
# Number of rows per value of the `theme` label.
opinion_df["theme"].value_counts()

elevage                    2214
chasse                     1765
zoo-cirque                  748
mer-pisciculture            712
animaux-de-compagnie        687
droit-animal                653
corrida                     511
experimentation-animale     268
Name: theme, dtype: int64

## Création d'une fonction pour découper texte en mots

In [10]:
import nltk
from nltk.tokenize import word_tokenize

def split_into_tokens_nltk(desc, language="french"):
    """Split a text into word and punctuation tokens with NLTK.

    Args:
        desc: Text to tokenize.
        language: Name of the Punkt sentence model that ``word_tokenize``
            uses before splitting words. Defaults to ``"french"`` because the
            descriptions in this notebook are French; NLTK's own default is
            ``"english"``.

    Returns:
        The list of tokens returned by ``word_tokenize``.
    """
    return word_tokenize(desc, language=language)

## Tokenisation du texte dans la colonne description

In [11]:
# Preview the tokenizer on the first few descriptions only.
opinion_df["description"].head().apply(split_into_tokens_nltk)

0    [., @, LoicDombreval, se, prononce, pour, une,...
1    [Ces, maires, ont, attribué, à, un, conseiller...
2    [Soutien, assumé, à, la, convention, «, Déméte...
3    [En, France, ,, il, est, interdit, de, fabriqu...
4    [Question, écrite, de, M., Dimitri, Houbron, d...
Name: description, dtype: object

## Création d'une fonction pour la désuffixation

In [12]:
from nltk.stem.snowball import FrenchStemmer

# Single shared stemmer instance, reused for every call.
fs = FrenchStemmer()


def split_into_stems(desc):
    """Tokenize ``desc`` and return the French Snowball stem of each token.

    Tokenization is delegated to ``split_into_tokens_nltk`` so both helpers
    split text the same way.
    """
    return [fs.stem(token) for token in split_into_tokens_nltk(desc)]

## Désuffixation du texte dans la colonne description

In [13]:
# Preview the stemmer on the first few descriptions only.
opinion_df["description"].head().apply(split_into_stems)

0    [., @, loicdombreval, se, prononc, pour, une, ...
1    [ce, mair, ont, attribu, à, un, conseil, ou, u...
2    [soutien, assum, à, la, convent, «, démet, », ...
3    [en, franc, ,, il, est, interd, de, fabriqu, e...
4    [question, écrit, de, m., dimitr, houbron, dép...
Name: description, dtype: object

## Transformation du texte en vecteur avec CountVectorizer


In [14]:
from sklearn.feature_extraction.text import CountVectorizer
# Build the bag-of-words vocabulary from stemmed tokens; min_df=0.01 keeps only
# terms that occur in at least 1% of the descriptions. `fit` returns the
# vectorizer itself, so construction and fitting can be chained.
bow_transformer = CountVectorizer(
    tokenizer=split_into_stems,
    lowercase=True,
    min_df=0.01,
).fit(opinion_df["description"])
print(len(bow_transformer.vocabulary_))

2264


## Transformation des descriptions en vecteurs

In [15]:
# Encode every description as a row of term counts over the fitted vocabulary.
descriptions_bow = bow_transformer.transform(opinion_df.description)

## Transformation des fréquences de sac de mots et calcul du tfidf pour chaque attribut


In [16]:
from sklearn.feature_extraction.text import TfidfTransformer
# Fit the TF-IDF weighting on the count matrix and apply it in one step;
# `tfidf_transformer` remains available as the fitted transformer.
tfidf_transformer = TfidfTransformer()
descriptions_tfidf = tfidf_transformer.fit_transform(descriptions_bow)

## Récupération des noms de colonnes et ajout du préfixe l_ + transformation du sac de mots en Dataframe

In [17]:
# `CountVectorizer.get_feature_names` was removed in scikit-learn 1.2 in favour
# of `get_feature_names_out`. Use the new accessor when it exists and fall back
# to the old one so the cell runs on both old and recent installations.
if hasattr(bow_transformer, "get_feature_names_out"):
    feature_names = bow_transformer.get_feature_names_out()
else:
    feature_names = bow_transformer.get_feature_names()

# Column names are the vocabulary terms, each prefixed with "l_".
vocab = ["l_" + v for v in feature_names]

# Densify the sparse TF-IDF matrix into a DataFrame with one column per term.
tfidf_df = pd.DataFrame(descriptions_tfidf.toarray(), columns=vocab)

## Affichage des premières lignes de la DataFrame des sacs de mots

In [18]:
# Preview the first rows of the TF-IDF feature frame.
tfidf_df.head()

Unnamed: 0,l_!,l_#,l_%,l_&,l_',l_'',l_(,l_),"l_,",l_-,...,l_évoqu,l_être,l_œuf,l_œuvr,l_–,l_’,l_“,l_”,l_…,l_€
0,0.0,0.743203,0.0,0.0,0.0,0.0,0.053354,0.053362,0.094691,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.068274,0.068284,0.181756,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.073551,0.0,0.0,0.0,0.030724,0.030729,0.354436,0.0,...,0.0,0.0,0.0,0.0,0.041923,0.230113,0.0,0.0,0.0,0.0
3,0.0,0.0,0.032968,0.0,0.0,0.0,0.024101,0.024104,0.556045,0.0,...,0.0,0.025039,0.0,0.010186,0.0,0.37507,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.029153,0.0,0.054906,0.054913,0.185144,0.042344,...,0.0,0.019965,0.041323,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Création d'une nouvelle DataFrame intégrant les colonnes des sacs de mots dans la DataFrame d'origine


In [19]:
# `pd.concat(..., axis=1)` aligns rows on index labels, not on position.
# `opinion_df` still carries the original labels, which have gaps where rows
# were dropped during cleaning, whereas `tfidf_df` is numbered 0..n-1. Joining
# them as-is would pair rows after the first gap with the wrong TF-IDF vector
# and add NaN-filled rows. Both frames hold the same rows in the same order,
# so renumber both and join positionally.
assert len(opinion_df) == len(tfidf_df), "feature rows must match description rows"
opinion_descri_df = pd.concat(
    [opinion_df.reset_index(drop=True), tfidf_df.reset_index(drop=True)],
    axis=1,
)

In [20]:
# Preview the first rows of the combined frame (original columns followed by the "l_" feature columns).
opinion_descri_df.head()

Unnamed: 0,titre,description,position,theme,l_!,l_#,l_%,l_&,l_',l_'',...,l_évoqu,l_être,l_œuf,l_œuvr,l_–,l_’,l_“,l_”,l_…,l_€
0,Le député Loïc Dombreval demande au gouverneme...,.@LoicDombreval se prononce pour une anticipa...,Agit pour les animaux,elevage,0.0,0.743203,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Ces maires ont attribué une délégation conditi...,Ces maires ont attribué à un conseiller ou un...,Agit pour les animaux,droit-animal,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Tribune en faveur de la censure du débat publi...,"Soutien assumé à la convention « Déméter », d...",Agit contre les animaux,elevage,0.0,0.0,0.073551,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.041923,0.230113,0.0,0.0,0.0,0.0
3,Proposition de loi n°1896 visant à abolir l’él...,"En France, il est interdit de fabriquer et ve...",Agit pour les animaux,elevage,0.0,0.0,0.032968,0.0,0.0,0.0,...,0.0,0.025039,0.0,0.010186,0.0,0.37507,0.0,0.0,0.0,0.0
4,3 Députés demandent au gouvernement d'intégrer...,Question écrite de M. Dimitri Houbron député ...,Agit pour les animaux,mer-pisciculture,0.0,0.0,0.0,0.0,0.029153,0.0,...,0.0,0.019965,0.041323,0.0,0.0,0.0,0.0,0.0,0.0,0.0
