### Prétraitement des textes

##### 1. Charger les données 

In [34]:
import pandas as pd

# Read the raw train and test splits from their CSV files, one DataFrame each.
data_train, data_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/raw/train.csv', '../data/raw/test.csv')
)

##### 2. Normaliser les textes

- Mettre en minuscules

In [35]:
# Lower-case the 'text' column of both splits, overwriting the original values.
for _frame in (data_train, data_test):
    _frame['text'] = _frame['text'].str.lower()

- Supprimer la ponctuation (les chiffres sont conservés ; les points sont traités à l'étape suivante)

In [36]:
import re

# Matches every character that is neither a word character, whitespace, nor a
# literal dot. Digits and underscores are word characters, so they are kept.
_NON_WORD_CHARS = re.compile(r'[^\w\s\.]')


def _strip_punctuation(text):
    """Return `text` with every character matched by _NON_WORD_CHARS removed."""
    return _NON_WORD_CHARS.sub('', text)


# Write the result to a new 'text_cleaned' column; 'text' itself is left as is.
data_train['text_cleaned'] = data_train['text'].apply(_strip_punctuation)
data_test['text_cleaned'] = data_test['text'].apply(_strip_punctuation)

- Supprimer les points (en gardant les URLs)

In [37]:
# One or more dots that are followed by whitespace or by the end of the string.
# The lookahead leaves the whitespace itself untouched, and dots that sit inside
# a token (e.g. "usatoday.com") are not matched, so domain names survive.
# The previous pattern r'\. ' required a literal space after the dot, so a dot at
# the very end of a text, before a newline/tab, or inside "..." was left behind.
_TRAILING_DOTS = re.compile(r'\.+(?=\s|$)')


def _drop_sentence_dots(text):
    """Remove dots that end a word or the text, keeping dots inside tokens.

    Args:
        text: a single cleaned document (str).

    Returns:
        The same string without dots that precede whitespace or end of string.
    """
    return _TRAILING_DOTS.sub('', text)


data_train['text_cleaned'] = data_train['text_cleaned'].apply(_drop_sentence_dots)
data_test['text_cleaned'] = data_test['text_cleaned'].apply(_drop_sentence_dots)

- Supprimer les espaces multiples

In [39]:
import re

# Any run of one or more whitespace characters.
_WHITESPACE_RUN = re.compile(r'\s+')


def _squeeze_whitespace(text):
    """Replace each whitespace run with one space, then trim both ends."""
    return _WHITESPACE_RUN.sub(' ', text).strip()


data_train['text_cleaned'] = data_train['text_cleaned'].apply(_squeeze_whitespace)
data_test['text_cleaned'] = data_test['text_cleaned'].apply(_squeeze_whitespace)

##### 3. Supprimer les stopwords.

In [47]:
from nltk.corpus import stopwords

# English stop words from NLTK, stored as a set for membership tests.
stop_words = set(stopwords.words('english'))


def _without_stopwords(text):
    """Split `text` on whitespace and keep the words that are not in stop_words."""
    return [word for word in text.split() if word not in stop_words]


# NOTE(review): the NLTK list presumably contains apostrophised entries such as
# "don't"; if apostrophes were already stripped from 'text_cleaned', those
# entries can never match ("dont" stays in the tokens) - verify.
data_train['tokens'] = data_train['text_cleaned'].apply(_without_stopwords)
data_test['tokens'] = data_test['text_cleaned'].apply(_without_stopwords)


##### 4. Vérifier et valider la qualité finale des textes prétraités.

In [None]:
# Rebuild one space-separated string per row from its token list.
_join_with_space = " ".join

data_train['text_final'] = data_train['tokens'].apply(_join_with_space)
data_test['text_final'] = data_test['tokens'].apply(_join_with_space)

- Train

In [57]:
# Show the first 20 cleaned training texts for a visual check.
data_train['text_final'].iloc[:20]

0     wall st bears claw back black reuters reuters ...
1     carlyle looks toward commercial aerospace reut...
2     oil economy cloud stocks outlook reuters reute...
3     iraq halts oil exports main southern pipeline ...
4     oil prices soar alltime record posing new mena...
5     stocks end near year lows reuters reuters stoc...
6     money funds fell latest week ap ap assets nati...
7     fed minutes show dissent inflation usatoday.co...
8     safety net forbes.com forbes.com earning ph.d ...
9     wall st bears claw back black new york reuters...
10    oil economy cloud stocks outlook new york reut...
11    need opec pump moreiran gov tehran reuters ope...
12    nonopec nations outputpurnomo jakarta reuters ...
13    google ipo auction rocky start washingtonnew y...
14    dollar falls broadly record trade gap new york...
15    rescuing old saver think may need help elderly...
16    kids rule backtoschool purchasing power kids b...
17    market head toward value funds little caus

- Test

In [56]:
# Show the first 20 cleaned test texts for a visual check.
data_test['text_final'].iloc[:20]

0     fears n pension talks unions representing work...
1     race second private team sets launch date huma...
2     ky company wins grant study peptides ap ap com...
3     prediction unit helps forecast wildfires ap ap...
4     calif aims limit farmrelated smog ap ap southe...
5     open letter british copyright indoctrination s...
6     loosing war terrorism sven jaschan selfconfess...
7     foafkey foaf pgp key distribution bloom filter...
8     email scam targets police chief wiltshire poli...
9     card fraud unit nets 36000 cards first two yea...
10    group propose new highspeed wireless format lo...
11    apple launches graphics software video bundle ...
12    dutch retailer beats apple local download mark...
13    super ant colony hits australia giant 100km co...
14    socialites unite dolphin groups dolphin groups...
15    teenage rexs monster growth tyrannosaurus rex ...
16    scientists discover ganymede lumpy interior je...
17    mars rovers relay images mars express euro

##### 5. Sauvegarder les changements

In [62]:
# Save the cleaned text together with its label columns for both splits.
from pathlib import Path

# to_csv does not create missing parent directories, so make sure the output
# folder exists (e.g. on a fresh checkout) before writing.
Path('../data/processed').mkdir(parents=True, exist_ok=True)

_export_columns = ['text_final', 'label', 'label_text']

# index=False: the row index is not written to the files.
data_train[_export_columns].to_csv('../data/processed/train_text_cleaned.csv', index=False)
data_test[_export_columns].to_csv('../data/processed/test_text_cleaned.csv', index=False)