In [4]:
import pandas as pd
import numpy as np

from preprocessing.preparing import clean_data
from preprocessing.lemmatization import lemmatize
from preprocessing.stemming import stem

## Read data

In [2]:
# Load the training set, then split it into two single-column DataFrames:
# the labels (`target`) and the raw text (`text_col`).
df = pd.read_csv(filepath_or_buffer='data/train.csv')
target = df.loc[:, ['target']]
text_col = df.loc[:, ['text']]
# Bare last expression so the notebook renders the text column.
text_col

Unnamed: 0,text
0,Our Deeds are the Reason of this #earthquake M...
1,Forest fire near La Ronge Sask. Canada
2,All residents asked to 'shelter in place' are ...
3,"13,000 people receive #wildfires evacuation or..."
4,Just got sent this photo from Ruby #Alaska as ...
...,...
7608,Two giant cranes holding a bridge collapse int...
7609,@aria_ahrary @TheTawniest The out of control w...
7610,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611,Police investigating after an e-bike collided ...


## Full cleaning, stemming and lemmatization

In [3]:
# Run the project's cleaning routine on the text column with spelling
# correction switched off.
clean = clean_data(
    text_col,
    correct_spelling=False,
)

ic| 'To lower'
100%|██████████| 7613/7613 [00:00<00:00, 142372.20it/s]
ic| 'Remove mail'
100%|██████████| 7613/7613 [00:00<00:00, 68256.12it/s]
ic| 'Remove urls'
100%|██████████| 7613/7613 [00:00<00:00, 128624.29it/s]
ic| 'Remove Twitter references'
100%|██████████| 7613/7613 [00:00<00:00, 46515.87it/s]
ic| 'Remove punctuation'
100%|██████████| 7613/7613 [00:00<00:00, 57057.15it/s]
ic| 'Remove numbers'
100%|██████████| 7613/7613 [00:00<00:00, 109370.78it/s]
ic| 'Remove stopwords'
100%|██████████| 7613/7613 [01:19<00:00, 95.92it/s] 


In [3]:
# Drop entries whose cleaned text is missing. The keep-mask is taken from
# `clean` once and applied positionally to both objects so the labels stay
# row-aligned with the text.
keep_mask = clean.notna().values
target = target.iloc[keep_mask]
clean = clean.iloc[keep_mask]

In [None]:
# Persist the filtered labels and cleaned text side by side; the index is
# omitted from both files.
target.to_csv(path_or_buf='prepared/target.csv', index=False)
clean.to_csv(path_or_buf='prepared/clean.csv', index=False)

In [3]:
# Lemmatize the cleaned text and write the result to disk without the index.
lemmatized = lemmatize(clean)
lemmatized.to_csv(
    path_or_buf='prepared/lemmatized.csv',
    index=False,
)

0            deeds reason earthquake may allah forgive us
1                    forest fire near la range ask canada
2       residents asked shelter place notified officer...
3       people receive wildfires evacuation orders cal...
4       got sent photo ruby alaska smoke wildfires hou...
                              ...                        
7607    two giant cranks holding bridge collapse nearb...
7608    array control wild fires california even north...
7609                                  tuck volcano hawaii
7610    police investigation bike collided car little ...
7611    latest homes razed northern california wildlif...
Name: text, Length: 7612, dtype: object

In [4]:
# Stem the cleaned text and write the result to disk without the index.
stemmed = stem(clean)
stemmed.to_csv(
    path_or_buf='prepared/stemmed.csv',
    index=False,
)

0               deed reason earthquak may allah forgiv us
1                     forest fire near la rang ask canada
2       resid ask shelter place notifi offic evacu she...
3             peopl receiv wildfir evacu order california
4       got sent photo rubi alaska smoke wildfir hour ...
                              ...                        
7607       two giant crank hold bridg collaps nearbi home
7608    array control wild fire california even northe...
7609                                  tuck volcano hawaii
7610    polic investig bike collid car littl portug bi...
7611    latest home raze northern california wildlif a...
Name: text, Length: 7612, dtype: object

## Cleaning without stopword removal, followed by lemmatization

In [19]:
# Second cleaning pass over the original text column, this time keeping
# stopwords, saved to its own CSV.
# NOTE(review): this rebinds `clean`, so re-running the earlier
# lemmatize/stem cells after this one would process the stopword-retaining
# text instead. Unlike the first pass, no missing-entry filtering is applied
# and `correct_spelling` is left at clean_data's default -- confirm both are
# intended and that the rows still line up with prepared/target.csv.
clean = clean_data(
    text_col,
    remove_stopwords=False,
)
clean.to_csv(
    path_or_buf='prepared/clean_with_stopwords.csv',
    index=False,
)

In [None]:
# Lemmatize whatever `clean` currently holds and write it straight to disk
# (no intermediate variable is kept).
lemmatize(clean).to_csv(
    path_or_buf='prepared/lemmatized_with_stopwords.csv',
    index=False,
)