# Text processing with Spacy
The current notebook is a supplementary notebook as part of the text preprocessing activities I've explored on the dataset, in particular - it uses spacy library for linguistic processing and its in-built nlp pipeline, for producing pos tagged sequences. 
The code here is not used further - I keep it for reference purposes

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick 
import matplotlib.dates as mdates
from matplotlib.ticker import PercentFormatter, FuncFormatter
%matplotlib inline

import sqlalchemy

from cycler import cycler

import seaborn as sns
sns.set()

import googletrans
from googletrans import Translator

import regex as re
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk import tokenize # word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag, ne_chunk 
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from textacy import preprocessing
import textacy
from langdetect import detect

import spacy
nlp = spacy.load('en_core_web_sm')
import os 
# environment settings
pd.set_option('display.max_column',None)
pd.set_option('display.max_rows',None)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/asyagadzhalova/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/asyagadzhalova/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
os.getcwd()

'/Users/asyagadzhalova/Documents/GitHub/disaster_messages_classification/notebooks'

In [4]:
os.chdir('..')

In [5]:
df=pd.read_pickle(os.getcwd()+'/data/data_after_eda.pkl')

In [6]:
#df.isnull().sum()

## Text processing 

### Text preprocessing - text cleaning 
The actions on translation /where the text is not in english/, removal of noise, normalization

In [6]:
df['message'].head(20)

0     Weather update - a cold front from Cuba that c...
1               Is the Hurricane over or is it not over
2                       Looking for someone but no name
3     UN reports Leogane 80-90 destroyed. Only Hospi...
4     says: west side of Haiti, rest of the country ...
5                Information about the National Palace-
6                        Storm at sacred heart of jesus
7     Please, we need tents and water. We are in Sil...
8       I would like to receive the messages, thank you
9     I am in Croix-des-Bouquets. We have health iss...
10    There's nothing to eat and water, we starving ...
11    I am in Petionville. I need more information r...
12    I am in Thomassin number 32, in the area named...
13    Let's do it together, need food in Delma 75, i...
14    More information on the 4636 number in order f...
15    A Comitee in Delmas 19, Rue ( street ) Janvier...
16    We need food and water in Klecin 12. We are dy...
17    are you going to call me or do you want me

#### Text & noise cleaning 

In [7]:
def text_cleaner(serie):
    '''Function to normalize data, clean special characters, empty string, noise
    '''
    #lower case
    serie = serie.astype(str).str.lower()
    #cleaning
    serie= serie.str.replace('://www.([\w\-\.]+\S+)','') #replace URL
    serie= serie.str.replace('[^\w\s]|\b\w{1,2}\b|\d+','') #remove digit, less than 2 chars
    serie= serie.str.replace('\s{3,}','empty_string') #replace empty string 
    return serie

In [8]:
df['message_clean'] = text_cleaner(df['message'])

  serie= serie.str.replace('://www.([\w\-\.]+\S+)','') #replace URL
  serie= serie.str.replace('[^\w\s]|\b\w{1,2}\b|\d+','') #remove digit, less than 2 chars
  serie= serie.str.replace('\s{3,}','empty_string') #replace empty string


In [9]:
df['message_clean'].isna().sum()

0

We have to drop the messages which are empty

In [10]:
#drop the rows with empty string - they do not contain any information in the message
df.index[df['message_clean']=='empty_string'].values

array([ 7534, 12185, 12189, 12222])

In [11]:
df.drop(df.index[df['message_clean']=='empty_string'].values,axis=0, inplace=True)

In [12]:
df.shape

(26176, 40)

In [14]:
len(df['message'])

26176

In [15]:
df.reset_index(inplace=True)

In [17]:
df.drop('index',axis=1,inplace=True)

#### Language detection and translation of non-english to english

In [18]:
'''
Function to detect the language of a given text
'''
def detect_language(text):
    if len(text)>10:
        lang = detect(text)
        return lang

In [19]:
df['lang'] = df['message_clean'].map(detect_language)

In [20]:
df[df['lang']=='fr']

Unnamed: 0,id,message,genre,trans_ind,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,water,food,shelter,clothing,money,missing_people,refugees,death,other_aid,infrastructure_related,transport,buildings,electricity,tools,hospitals,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report,message_clean,lang
117,146,Dans la zone de Saint Etienne la route de Jacm...,direct,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,dans la zone de saint etienne la route de jacm...,fr
334,407,ADDRESS CYBER CAFE IS MS NET ADRESS ITS HAITI ...,direct,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,address cyber cafe is ms net adress its haiti ...,fr
459,565,"Bonsoir, on est a bon repos aprs la compagnie ...",direct,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,bonsoir on est a bon repos aprs la compagnie t...,fr
548,670,help at 3rlle du travail au college lamartiner...,direct,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,help at rlle du travail au college lamartinere...,fr
558,682,"Laboule 12 prolongee, section Prosi NO FOOD NO...",direct,1,1,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,laboule prolongee section prosi no food no st...,fr
575,700,URGENT CRECHE ORPHANAGE KAY TOUT TIMOUN CROIX ...,direct,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,urgent creche orphanage kay tout timoun croix ...,fr
647,796,"we need food, water, toilets and security forc...",direct,1,1,1,0,1,0,1,0,1,0,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,we need food water toilets and security forces...,fr
654,804,elle est vraiment malade et a besoin d'aide. u...,direct,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,elle est vraiment malade et a besoin daide uti...,fr
1339,1584,ok tout le monde qui victime. paix pager gens ...,direct,1,1,1,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,ok tout le monde qui victime paix pager gens y...,fr
1466,1718,"We need medical help at climatec, rue aubran, ...",direct,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,we need medical help at climatec rue aubran op...,fr


The translator is not working. The only option is to drop the rows with language different that english - they are 1.6% of the total data, so we can drop them

In [21]:
df[df['lang']!='en'].shape[0]

430

In [22]:
df['lang'].value_counts()

en    25746
fr       67
af       51
id       43
nl       39
it       33
pt       29
so       20
da       20
ca       20
cy       17
es       16
no       16
et       11
sq        7
tl        6
sv        6
sl        4
hr        4
pl        3
tr        2
fi        2
sk        2
ro        2
cs        1
sw        1
de        1
Name: lang, dtype: int64

In [None]:
df.shape

In [None]:
424/26176

In [23]:
df.drop(df.index[df['lang']!='en'].values,axis=0, inplace=True)

In [24]:
df.shape

(25746, 41)

### Linguistic processing 
That will involve tokenization, stemming or lemmatization, POS tagging with the spacy pipeline

In [25]:
def normalize(text):
    text = preprocessing.normalize.hyphenated_words(text)
    text = preprocessing.normalize.unicode(text)
    text = preprocessing.normalize.quotation_marks(text)
    return text

In [26]:
df['message_clean']=df['message_clean'].map(normalize)

In [27]:
'''
Input: The doc input of spacy pipeline 
Output data: Lemmas 
'''
def extract_lemmas(doc, **kwargs):
    return [t.lemma_ for t in textacy.extract.words(doc, **kwargs)]


def extract_noun_phrases(doc, preceding_pos=['NOUN'], sep='_'):
    patterns = []
    for pos in preceding_pos:
        patterns.append(f"POS:{pos} POS:NOUN:+")
    spans = textacy.extract.matches.token_matches(doc, patterns=patterns)
    return [sep.join([t.lemma_ for t in s]) for s in spans]


def extract_entities(doc, include_types=None, sep='_'):

    ents = textacy.extract.entities(doc,
             include_types=include_types,
             exclude_types=None,
             drop_determiners=True,
             min_freq=1)

    return [sep.join([t.lemma_ for t in e])+'/'+e.label_ for e in ents]


def extract_nlp(doc):
    return {
    'lemmas'          : extract_lemmas(doc,
                                     exclude_pos = ['PART', 'PUNCT',
                                        'DET', 'PRON', 'SYM', 'SPACE'],
                                     filter_stops = False),
    'adjs_verbs'      : extract_lemmas(doc, include_pos = ['ADJ', 'VERB']),
    'nouns'           : extract_lemmas(doc, include_pos = ['NOUN', 'PROPN']),
    'noun_phrases'    : extract_noun_phrases(doc, ['NOUN']),
    'adj_noun_phrases': extract_noun_phrases(doc, ['ADJ']),
    'entities'        : extract_entities(doc, ['PERSON', 'ORG', 'GPE', 'LOC'])
    }

In [28]:
nlp_columns = list(extract_nlp(nlp.make_doc('')).keys())
print(nlp_columns)

['lemmas', 'adjs_verbs', 'nouns', 'noun_phrases', 'adj_noun_phrases', 'entities']


In [29]:
for col in nlp_columns:
    df[col] = None

In [30]:
batch_size = 50

for i in range(0, len(df), batch_size):
    docs = nlp.pipe(df['message_clean'][i:i+batch_size])

    for j, doc in enumerate(docs):
        for col, values in extract_nlp(doc).items():
            df[col].iloc[i+j] = values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col].iloc[i+j] = values


In [31]:
df.head()

Unnamed: 0,id,message,genre,trans_ind,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,water,food,shelter,clothing,money,missing_people,refugees,death,other_aid,infrastructure_related,transport,buildings,electricity,tools,hospitals,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report,message_clean,lang,lemmas,adjs_verbs,nouns,noun_phrases,adj_noun_phrases,entities
0,2,Weather update - a cold front from Cuba that c...,direct,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,weather update a cold front from cuba that co...,en,"[weather, update, cold, front, from, cuba, cou...","[cold, pass]","[weather, update, cuba, haiti]",[weather_update],[cold_front],[cuba/GPE]
1,7,Is the Hurricane over or is it not over,direct,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,is the hurricane over or is it not over,en,"[be, hurricane, over, or, be, over]",[],[hurricane],[],[],[]
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,un reports leogane destroyed only hospital st...,en,"[un, report, leogane, destroy, only, hospital,...","[report, destroy, need]","[un, leogane, hospital, st, croix, functioning...",[croix_functioning],[],[un/ORG]
4,12,"says: west side of Haiti, rest of the country ...",direct,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,says west side of haiti rest of the country to...,en,"[say, west, side, of, haiti, rest, of, country...","[say, west, haiti]","[rest, country, today, tonight]",[country_today],"[west_side, haiti_rest]",[]
5,14,Information about the National Palace-,direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,information about the national palace,en,"[information, about, national, palace]",[],"[information, national, palace]",[],[],[national_palace/ORG]


In [32]:
df.to_pickle(os.getcwd()+'/data/data_after_processing_spacy.pkl')