https://towardsdatascience.com/preprocessing-text-data-using-python-576206753c28

1. Importing Libraries along with our Data
2. Expanding Contractions
3. Language Detection
4. Tokenization
5. Converting all Characters to Lowercase
6. Removing Punctuations
7. Removing Stopwords
8. Parts of Speech Tagging
9. Lemmatization

In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import fasttext
import spacy
import contractions
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from nltk import word_tokenize, pos_tag
plt.xticks(rotation=70)
pd.options.mode.chained_assignment = None
pd.set_option('display.max_colwidth', 100)
%matplotlib inline
import importlib
#import pickle


In [2]:
#df = pd.read_csv('C:/Users/jeanl/Desktop/trial.csv',index_col=False)
df = pd.read_pickle("C:/Users/mwamb/OneDrive/Desktop/tenthdimensionanalytics/WFP/wfpdata")

#"C:/Users/David/Desktop/tenthdimensionanalytics/WFP/wfpdata"

In [3]:
print(df['words'])

0    FOOD SECURITY \n AND NUTRITION \nIN THE WORLD\nTHE STATE OF \nSAFEGUARDING AGAINST \nECONOMIC SL...
Name: words, dtype: object


## Text Pre-Processing
### Expanding Contractions

In [4]:
df['no_contract'] = df['words'].apply(lambda x: [contractions.fix(word) for word in x.split()])
df

Unnamed: 0,cluster,words,no_contract
0,WFP,FOOD SECURITY \n AND NUTRITION \nIN THE WORLD\nTHE STATE OF \nSAFEGUARDING AGAINST \nECONOMIC SL...,"[FOOD, SECURITY, AND, NUTRITION, IN, THE, WORLD, THE, STATE, OF, SAFEGUARDING, AGAINST, ECONOMIC..."


In [5]:
df['words_str'] = [' '.join(map(str, l)) for l in df['no_contract']]
df

Unnamed: 0,cluster,words,no_contract,words_str
0,WFP,FOOD SECURITY \n AND NUTRITION \nIN THE WORLD\nTHE STATE OF \nSAFEGUARDING AGAINST \nECONOMIC SL...,"[FOOD, SECURITY, AND, NUTRITION, IN, THE, WORLD, THE, STATE, OF, SAFEGUARDING, AGAINST, ECONOMIC...",FOOD SECURITY AND NUTRITION IN THE WORLD THE STATE OF SAFEGUARDING AGAINST ECONOMIC SLOWDOWNS AN...


## English Language Detection

## Tokenization

In [6]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mwamb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
import nltk
#nltk.download()

In [8]:
df['tokenized'] = df['words_str'].apply(word_tokenize)
df

Unnamed: 0,cluster,words,no_contract,words_str,tokenized
0,WFP,FOOD SECURITY \n AND NUTRITION \nIN THE WORLD\nTHE STATE OF \nSAFEGUARDING AGAINST \nECONOMIC SL...,"[FOOD, SECURITY, AND, NUTRITION, IN, THE, WORLD, THE, STATE, OF, SAFEGUARDING, AGAINST, ECONOMIC...",FOOD SECURITY AND NUTRITION IN THE WORLD THE STATE OF SAFEGUARDING AGAINST ECONOMIC SLOWDOWNS AN...,"[FOOD, SECURITY, AND, NUTRITION, IN, THE, WORLD, THE, STATE, OF, SAFEGUARDING, AGAINST, ECONOMIC..."


## Converting all Characters to Lowercase


In [9]:
df['lower'] = df['tokenized'].apply(lambda x: [word.lower() for word in x])
df

Unnamed: 0,cluster,words,no_contract,words_str,tokenized,lower
0,WFP,FOOD SECURITY \n AND NUTRITION \nIN THE WORLD\nTHE STATE OF \nSAFEGUARDING AGAINST \nECONOMIC SL...,"[FOOD, SECURITY, AND, NUTRITION, IN, THE, WORLD, THE, STATE, OF, SAFEGUARDING, AGAINST, ECONOMIC...",FOOD SECURITY AND NUTRITION IN THE WORLD THE STATE OF SAFEGUARDING AGAINST ECONOMIC SLOWDOWNS AN...,"[FOOD, SECURITY, AND, NUTRITION, IN, THE, WORLD, THE, STATE, OF, SAFEGUARDING, AGAINST, ECONOMIC...","[food, security, and, nutrition, in, the, world, the, state, of, safeguarding, against, economic..."


## Removing Punctuations

In [10]:
punc = string.punctuation
df['no_punc'] = df['lower'].apply(lambda x: [word for word in x if word not in punc])
df

Unnamed: 0,cluster,words,no_contract,words_str,tokenized,lower,no_punc
0,WFP,FOOD SECURITY \n AND NUTRITION \nIN THE WORLD\nTHE STATE OF \nSAFEGUARDING AGAINST \nECONOMIC SL...,"[FOOD, SECURITY, AND, NUTRITION, IN, THE, WORLD, THE, STATE, OF, SAFEGUARDING, AGAINST, ECONOMIC...",FOOD SECURITY AND NUTRITION IN THE WORLD THE STATE OF SAFEGUARDING AGAINST ECONOMIC SLOWDOWNS AN...,"[FOOD, SECURITY, AND, NUTRITION, IN, THE, WORLD, THE, STATE, OF, SAFEGUARDING, AGAINST, ECONOMIC...","[food, security, and, nutrition, in, the, world, the, state, of, safeguarding, against, economic...","[food, security, and, nutrition, in, the, world, the, state, of, safeguarding, against, economic..."


## Removing Stopwords


In [11]:
stop_words = set(stopwords.words('english'))
df['stopwords_removed'] = df['no_punc'].apply(lambda x: [word for word in x if word not in stop_words])
df

Unnamed: 0,cluster,words,no_contract,words_str,tokenized,lower,no_punc,stopwords_removed
0,WFP,FOOD SECURITY \n AND NUTRITION \nIN THE WORLD\nTHE STATE OF \nSAFEGUARDING AGAINST \nECONOMIC SL...,"[FOOD, SECURITY, AND, NUTRITION, IN, THE, WORLD, THE, STATE, OF, SAFEGUARDING, AGAINST, ECONOMIC...",FOOD SECURITY AND NUTRITION IN THE WORLD THE STATE OF SAFEGUARDING AGAINST ECONOMIC SLOWDOWNS AN...,"[FOOD, SECURITY, AND, NUTRITION, IN, THE, WORLD, THE, STATE, OF, SAFEGUARDING, AGAINST, ECONOMIC...","[food, security, and, nutrition, in, the, world, the, state, of, safeguarding, against, economic...","[food, security, and, nutrition, in, the, world, the, state, of, safeguarding, against, economic...","[food, security, nutrition, world, state, safeguarding, economic, slowdowns, downturnsdemocratic..."


https://heartbeat.fritz.ai/nlp-chronicles-intro-to-spacy-34949f1bc118#:~:text=Chunking%20is%20the%20process%20of,noun%20of%20a%20given%20chunk.

In [12]:


import en_core_web_sm
nlp = en_core_web_sm.load()


## Dependency Parsing

In [13]:
df['words_str'].str.lower()

0    food security and nutrition in the world the state of safeguarding against economic slowdowns an...
Name: words_str, dtype: object

In [14]:
text = df['words_str'].str.lower()

In [15]:
print(type(text))

<class 'pandas.core.series.Series'>


In [16]:
text1= text.to_string() 

In [17]:
doc = nlp(text1)
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_, [child for child in token.children])


0 ROOT 0 PUNCT [   , security, ...]
     0 PUNCT []
food compound security NOUN []
security nsubj 0 PUNCT [food, and, nutrition, in, state, an]
and cc security NOUN []
nutrition conj security NOUN []
in prep security NOUN [world]
the det world NOUN []
world pobj in ADP [the]
the det state NOUN []
state appos security NOUN [the, of]
of prep state NOUN [safeguarding]
safeguarding pcomp of ADP [against]
against prep safeguarding VERB [slowdowns]
economic amod slowdowns NOUN []
slowdowns pobj against ADP [economic]
an appos security NOUN []
... punct 0 PUNCT []


## Visualising Dependency Parsing

https://github.com/Manikanta-Munnangi/Natural-Language-Processing-with-spaCy

In [18]:
from spacy import displacy


In [19]:
#displacy.render(doc, style='dep', jupyter=True, options={'distance': 100})


In [20]:
doc=nlp(text1)

In [21]:
tokenized=[(token.text,token.dep_) for token in doc]


In [22]:
# print result.
tokenized

[('0', 'ROOT'),
 ('   ', ''),
 ('food', 'compound'),
 ('security', 'nsubj'),
 ('and', 'cc'),
 ('nutrition', 'conj'),
 ('in', 'prep'),
 ('the', 'det'),
 ('world', 'pobj'),
 ('the', 'det'),
 ('state', 'appos'),
 ('of', 'prep'),
 ('safeguarding', 'pcomp'),
 ('against', 'prep'),
 ('economic', 'amod'),
 ('slowdowns', 'pobj'),
 ('an', 'appos'),
 ('...', 'punct')]

In [23]:
# Rather than see in bare text visualize it as a graph with displacy.
from spacy import displacy

doc=nlp(text1)
# specifying style is dep will render dependency_parser.
displacy.render(doc,style='dep',manual=False)

In [24]:
# you can access head and multiple childrens.
family_tokens=[(token.text,token.head) for token in doc]

In [25]:
family_tokens


[('0', 0),
 ('   ', 0),
 ('food', security),
 ('security', 0),
 ('and', security),
 ('nutrition', security),
 ('in', security),
 ('the', world),
 ('world', in),
 ('the', state),
 ('state', security),
 ('of', state),
 ('safeguarding', of),
 ('against', safeguarding),
 ('economic', slowdowns),
 ('slowdowns', against),
 ('an', security),
 ('...', 0)]

## Part Of Speech Tagging

In [26]:
# loop through doc and return tokens with .text
tokenized=[(token.text,token.pos_,token.tag_) for token in doc]



In [27]:

# print result.
tokenized



[('0', 'PUNCT', 'NFP'),
 ('   ', 'SPACE', '_SP'),
 ('food', 'NOUN', 'NN'),
 ('security', 'NOUN', 'NN'),
 ('and', 'CCONJ', 'CC'),
 ('nutrition', 'NOUN', 'NN'),
 ('in', 'ADP', 'IN'),
 ('the', 'DET', 'DT'),
 ('world', 'NOUN', 'NN'),
 ('the', 'DET', 'DT'),
 ('state', 'NOUN', 'NN'),
 ('of', 'ADP', 'IN'),
 ('safeguarding', 'VERB', 'VBG'),
 ('against', 'ADP', 'IN'),
 ('economic', 'ADJ', 'JJ'),
 ('slowdowns', 'NOUN', 'NNS'),
 ('an', 'DET', 'DT'),
 ('...', 'PUNCT', '.')]

In [28]:
for word in doc:
    print(word.text, word.pos_)
    

0 PUNCT
    SPACE
food NOUN
security NOUN
and CCONJ
nutrition NOUN
in ADP
the DET
world NOUN
the DET
state NOUN
of ADP
safeguarding VERB
against ADP
economic ADJ
slowdowns NOUN
an DET
... PUNCT


In [29]:
for word in doc:
    print(word.text, word.pos_, word.tag_)

0 PUNCT NFP
    SPACE _SP
food NOUN NN
security NOUN NN
and CCONJ CC
nutrition NOUN NN
in ADP IN
the DET DT
world NOUN NN
the DET DT
state NOUN NN
of ADP IN
safeguarding VERB VBG
against ADP IN
economic ADJ JJ
slowdowns NOUN NNS
an DET DT
... PUNCT .


In [30]:
for word in doc:
    print(word.text, word.pos_, word.tag_, word.dep_)

0 PUNCT NFP ROOT
    SPACE _SP 
food NOUN NN compound
security NOUN NN nsubj
and CCONJ CC cc
nutrition NOUN NN conj
in ADP IN prep
the DET DT det
world NOUN NN pobj
the DET DT det
state NOUN NN appos
of ADP IN prep
safeguarding VERB VBG pcomp
against ADP IN prep
economic ADJ JJ amod
slowdowns NOUN NNS pobj
an DET DT appos
... PUNCT . punct


In [31]:
nouns = [token for token, pos in pos_tag(word_tokenize(text1)) if (pos.startswith('NN') or pos.startswith('NS') or pos.startswith('NNP')or pos.startswith('NPS'))]
print(nouns)



['food', 'security', 'nutrition', 'world', 'state', 'slowdowns']


In [32]:
for token in doc:
    print(token.text, token.pos_, token.tag_, token.shape_, token.is_alpha, token.is_stop)

0 PUNCT NFP d False False
    SPACE _SP     False False
food NOUN NN xxxx True False
security NOUN NN xxxx True False
and CCONJ CC xxx True True
nutrition NOUN NN xxxx True False
in ADP IN xx True True
the DET DT xxx True True
world NOUN NN xxxx True False
the DET DT xxx True True
state NOUN NN xxxx True False
of ADP IN xx True True
safeguarding VERB VBG xxxx True False
against ADP IN xxxx True True
economic ADJ JJ xxxx True False
slowdowns NOUN NNS xxxx True False
an DET DT xx True True
... PUNCT . ... False False


https://towardsdatascience.com/a-practitioners-guide-to-natural-language-processing-part-i-processing-understanding-text-9f4abfd13e72

In [33]:
# POS tagging with Spacy 
spacy_pos_tagged = [(word, word.tag_, word.pos_) for word in doc]
pd.DataFrame(spacy_pos_tagged, columns=['Word', 'POS tag', 'Tag type'])

Unnamed: 0,Word,POS tag,Tag type
0,0,NFP,PUNCT
1,,_SP,SPACE
2,food,NN,NOUN
3,security,NN,NOUN
4,and,CC,CCONJ
5,nutrition,NN,NOUN
6,in,IN,ADP
7,the,DT,DET
8,world,NN,NOUN
9,the,DT,DET


In [34]:
# POS tagging with nltk

nltk_pos_tagged = nltk.pos_tag(sentence.split())
pd.DataFrame(nltk_pos_tagged, columns=['Word', 'POS tag'])

NameError: name 'sentence' is not defined

## Part of speech tagging
In your childhood, you may have heard the term Part of Speech (POS). It can really take good amount of time to get the hang of what adjectives and adverbs actually are. What exactly is the difference? Think about building a system where we can encode all this knowledge. It may look very easy, but for many decades, coding this knowledge into a machine learning model was a very hard NLP problem. POS tagging algorithms can predict the POS of the given word with a higher degree of precision. You can get the POS of individual words as a tuple



https://datascience.foundation/sciencewhitepaper/natural-language-processing-nlp-simplified-a-step-by-step-guide

In [35]:
from nltk import word_tokenize, pos_tag
print(pos_tag(word_tokenize(text1)))

[('0', 'CD'), ('food', 'NN'), ('security', 'NN'), ('and', 'CC'), ('nutrition', 'NN'), ('in', 'IN'), ('the', 'DT'), ('world', 'NN'), ('the', 'DT'), ('state', 'NN'), ('of', 'IN'), ('safeguarding', 'VBG'), ('against', 'IN'), ('economic', 'JJ'), ('slowdowns', 'NNS'), ('an', 'DT'), ('...', ':')]


In [36]:
tokens = nltk.word_tokenize(text1)
tags = nltk.pos_tag(tokens)
nouns = [word for word,pos in tags if (pos == 'NNP' or pos == 'NN' or pos == 'NS' or pos == 'NPS')]
nouns

['food', 'security', 'nutrition', 'world', 'state']

In [37]:
type(nouns)

list

In [38]:
feedback = pd.Series(nouns) 

feedback

0         food
1     security
2    nutrition
3        world
4        state
dtype: object

If you want to know the details of the POS, here is the way. Note we might need to download the ‘tagset’. Below example shows NN is noun.

In [39]:
nltk.download('tagsets')

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\mwamb\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [40]:
nltk.help.upenn_tagset('NN')

NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...


In [41]:
nltk.help.upenn_tagset('DT')
nltk.help.upenn_tagset('IN')
nltk.help.upenn_tagset('VBZ')
nltk.help.upenn_tagset('NNP')
nltk.help.upenn_tagset('PRP$')

DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
IN: preposition or conjunction, subordinating
    astride among uppon whether out inside pro despite on by throughout
    below within for towards near behind atop around if like until below
    next into if beside ...
VBZ: verb, present tense, 3rd person singular
    bases reconstructs marks mixes displeases seals carps weaves snatches
    slumps stretches authorizes smolders pictures emerges stockpiles
    seduces fizzes uses bolsters slaps speaks pleads ...
NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...
PRP$: pronoun, possessive
    her his mine my our ours their thy your


In [42]:
text1

'0    food security and nutrition in the world the state of safeguarding against economic slowdowns an...'

In [43]:
text2= nltk.sent_tokenize(text1)
data=[]
for sent in text2:
    data = data + nltk.pos_tag(nltk.word_tokenize(sent))
    
    
for word in data:
    if 'DT' in word[1]:
        print(word)
    


('the', 'DT')
('the', 'DT')
('an', 'DT')


## Named Entity Recognition

In [44]:
for ent in doc.ents:
    print(ent.text, ent.label_, ent.start_char, ent.end_char)

## Chunking


Chunking is the process of extracting noun phrases from the text.  
spaCy can identify noun phrases (or noun chunks), as well. You can think of noun chunks as a noun plus the words describing the    noun. It’s also possible to identify and extract the base-noun of a given chunk.   


https://heartbeat.fritz.ai/nlp-chronicles-intro-to-spacy-34949f1bc118#:~:text=Chunking%20is%20the%20process%20of,noun%20of%20a%20given%20chunk.

In [45]:
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)

food security security nsubj 0
nutrition nutrition conj security
the world world pobj in
the state state appos security
economic slowdowns slowdowns pobj against


https://stackoverflow.com/questions/40167612/how-to-keep-only-the-noun-words-in-a-wordlist-python-nltk

## Visualizing Named Entities

In [46]:
from spacy import displacy
import en_core_web_sm
sp = en_core_web_sm.load()
    
#sp = spacy.load('en_core_web_sm')
sen = sp(text1)
displacy.render(sen, style='ent', jupyter=True)



In [47]:
text1

'0    food security and nutrition in the world the state of safeguarding against economic slowdowns an...'

In [48]:
filter = {'ents': ['ORG']}
displacy.render(sen, style='ent', jupyter=True, options=filter)

In [49]:
#displacy.serve(sen, style='ent')

## Stemming vs Lemmatization

In [50]:
df

Unnamed: 0,cluster,words,no_contract,words_str,tokenized,lower,no_punc,stopwords_removed
0,WFP,FOOD SECURITY \n AND NUTRITION \nIN THE WORLD\nTHE STATE OF \nSAFEGUARDING AGAINST \nECONOMIC SL...,"[FOOD, SECURITY, AND, NUTRITION, IN, THE, WORLD, THE, STATE, OF, SAFEGUARDING, AGAINST, ECONOMIC...",FOOD SECURITY AND NUTRITION IN THE WORLD THE STATE OF SAFEGUARDING AGAINST ECONOMIC SLOWDOWNS AN...,"[FOOD, SECURITY, AND, NUTRITION, IN, THE, WORLD, THE, STATE, OF, SAFEGUARDING, AGAINST, ECONOMIC...","[food, security, and, nutrition, in, the, world, the, state, of, safeguarding, against, economic...","[food, security, and, nutrition, in, the, world, the, state, of, safeguarding, against, economic...","[food, security, nutrition, world, state, safeguarding, economic, slowdowns, downturnsdemocratic..."


In [51]:
df['pos_tags'] = df['stopwords_removed'].apply(nltk.tag.pos_tag)
df

Unnamed: 0,cluster,words,no_contract,words_str,tokenized,lower,no_punc,stopwords_removed,pos_tags
0,WFP,FOOD SECURITY \n AND NUTRITION \nIN THE WORLD\nTHE STATE OF \nSAFEGUARDING AGAINST \nECONOMIC SL...,"[FOOD, SECURITY, AND, NUTRITION, IN, THE, WORLD, THE, STATE, OF, SAFEGUARDING, AGAINST, ECONOMIC...",FOOD SECURITY AND NUTRITION IN THE WORLD THE STATE OF SAFEGUARDING AGAINST ECONOMIC SLOWDOWNS AN...,"[FOOD, SECURITY, AND, NUTRITION, IN, THE, WORLD, THE, STATE, OF, SAFEGUARDING, AGAINST, ECONOMIC...","[food, security, and, nutrition, in, the, world, the, state, of, safeguarding, against, economic...","[food, security, and, nutrition, in, the, world, the, state, of, safeguarding, against, economic...","[food, security, nutrition, world, state, safeguarding, economic, slowdowns, downturnsdemocratic...","[(food, NN), (security, NN), (nutrition, NN), (world, NN), (state, NN), (safeguarding, VBG), (ec..."


In [52]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
df['wordnet_pos'] = df['pos_tags'].apply(lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])
df

Unnamed: 0,cluster,words,no_contract,words_str,tokenized,lower,no_punc,stopwords_removed,pos_tags,wordnet_pos
0,WFP,FOOD SECURITY \n AND NUTRITION \nIN THE WORLD\nTHE STATE OF \nSAFEGUARDING AGAINST \nECONOMIC SL...,"[FOOD, SECURITY, AND, NUTRITION, IN, THE, WORLD, THE, STATE, OF, SAFEGUARDING, AGAINST, ECONOMIC...",FOOD SECURITY AND NUTRITION IN THE WORLD THE STATE OF SAFEGUARDING AGAINST ECONOMIC SLOWDOWNS AN...,"[FOOD, SECURITY, AND, NUTRITION, IN, THE, WORLD, THE, STATE, OF, SAFEGUARDING, AGAINST, ECONOMIC...","[food, security, and, nutrition, in, the, world, the, state, of, safeguarding, against, economic...","[food, security, and, nutrition, in, the, world, the, state, of, safeguarding, against, economic...","[food, security, nutrition, world, state, safeguarding, economic, slowdowns, downturnsdemocratic...","[(food, NN), (security, NN), (nutrition, NN), (world, NN), (state, NN), (safeguarding, VBG), (ec...","[(food, n), (security, n), (nutrition, n), (world, n), (state, n), (safeguarding, v), (economic,..."


In [53]:
wnl = WordNetLemmatizer()
df['lemmatized'] = df['wordnet_pos'].apply(lambda x: [wnl.lemmatize(word, tag) for word, tag in x])
df

Unnamed: 0,cluster,words,no_contract,words_str,tokenized,lower,no_punc,stopwords_removed,pos_tags,wordnet_pos,lemmatized
0,WFP,FOOD SECURITY \n AND NUTRITION \nIN THE WORLD\nTHE STATE OF \nSAFEGUARDING AGAINST \nECONOMIC SL...,"[FOOD, SECURITY, AND, NUTRITION, IN, THE, WORLD, THE, STATE, OF, SAFEGUARDING, AGAINST, ECONOMIC...",FOOD SECURITY AND NUTRITION IN THE WORLD THE STATE OF SAFEGUARDING AGAINST ECONOMIC SLOWDOWNS AN...,"[FOOD, SECURITY, AND, NUTRITION, IN, THE, WORLD, THE, STATE, OF, SAFEGUARDING, AGAINST, ECONOMIC...","[food, security, and, nutrition, in, the, world, the, state, of, safeguarding, against, economic...","[food, security, and, nutrition, in, the, world, the, state, of, safeguarding, against, economic...","[food, security, nutrition, world, state, safeguarding, economic, slowdowns, downturnsdemocratic...","[(food, NN), (security, NN), (nutrition, NN), (world, NN), (state, NN), (safeguarding, VBG), (ec...","[(food, n), (security, n), (nutrition, n), (world, n), (state, n), (safeguarding, v), (economic,...","[food, security, nutrition, world, state, safeguard, economic, slowdown, downturnsdemocratic, re..."


https://towardsdatascience.com/summarization-of-covid-research-papers-using-bart-model-5b109a6669a6

In [56]:
from transformers import pipeline
summarizer = pipeline("summarization")

Neither PyTorch nor TensorFlow >= 2.0 have been found.Models won't be available and only tokenizers, configurationand file/data utilities can be used.


RuntimeError: At least one of TensorFlow 2.0 or PyTorch should be installed. To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ To install PyTorch, read the instructions at https://pytorch.org/.

In [64]:

df.to_pickle("C:/Users/mwamb/OneDrive/Desktop/tenthdimensionanalytics/WFP/processed_data/wfpdata_clean")
