# Topic Modelling

In [2]:
import pandas as pd

In [174]:
# Read CSVs, only columns of interest
## Cordis
cordish2020 = pd.read_csv("data/cordis/cordis-h2020projects.csv", sep=";", usecols=['id', 'objective'])
cordisfp6 = pd.read_csv("data/cordis/cordis-fp6projects.csv", sep=";", usecols=['id', 'objective'])
cordisfp7 = pd.read_csv("data/cordis/cordis-fp7projects.csv", sep=";", usecols=['id', 'objective'])
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# The frame order (H2020, FP7, FP6) is the same as in the former chained appends.
cordis = pd.concat([cordish2020, cordisfp7, cordisfp6])
## NSF
nsf = pd.read_csv('data/nsf/nsf.csv', usecols=['AwardID', 'AbstractNarration'])

# Rename columns to match headers later
cordis = cordis.rename(columns={'objective': 'abstract'})
nsf = nsf.rename(columns={'AwardID': 'id', 'AbstractNarration': 'abstract'})

# Add source of documents to identifier
# NOTE(review): the saved output shows ids such as "NSF_1614484.0", so AwardID is
# presumably parsed as float; consider reading it with dtype=str if the ".0" is unwanted.
cordis['id'] = 'EU_' + cordis['id'].astype(str)
nsf['id'] = 'NSF_' + nsf['id'].astype(str)

# Join dataframes into 1
# Each source keeps its own row index, so index labels repeat across sources;
# address rows by position (.iloc) rather than by label.
corpusdata = pd.concat([cordis, nsf])
corpusdata

Unnamed: 0,id,abstract
0,EU_672890,'The project targets all luxury fashion firms ...
1,EU_633814,EU's agricultural and forestry land provides a...
2,EU_785419,"According to the Global Market Forecast, there..."
3,EU_721362,In the aerospace industry very high quality st...
4,EU_861924,Large areas of agricultural land in W. and N. ...
...,...,...
688501,NSF_1614484.0,A few micro-seconds after the Big Bang the uni...
688502,NSF_1624547.0,Calcium sulfate in crystalline form (anhydrite...
688503,NSF_1604697.0,The Rocky Mountain-Great Plains Graduate Resea...
688504,NSF_1653917.0,Investments made across national boundaries cr...


In [212]:
# Column dtypes of the combined corpus.
# NOTE(review): the saved output lists a `lemmas` column, which is only created in the
# lemmatization cell near the end of the notebook, so this cell was run out of order.
corpusdata.dtypes

id          object
abstract    object
lemmas      object
dtype: object

## spaCy

In [69]:
import spacy
from termcolor import colored

### 1. Introduction & basic usage

In [None]:
# Load the medium English pipeline, then parse the first abstract of the
# corpus; the resulting `doc` is the worked example for the sections below.
nlp = spacy.load('en_core_web_md')
first_abstract = corpusdata['abstract'].iloc[0]
doc = nlp(first_abstract)

In [173]:
# Show the parsed abstract three ways: the text itself, the lemma of every
# token joined by spaces, and the text of each detected entity on its own line.
lemma_line = ' '.join(tk.lemma_ for tk in doc)
entity_lines = '\n'.join(ent.text for ent in doc.ents)

print(colored('============= Original Text =============', 'blue'))
print(doc)
print(colored('\n============= Lemmatized Text =============', 'red'))
print(lemma_line)
print(colored('\n============= Entities Found =============', 'green'))
print(entity_lines)

'The project targets all luxury fashion firms that specifically manufacture classic menswear clothing (i.e. shirts, jackets, pants, coats...) and propose its personalisation. On one hand, personalisation is a great opportunity to provide the final clients with the cloths they desire, but on the other luxury fashion houses are facing the increasing need of having some automated solutions that will help them in creating 'customized product' in a faster lead time. In fact all fashion houses manufacturing menswear are focused on providing Made to Measure products to their customers, because it increases their revenues, but at the same time this also increases their costs.  An integrated and automated management of the whole value chain will decrease the lead time, increase customization application and decrease costs.
Crea Solution Srl proposes the TailorFit solution that will dramatically accelerate the timing of the whole process, by managing every step of the cloths manufacturing: 1) Ac

### 2. Architecture (vocabulary)

In [175]:
# Tabulate every lexeme in the pipeline's vocabulary: its text, its `orth`
# value (shown as "Hash") and whether it is flagged as a stop word.
vocab_rows = [[lexeme.text, lexeme.orth, lexeme.is_stop] for lexeme in nlp.vocab]
vocab = pd.DataFrame(vocab_rows, columns=['Term', 'Hash', 'Stopword'])
vocab

Unnamed: 0,Term,Hash,Stopword
0,nuthin,17780520906925867008,False
1,ü.,9616619598791593984,False
2,p.m,6364458155313776643,False
3,Kan,8969436956900823045,False
4,Mar,12595687976425261068,False
...,...,...,...
915,She's,7021531544473137138,False
916,ve,10105644630884274164,False
917,E.g.,4115108073383360500,False
918,:-|,280013313535684598,False


In [176]:
# Keep only the vocabulary rows whose "Stopword" flag is True
is_stopword = vocab['Stopword'].eq(True)
stopw = vocab.loc[is_stopword]
stopw

Unnamed: 0,Term,Hash,Stopword
19,it,10239237003504588839,True
32,is,3411606890003347522,True
35,Might,9747367433533540424,True
42,where,16318918034475841628,True
50,some,7000492816108906599,True
...,...,...,...
865,Had,6865576549405853585,True
886,should,10292920167869855674,True
894,are,5012629990875267006,True
898,anywhere,5899329028063008718,True


### 3. Word tokenization

In [177]:
# Printing the Doc shows the original text; iterating it yields the tokens.
print(colored('============= The original text information is still kept in the Doc object =============', 'blue'))
print(doc)

print(colored('\n============= Identified Tokens =============', 'red'))
# Emit all tokens in one write, each followed by a double tab and no final newline
print(''.join(f'{token.text}\t\t' for token in doc), end='')

'The project targets all luxury fashion firms that specifically manufacture classic menswear clothing (i.e. shirts, jackets, pants, coats...) and propose its personalisation. On one hand, personalisation is a great opportunity to provide the final clients with the cloths they desire, but on the other luxury fashion houses are facing the increasing need of having some automated solutions that will help them in creating 'customized product' in a faster lead time. In fact all fashion houses manufacturing menswear are focused on providing Made to Measure products to their customers, because it increases their revenues, but at the same time this also increases their costs.  An integrated and automated management of the whole value chain will decrease the lead time, increase customization application and decrease costs.
Crea Solution Srl proposes the TailorFit solution that will dramatically accelerate the timing of the whole process, by managing every step of the cloths manufacturing: 1) Ac

### 4. POS tagging

In [181]:
# One row per token: surface form, coarse POS (`pos_`) and fine-grained tag (`tag_`)
pos_rows = [(tok.text, tok.pos_, tok.tag_) for tok in doc]
df = pd.DataFrame(pos_rows, columns=['Token', 'POS', 'TAG'])
df

Unnamed: 0,Token,POS,TAG
0,',PUNCT,``
1,The,DET,DT
2,project,NOUN,NN
3,targets,VERB,VBZ
4,all,DET,DT
...,...,...,...
327,and,CCONJ,CC
328,paintings,NOUN,NNS
329,.,PUNCT,.
330,\n,SPACE,_SP


### 5. Named entity recognition

In [182]:
# One row per detected entity: its text, its label and spaCy's explanation of that label
entity_rows = [(ent.text, ent.label_, spacy.explain(ent.label_)) for ent in doc.ents]
df_ents = pd.DataFrame(entity_rows, columns=['Entity', 'Type', 'Description'])
df_ents

Unnamed: 0,Entity,Type,Description
0,one,CARDINAL,Numerals that do not fall under another type
1,Crea,PERSON,"People, including fictional"
2,Srl,GPE,"Countries, cities, states"
3,TailorFit,ORG,"Companies, agencies, institutions, etc."
4,1,CARDINAL,Numerals that do not fall under another type
5,2,CARDINAL,Numerals that do not fall under another type
6,CAD,ORG,"Companies, agencies, institutions, etc."
7,3,CARDINAL,Numerals that do not fall under another type
8,TailorFit,ORG,"Companies, agencies, institutions, etc."


In [183]:
from spacy import displacy

# Render the entity spans inline in the notebook.
# NOTE(review): 'distance' looks like a dependency-visualizer setting; confirm
# whether it has any effect with style="ent".
displacy.render(doc, style="ent", jupyter=True, options={'distance': 90})

# One row per distinct entity label found in the document, with spaCy's explanation
entypes = {ent.label_ for ent in doc.ents}
type_rows = [(label, spacy.explain(label)) for label in entypes]
df_ent = pd.DataFrame(type_rows, columns=['Entity type', 'Description'])
df_ent

Unnamed: 0,Entity type,Description
0,CARDINAL,Numerals that do not fall under another type
1,ORG,"Companies, agencies, institutions, etc."
2,PERSON,"People, including fictional"
3,GPE,"Countries, cities, states"


### 6. Lemmatization

In [184]:
# Put the untouched text next to the space-joined lemma of every token
all_lemmas = ' '.join(token.lemma_ for token in doc)

print(colored('============= Original text =============', 'blue'))
print(doc.text)
print(colored('\n============= Lemmas =============', 'red'))
print(all_lemmas)

'The project targets all luxury fashion firms that specifically manufacture classic menswear clothing (i.e. shirts, jackets, pants, coats...) and propose its personalisation. On one hand, personalisation is a great opportunity to provide the final clients with the cloths they desire, but on the other luxury fashion houses are facing the increasing need of having some automated solutions that will help them in creating 'customized product' in a faster lead time. In fact all fashion houses manufacturing menswear are focused on providing Made to Measure products to their customers, because it increases their revenues, but at the same time this also increases their costs.  An integrated and automated management of the whole value chain will decrease the lead time, increase customization application and decrease costs.
Crea Solution Srl proposes the TailorFit solution that will dramatically accelerate the timing of the whole process, by managing every step of the cloths manufacturing: 1) Ac

### Pipeline

In [188]:
# Switch off the parser and NER components; the preprocessing below only reads
# each token's lemma, POS, is_alpha and is_stop attributes.
for component in ('parser', 'ner'):
    nlp.disable_pipe(component)

# Parts of speech to keep, and extra lemmas to drop on top of the stop-word flag
valid_POS = {'VERB', 'NOUN', 'ADJ', 'PROPN'}
specific_stw = {'relevant', 'simple', 'base'}

def text_preprocessing(rawtext):
    """Return the space-joined lemmas of the content words in ``rawtext``.

    A token is kept only if it is purely alphabetic, its coarse POS is in
    ``valid_POS``, it is not flagged as a stop word, and its lemma is not in
    ``specific_stw``.

    Parameters
    ----------
    rawtext : spaCy Doc (or any iterable of tokens) or str
        The document to clean. A plain string is first parsed with the
        module-level ``nlp`` pipeline.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces ('' if none survive).
    """
    # Accept unparsed text as well: iterating a str would yield characters,
    # which have none of the token attributes used below.
    if isinstance(rawtext, str):
        rawtext = nlp(rawtext)

    # Iterate the argument itself. The previous version looped over the global
    # `doc`, so the result silently depended on whatever `doc` happened to hold.
    lemmatized = ' '.join(token.lemma_ for token in rawtext
                          if token.is_alpha
                          and token.pos_ in valid_POS
                          and not token.is_stop
                          and token.lemma_ not in specific_stw)
    return lemmatized

# Show the sample abstract before and after filtering and lemmatization
cleaned_sample = text_preprocessing(doc)

print(colored('============= Original text =============', 'blue'))
print(doc)
print(colored('\n============= Lemmatized text =============', 'red'))
print(cleaned_sample)

'The project targets all luxury fashion firms that specifically manufacture classic menswear clothing (i.e. shirts, jackets, pants, coats...) and propose its personalisation. On one hand, personalisation is a great opportunity to provide the final clients with the cloths they desire, but on the other luxury fashion houses are facing the increasing need of having some automated solutions that will help them in creating 'customized product' in a faster lead time. In fact all fashion houses manufacturing menswear are focused on providing Made to Measure products to their customers, because it increases their revenues, but at the same time this also increases their costs.  An integrated and automated management of the whole value chain will decrease the lead time, increase customization application and decrease costs.
Crea Solution Srl proposes the TailorFit solution that will dramatically accelerate the timing of the whole process, by managing every step of the cloths manufacturing: 1) Ac

In [242]:
# Keep only rows that have an abstract. The filter is applied to `corpusdata`
# itself (the `textdata` name used before is not defined in the visible
# notebook) and copied, so the new column is written to an independent frame
# rather than to a slice.
corpusdata = corpusdata[corpusdata['abstract'].notnull()].copy()

# Lemmatize every abstract from the first row on. The earlier hard-coded start
# at row 628061 left all preceding rows empty on a fresh run.
# nlp.pipe streams the texts through the pipeline in batches, in input order.
# The loop variable is named `doc` so that the module-level `doc` always
# refers to the document currently being processed, as in the original loop.
lemmas = []
for doc in nlp.pipe(corpusdata['abstract']):
    lemmas.append(text_preprocessing(doc))

# Assign the whole column at once instead of chained `['lemmas'].iloc[i] = ...`,
# which may write to a temporary copy and leave the frame unchanged.
corpusdata['lemmas'] = lemmas

corpusdata.head()

Unnamed: 0,id,abstract,lemmas
0,EU_672890,'The project targets all luxury fashion firms ...,project target luxury fashion firm manufacture...
1,EU_633814,EU's agricultural and forestry land provides a...,EU agricultural forestry land provide wide ran...
2,EU_785419,"According to the Global Market Forecast, there...",accord Global Market Forecast strong need ramp...
3,EU_721362,In the aerospace industry very high quality st...,aerospace industry high quality standard meet ...
4,EU_861924,Large areas of agricultural land in W. and N. ...,large area agricultural land Africa degrade wa...


In [243]:
# Persist the lemmatized corpus. `index` is left at its default, so the row
# index is written as the first column of the CSV.
corpusdata.to_csv('data/corpus_lemmatized.csv')