In [1]:
import pandas as pd
import numpy as np

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load Dataset

In [3]:
# load SDGs data
sdg = pd.read_csv('/content/drive/MyDrive/Portfolio/Neo4j/sdg_data.csv')
sdg.dropna( how='any', inplace=True)
sdg.drop_duplicates()

Unnamed: 0,subject,predicated,object
0,Nd5f024d488114f5f9492e0469195152f,type,Restriction
1,Manufacturing_employment,subClassOf,Green_Indicators
2,Frameworks_for_gender,subClassOf,Green_Indicators
3,Zero_Hunger,subClassOf,Nf63622207036429dbf160093085f6979
4,N6ab7d2e40a0e4607b8966acc4669489a,onProperty,has_Child_wasting_or_obesity
...,...,...,...
2973,Degradad_land,subClassOf,Red_Indicators
2974,N0ead90e990404480b2af87413e4b619e,onProperty,has_Material_footprint
2975,Zero_Hunger,subClassOf,Nade67e07d1f447f289a87a25e9269517
2976,has_Journalist_media_killings,domain,Peace_Justice_and_Strong_Institutions


In [4]:
sdg['predicated'].unique()

array(['type', 'subClassOf', 'onProperty', 'cardinality', 'domain',
       'label', 'range', 'onClass', 'first', 'member', 'Concept', 'rest',
       'unionOf', 'broaderTransitive'], dtype=object)

In [5]:
#load ConceptNet data
conceptnet = pd.read_csv('/content/drive/MyDrive/Portfolio/Neo4j/conceptnet_data.csv')
conceptnet = conceptnet.drop(columns=['Unnamed: 0'])
conceptnet.dropna( how='any', inplace=True)
conceptnet.drop_duplicates()

new_cols = ["start_node","relation","end_node"]
conceptnet=conceptnet[new_cols]

conceptnet

Unnamed: 0,start_node,relation,end_node
0,augmented_assignment,RelatedTo,variable
1,augmented_fifth,RelatedTo,diatonic_scale
2,augmented_fifth,RelatedTo,enharmonically
3,augmented_fifth,RelatedTo,minor_sixth
4,augmented_fifth,RelatedTo,musical_interval
...,...,...,...
38406,zero,DefinedAs,empty_set
38407,zero_degrees_celcius,DefinedAs,freezing_point_of_water
38408,zero_degrees_centigrade,DefinedAs,freezing_point_for_water_temperature
38409,zombie,DefinedAs,animated_flesh_of_dead_person


In [6]:
conceptnet.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38411 entries, 0 to 38410
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   start_node  38411 non-null  object
 1   relation    38411 non-null  object
 2   end_node    38411 non-null  object
dtypes: object(3)
memory usage: 1.2+ MB


In [7]:
conceptnet['relation'].unique()

array(['RelatedTo', 'HasA', 'HasProperty', 'IsA', 'PartOf', 'SimilarTo',
       'UsedFor', 'InstanceOf', 'DefinedAs'], dtype=object)

## NLP

In [8]:
import nltk 
import string
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

import itertools
from tqdm import tqdm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [9]:
def space(text):
    return re.sub(r"_", " ", text)

###SDG data

In [10]:
sdg['subject'] = sdg['subject'].apply(space)
sdg['object'] = sdg['object'].apply(space)
sdg

Unnamed: 0,subject,predicated,object
0,Nd5f024d488114f5f9492e0469195152f,type,Restriction
1,Manufacturing employment,subClassOf,Green Indicators
2,Frameworks for gender,subClassOf,Green Indicators
3,Zero Hunger,subClassOf,Nf63622207036429dbf160093085f6979
4,N6ab7d2e40a0e4607b8966acc4669489a,onProperty,has Child wasting or obesity
...,...,...,...
2973,Degradad land,subClassOf,Red Indicators
2974,N0ead90e990404480b2af87413e4b619e,onProperty,has Material footprint
2975,Zero Hunger,subClassOf,Nade67e07d1f447f289a87a25e9269517
2976,has Journalist media killings,domain,Peace Justice and Strong Institutions


In [11]:
def process(dfx,x,x2):
  df = dfx
  s_clean=[]
  o_clean=[]

  for i in x:
      if i.startswith('N'):
        if len(i)==33:
          s_clean.append(i)

  for i in x2:
      if i.startswith('N'):
        if len(i)==33:
          o_clean.append(i)

  df = df[~df['subject'].isin(s_clean)]
  df = df[~df['object'].isin(o_clean)]

  return df

In [12]:
df_sdg = process(sdg,sdg.subject,sdg.object)

In [13]:
df_sdg

Unnamed: 0,subject,predicated,object
1,Manufacturing employment,subClassOf,Green Indicators
2,Frameworks for gender,subClassOf,Green Indicators
6,has Inclusive decision making,domain,Peace Justice and Strong Institutions
8,Financial services access,subClassOf,Green Indicators
9,has Science tech cooperation,domain,Partnerships for The Goals
...,...,...,...
2967,has Genetic resource sharing,range,Genetic resource sharing
2971,has Investment for LDCs,domain,Partnerships for The Goals
2972,has Safe drinking water,domain,Clean Water and Sanitation
2973,Degradad land,subClassOf,Red Indicators


In [14]:
ps=PorterStemmer()
def process(dfx):
  df = dfx

  s = df['subject'].unique()
  o = df['object'].unique()
  sentence = np.concatenate((s, o))  
  
  #dataframe
  df_sen = pd.DataFrame(data=list(sentence),columns=['word'])
  
  # remove stop word
  df_sen['word'] = df_sen['word'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords.words('english')]))
  
  #steam
  df_sen['word'] = df_sen['word'].apply(lambda x: ' '.join([ps.stem(y) for y in x.split()]))

  # #token
  # df_sen['word'] = df_sen['word'].apply(word_tokenize)

  # unique value 
  df_sen = df_sen.drop_duplicates()

  return df_sen  

In [15]:
df_sdg_nlp = process(df_sdg)

In [16]:
df_sdg_nlp

Unnamed: 0,word
0,manufactur employ
1,framework gender
2,inclus decis make
3,financi servic access
4,scienc tech cooper
...,...
770,good health well-b
776,non-communic (ncd)
777,children on-track
778,violenc non-partn


In [17]:
def token(df):
  token = []
  for i in df['word']:
    text_tokens = word_tokenize(i)
    token.append(text_tokens)
  
  all_word = list(itertools.chain.from_iterable(token))
  
  keyword = pd.DataFrame(data=list(all_word),columns=['keyword'])
  keyword = keyword.drop_duplicates()
  return keyword

In [18]:
sdg_keyword = token(df_sdg_nlp)
sdg_keyword

Unnamed: 0,keyword
0,manufactur
1,employ
2,framework
3,gender
4,inclus
...,...
719,(
720,ncd
721,)
723,on-track


In [19]:
drop = ['(', ')','b','r','&','d','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17'] #class
def replace(df):
    df = df.replace(drop, 0.5)
    df = df[(df.keyword != 0.5)]
    return df

In [20]:
sdg_keyword_r = replace(sdg_keyword)
sdg_keyword_r

Unnamed: 0,keyword
0,manufactur
1,employ
2,framework
3,gender
4,inclus
...,...
717,well-b
718,non-communic
720,ncd
723,on-track


In [21]:
list_sdg = []
for i in sdg_keyword_r.keyword:
    list_sdg.append(i)

In [22]:
len(list_sdg)

362

In [23]:
list_sdg

['manufactur',
 'employ',
 'framework',
 'gender',
 'inclus',
 'decis',
 'make',
 'financi',
 'servic',
 'access',
 'scienc',
 'tech',
 'cooper',
 'fossil',
 'fuel',
 'subsidi',
 'develop',
 'nation',
 'export',
 'debt',
 'suicid',
 'mortal',
 'poverti',
 'reduct',
 'prog',
 'safe',
 'citi',
 'space',
 'statist',
 'legisl',
 'child',
 'labour',
 'fish',
 'stock',
 'level',
 'unemploy',
 'rate',
 'result',
 'cultur',
 'heritag',
 'integr',
 'climat',
 'polici',
 'tuberculosi',
 'incid',
 'corpor',
 'sust',
 'report',
 'marin',
 'protect',
 'area',
 'youth',
 'educ',
 'train',
 'risk',
 'mgmt',
 'healthcar',
 'coverag',
 'urban',
 'slum',
 'popul',
 'domest',
 'tax',
 'mountain',
 'biodivers',
 'public',
 'satisfact',
 'seiz',
 'surrend',
 'arm',
 'conflict',
 'relat',
 'death',
 'equal',
 'aid',
 'trade',
 'forest',
 'manag',
 'road',
 'traffic',
 'injuri',
 'extrem',
 'plan',
 'institut',
 'represent',
 'research',
 'matern',
 'person',
 'remitt',
 'differenti',
 'tariff',
 'sdg',
 'su

### Conceptnet data 

In [24]:
#remove _
conceptnet['start_node'] = conceptnet['start_node'].apply(space)
conceptnet['end_node'] = conceptnet['end_node'].apply(space)

In [25]:
#stopwords
conceptnet['start_node_nlp'] = conceptnet['start_node'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords.words('english')]))
conceptnet['end_node_nlp'] = conceptnet['end_node'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords.words('english')]))

In [26]:
#stem
ps=PorterStemmer()
conceptnet['start_node_nlp'] = conceptnet['start_node_nlp'].apply(lambda x: ' '.join([ps.stem(y) for y in x.split()]))
conceptnet['end_node_nlp'] = conceptnet['end_node_nlp'].apply(lambda x: ' '.join([ps.stem(y) for y in x.split()]))

In [27]:
#word_tokenize
conceptnet['start_node_nlp'] = conceptnet['start_node_nlp'].apply(word_tokenize)
conceptnet['end_node_nlp'] = conceptnet['end_node_nlp'].apply(word_tokenize) 

In [28]:
conceptnet

Unnamed: 0,start_node,relation,end_node,start_node_nlp,end_node_nlp
0,augmented assignment,RelatedTo,variable,"[augment, assign]",[variabl]
1,augmented fifth,RelatedTo,diatonic scale,"[augment, fifth]","[diaton, scale]"
2,augmented fifth,RelatedTo,enharmonically,"[augment, fifth]",[enharmon]
3,augmented fifth,RelatedTo,minor sixth,"[augment, fifth]","[minor, sixth]"
4,augmented fifth,RelatedTo,musical interval,"[augment, fifth]","[music, interv]"
...,...,...,...,...,...
38406,zero,DefinedAs,empty set,[zero],"[empti, set]"
38407,zero degrees celcius,DefinedAs,freezing point of water,"[zero, degre, celciu]","[freez, point, water]"
38408,zero degrees centigrade,DefinedAs,freezing point for water temperature,"[zero, degre, centigrad]","[freez, point, water, temperatur]"
38409,zombie,DefinedAs,animated flesh of dead person,[zombi],"[anim, flesh, dead, person]"


In [29]:
import re

def replace_like_list(text_list):
  text_list = re.sub(r"\'", "", text_list)
  text_list = re.sub(r"\[", "", text_list)
  text_list = re.sub(r"\]", "", text_list)

  return text_list

In [30]:
conceptnet["start_node_nlp_txt"] = conceptnet["start_node_nlp"].apply(lambda x: str(x))
conceptnet["start_node_nlp_txt"] = conceptnet["start_node_nlp_txt"].apply(lambda x: replace_like_list(x))

In [31]:
conceptnet["end_node_nlp_txt"] = conceptnet["end_node_nlp"].apply(lambda x: str(x))
conceptnet["end_node_nlp_txt"] = conceptnet["end_node_nlp_txt"].apply(lambda x: replace_like_list(x))

In [32]:
conceptnet

Unnamed: 0,start_node,relation,end_node,start_node_nlp,end_node_nlp,start_node_nlp_txt,end_node_nlp_txt
0,augmented assignment,RelatedTo,variable,"[augment, assign]",[variabl],"augment, assign",variabl
1,augmented fifth,RelatedTo,diatonic scale,"[augment, fifth]","[diaton, scale]","augment, fifth","diaton, scale"
2,augmented fifth,RelatedTo,enharmonically,"[augment, fifth]",[enharmon],"augment, fifth",enharmon
3,augmented fifth,RelatedTo,minor sixth,"[augment, fifth]","[minor, sixth]","augment, fifth","minor, sixth"
4,augmented fifth,RelatedTo,musical interval,"[augment, fifth]","[music, interv]","augment, fifth","music, interv"
...,...,...,...,...,...,...,...
38406,zero,DefinedAs,empty set,[zero],"[empti, set]",zero,"empti, set"
38407,zero degrees celcius,DefinedAs,freezing point of water,"[zero, degre, celciu]","[freez, point, water]","zero, degre, celciu","freez, point, water"
38408,zero degrees centigrade,DefinedAs,freezing point for water temperature,"[zero, degre, centigrad]","[freez, point, water, temperatur]","zero, degre, centigrad","freez, point, water, temperatur"
38409,zombie,DefinedAs,animated flesh of dead person,[zombi],"[anim, flesh, dead, person]",zombi,"anim, flesh, dead, person"


#### map with SDGs Keyword

In [33]:
idx_list = []

for keyword in tqdm(sdg_keyword_r["keyword"]):
  for i in range(conceptnet.shape[0]):
    if keyword in conceptnet["start_node_nlp_txt"][i] or keyword in conceptnet["end_node_nlp_txt"][i]:
      idx_list.append(i)

100%|██████████| 362/362 [02:35<00:00,  2.32it/s]


In [34]:
len(idx_list)

35282

In [35]:
conceptnet_df_keyw = conceptnet.iloc[idx_list].copy()
conceptnet_df_keyw

Unnamed: 0,start_node,relation,end_node,start_node_nlp,end_node_nlp,start_node_nlp_txt,end_node_nlp_txt
3547,automaker,RelatedTo,manufacture,[automak],[manufactur],automak,manufactur
3551,automaking,RelatedTo,manufacture,[automak],[manufactur],automak,manufactur
3553,automan,RelatedTo,manufacture,[automan],[manufactur],automan,manufactur
9024,people,HasA,manufactured,[peopl],[manufactur],peopl,manufactur
11070,cars,HasProperty,manufactured in factory,[car],"[manufactur, factori]",car,"manufactur, factori"
...,...,...,...,...,...,...,...
36971,genocide,DefinedAs,killing of specific group,[genocid],"[kill, specif, group]",genocid,"kill, specif, group"
37743,piano,DefinedAs,heart of modern jazz group,[piano],"[heart, modern, jazz, group]",piano,"heart, modern, jazz, group"
37999,slowest person in group,DefinedAs,caboose,"[slowest, person, group]",[caboos],"slowest, person, group",caboos
38079,stigma,DefinedAs,attempt to label particular group of people as...,[stigma],"[attempt, label, particular, group, peopl, les...",stigma,"attempt, label, particular, group, peopl, less..."


In [36]:
conceptnet_df_keyw = conceptnet_df_keyw.drop(columns=["start_node_nlp_txt", "end_node_nlp_txt","start_node_nlp","end_node_nlp"])
conceptnet_df_keyw = conceptnet_df_keyw[~conceptnet_df_keyw.index.duplicated(keep='first')]
conceptnet_df_keyw

Unnamed: 0,start_node,relation,end_node
3547,automaker,RelatedTo,manufacture
3551,automaking,RelatedTo,manufacture
3553,automan,RelatedTo,manufacture
9024,people,HasA,manufactured
11070,cars,HasProperty,manufactured in factory
...,...,...,...
25702,alkali metal,SimilarTo,group 1
25707,alkaline earth metal,SimilarTo,beryllium group
25708,alkaline earth metal,SimilarTo,group 2
27268,beryllium group,SimilarTo,alkaline earth metal


In [37]:
conceptnet_df_keyw.to_csv('conceptnet_sdgwordV2.csv')

In [38]:
#rootwork
conceptnet_df_keyw = conceptnet.iloc[idx_list].copy()
conceptnet_df_keyw_V2 = conceptnet_df_keyw[['start_node_nlp_txt','relation','end_node_nlp_txt']]
conceptnet_df_keyw_V2 = conceptnet_df_keyw_V2[~conceptnet_df_keyw_V2.index.duplicated(keep='first')]
conceptnet_df_keyw_V2

Unnamed: 0,start_node_nlp_txt,relation,end_node_nlp_txt
3547,automak,RelatedTo,manufactur
3551,automak,RelatedTo,manufactur
3553,automan,RelatedTo,manufactur
9024,peopl,HasA,manufactur
11070,car,HasProperty,"manufactur, factori"
...,...,...,...
25702,"alkali, metal",SimilarTo,"group, 1"
25707,"alkalin, earth, metal",SimilarTo,"beryllium, group"
25708,"alkalin, earth, metal",SimilarTo,"group, 2"
27268,"beryllium, group",SimilarTo,"alkalin, earth, metal"


### SDGs goals

In [39]:
# df_sdg[df_sdg.subject=='Sustainable Development Goals']

In [40]:
g3 = df_sdg[(df_sdg.object=='Good health and well being')|(df_sdg.subject=='Good health and well being')]
g6 = df_sdg[(df_sdg.object=='Clean Water and Sanitation')|(df_sdg.subject=='Clean Water and Sanitation')]
g11 = df_sdg[(df_sdg.object=='Sustainable Cities and Communities')|(df_sdg.subject=='Sustainable Cities and Communities')]
g12 = df_sdg[(df_sdg.object=='Responsible Consumption and Production')|(df_sdg.subject=='Responsible Consumption and Production')]
g13 = df_sdg[(df_sdg.object=='Climate Action')|(df_sdg.subject=='Climate Action')]
g15 = df_sdg[(df_sdg.object=='Life On Land')|(df_sdg.subject=='Life On Land')]

In [41]:
ps=PorterStemmer()
def process(dfx):
  df =dfx

  s = df['subject'].unique()
  o = df['object'].unique()
  sentence = np.concatenate((s, o))  
  
  #dataframe
  df_sen = pd.DataFrame(data=list(sentence),columns=['word'])
  
  # remove stop word
  df_sen['word'] = df_sen['word'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords.words('english')]))
  
  #steam
  df_sen['word'] = df_sen['word'].apply(lambda x: ' '.join([ps.stem(y) for y in x.split()]))

  # #token
  # df_sen['word'] = df_sen['word'].apply(word_tokenize)

  # unique value 
  df_sen = df_sen.drop_duplicates()

  return df_sen  

In [42]:
sen_g3 = process(g3) #32
sen_g6 = process(g6) #15
sen_g15 = process(g15) #16
sen_g11 = process(g11) #17
sen_g12 = process(g12) #14
sen_g13 = process(g13) #9

In [43]:
def token(df):
  token = []
  for i in df['word']:
    text_tokens = word_tokenize(i)
    token.append(text_tokens)
  all_word = list(itertools.chain.from_iterable(token))
  keyword = pd.DataFrame(data=list(all_word),columns=['keyword'])
  keyword = keyword.drop_duplicates()
  return keyword

In [44]:
kw_g3 = token(sen_g3) #58
kw_g6 = token(sen_g6) #26
kw_g11 = token(sen_g11) #31
kw_g12 = token(sen_g12) #27
kw_g13 = token(sen_g13) #18
kw_g15 = token(sen_g15)  #29          

In [45]:
drop = ['(', ')','b','r','&','d','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17'] #class
def replace(df):
    df = df.replace(drop, 0.5)
    df = df[(df.keyword != 0.5)]
    return df

In [46]:
kw_g3r = replace(kw_g3) 
kw_g6r = replace(kw_g6) 
kw_g11r = replace(kw_g11) 
kw_g12r = replace(kw_g12) 
kw_g13r = replace(kw_g13) 
kw_g15r = replace(kw_g15)      

In [47]:
kw_g6r

Unnamed: 0,keyword
0,ambient
1,water
2,qualiti
3,clean
5,sanit
6,freshwat
7,stress
9,use
10,effici
11,goal


####  map with conceptnet

In [48]:
# only  word == rootword 

In [49]:
def con(x):  
  idx_list = []

  for keyword in tqdm(x["keyword"]):
    for i in range(conceptnet.shape[0]):
      if keyword in conceptnet["start_node_nlp_txt"][i] or keyword in conceptnet["end_node_nlp_txt"][i]:
        idx_list.append(i)
  
  conceptnet_df_keyw = conceptnet.iloc[idx_list].copy()
  conceptnet_df_keyw = conceptnet_df_keyw.drop(columns=["start_node_nlp_txt", "end_node_nlp_txt","start_node_nlp","end_node_nlp"])
  conceptnet_df_keyw = conceptnet_df_keyw[~conceptnet_df_keyw.index.duplicated(keep='first')]
  return conceptnet_df_keyw

In [50]:
res_g3 = con(kw_g3r)
res_g6 = con(kw_g6r)
res_g11 = con(kw_g11r)
res_g12 = con(kw_g12r)
res_g13 = con(kw_g13r)
res_g15 = con(kw_g15r)

100%|██████████| 56/56 [00:21<00:00,  2.59it/s]
100%|██████████| 25/25 [00:09<00:00,  2.60it/s]
100%|██████████| 30/30 [00:11<00:00,  2.59it/s]
100%|██████████| 26/26 [00:09<00:00,  2.67it/s]
100%|██████████| 17/17 [00:06<00:00,  2.54it/s]
100%|██████████| 28/28 [00:10<00:00,  2.62it/s]


In [51]:
res_g13

Unnamed: 0,start_node,relation,end_node
6972,greek style buildings,HasA,beautiful support columns
9391,professional football teams must,HasA,supportive fans
9957,support,HasA,come from unions and newspaper columnists
12750,friends,HasProperty,very nice and very supportive
16098,abutment,IsA,support
...,...,...,...
35936,seawolf class submarine,InstanceOf,nuclear submarine
36561,champagne,DefinedAs,drink of classy
36591,class 5,DefinedAs,strongest tornado
36592,classrooms,DefinedAs,plural form of classroom


In [52]:
# res_g6.to_csv('res_g6.csv')

#### word freaquency V2

In [53]:
import pandas as pd 
import nltk 
from nltk import FreqDist

In [54]:
def freaquency(res_g):
  res_g['start_node'] = res_g['start_node'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords.words('english')]))
  res_g['end_node'] = res_g['end_node'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords.words('english')]))
  s = res_g['start_node']
  o = res_g['end_node']
  word_g = np.concatenate((s, o))  

  token_g = []
  for i in word_g:
    text_tokens = word_tokenize(i)
    token_g.append(text_tokens)

  all_word_g = list(itertools.chain.from_iterable(token_g))

  freq = FreqDist(all_word_g)  
  G = pd.DataFrame(list(freq.items()), columns = ["Word","Frequency"]) 
  return G.sort_values(by='Frequency')

In [55]:
freaquency(res_g3)

Unnamed: 0,Word,Frequency
0,auspicious,1
3226,152ppm,1
3227,deuterium,1
3228,waterfalls,1
3229,magnesium,1
...,...,...
487,impairment,372
488,might,376
36,person,396
311,one,411


In [56]:
freaquency(res_g6)

Unnamed: 0,Word,Frequency
899,examination,1
1193,proper,1
1192,ensuring,1
1190,aromatic,1
1189,sweetheart,1
...,...,...
943,attending,54
1005,useful,70
97,clean,95
222,class,105


In [57]:
freaquency(res_g11)

Unnamed: 0,Word,Frequency
0,autoresonance,1
2514,hottest,1
2518,knocked,1
2521,rates,1
2522,embraced,1
...,...,...
596,impairment,372
228,might,375
549,person,381
129,one,397


In [58]:
freaquency(res_g12)

Unnamed: 0,Word,Frequency
0,autoadjuvant,1
1408,happened,1
1407,tv,1
1406,producing,1
1404,experiences,1
...,...,...
705,national,71
301,airplane,81
275,class,105
829,plant,156


In [59]:
freaquency(res_g13)

Unnamed: 0,Word,Frequency
0,greek,1
1329,else,1
1328,businesswoman,1
1327,businessperson,1
1326,businessman,1
...,...,...
137,action,101
123,class,105
85,green,106
618,building,124


In [60]:
freaquency(res_g15)

Unnamed: 0,Word,Frequency
9105,turning,1
3836,barf,1
3835,bans,1
6733,repressive,1
6734,regimes,1
...,...,...
21,air,158
1249,impairment,372
1239,might,378
291,person,636


## Create Dataset for kGE model

In [72]:
# load SDGs data
sdg = pd.read_csv('/content/drive/MyDrive/Portfolio/Neo4j/sdg_data.csv')
sdg.dropna( how='any', inplace=True)
sdg.drop_duplicates()

Unnamed: 0,subject,predicated,object
0,Nd5f024d488114f5f9492e0469195152f,type,Restriction
1,Manufacturing_employment,subClassOf,Green_Indicators
2,Frameworks_for_gender,subClassOf,Green_Indicators
3,Zero_Hunger,subClassOf,Nf63622207036429dbf160093085f6979
4,N6ab7d2e40a0e4607b8966acc4669489a,onProperty,has_Child_wasting_or_obesity
...,...,...,...
2973,Degradad_land,subClassOf,Red_Indicators
2974,N0ead90e990404480b2af87413e4b619e,onProperty,has_Material_footprint
2975,Zero_Hunger,subClassOf,Nade67e07d1f447f289a87a25e9269517
2976,has_Journalist_media_killings,domain,Peace_Justice_and_Strong_Institutions


In [73]:
s_clean=[]
o_clean=[]
df = sdg

for i in df['subject']:
    if i.startswith('N'):
      if len(i)==33:
        s_clean.append(i)

for i in df['object']:
    if i.startswith('N'):
      if len(i)==33:
        o_clean.append(i)

df = df[~df['subject'].isin(s_clean)]
df = df[~df['object'].isin(o_clean)]
sdg_df=df

In [74]:
# clean data complete
sdg_df.rename(columns = {'subject':'start_node', 'predicated':'relation', 'object':'end_node'}, inplace = True)
sdg_df 

Unnamed: 0,start_node,relation,end_node
1,Manufacturing_employment,subClassOf,Green_Indicators
2,Frameworks_for_gender,subClassOf,Green_Indicators
6,has_Inclusive_decision_making,domain,Peace_Justice_and_Strong_Institutions
8,Financial_services_access,subClassOf,Green_Indicators
9,has_Science_tech_cooperation,domain,Partnerships_for_The_Goals
...,...,...,...
2967,has_Genetic_resource_sharing,range,Genetic_resource_sharing
2971,has_Investment_for_LDCs,domain,Partnerships_for_The_Goals
2972,has_Safe_drinking_water,domain,Clean_Water_and_Sanitation
2973,Degradad_land,subClassOf,Red_Indicators


In [75]:
#Load data SDG keyword mapping in concpetnet
con_sdg = pd.read_csv('/content/drive/MyDrive/Portfolio/Neo4j/conceptnet_sdgwordV2.csv')
con_sdg = con_sdg.drop(columns=['Unnamed: 0'])
con_sdg

Unnamed: 0,start_node,relation,end_node
0,automaker,RelatedTo,manufacture
1,automaking,RelatedTo,manufacture
2,automan,RelatedTo,manufacture
3,people,HasA,manufactured
4,cars,HasProperty,manufactured in factory
...,...,...,...
21244,alkali metal,SimilarTo,group 1
21245,alkaline earth metal,SimilarTo,beryllium group
21246,alkaline earth metal,SimilarTo,group 2
21247,beryllium group,SimilarTo,alkaline earth metal


In [76]:
dataset = pd.concat([sdg_df, con_sdg], ignore_index=True)
dataset

Unnamed: 0,start_node,relation,end_node
0,Manufacturing_employment,subClassOf,Green_Indicators
1,Frameworks_for_gender,subClassOf,Green_Indicators
2,has_Inclusive_decision_making,domain,Peace_Justice_and_Strong_Institutions
3,Financial_services_access,subClassOf,Green_Indicators
4,has_Science_tech_cooperation,domain,Partnerships_for_The_Goals
...,...,...,...
22988,alkali metal,SimilarTo,group 1
22989,alkaline earth metal,SimilarTo,beryllium group
22990,alkaline earth metal,SimilarTo,group 2
22991,beryllium group,SimilarTo,alkaline earth metal


In [77]:
def space(text):
    return re.sub(r"_", " ", text)

In [78]:
dataset['start_node'] = dataset['start_node'].apply(space)
dataset['end_node'] = dataset['end_node'].apply(space)
dataset

Unnamed: 0,start_node,relation,end_node
0,Manufacturing employment,subClassOf,Green Indicators
1,Frameworks for gender,subClassOf,Green Indicators
2,has Inclusive decision making,domain,Peace Justice and Strong Institutions
3,Financial services access,subClassOf,Green Indicators
4,has Science tech cooperation,domain,Partnerships for The Goals
...,...,...,...
22988,alkali metal,SimilarTo,group 1
22989,alkaline earth metal,SimilarTo,beryllium group
22990,alkaline earth metal,SimilarTo,group 2
22991,beryllium group,SimilarTo,alkaline earth metal


In [79]:
dataset['start_node'] = dataset['start_node'].str.lower()
dataset['end_node'] = dataset['end_node'].str.lower()
dataset

Unnamed: 0,start_node,relation,end_node
0,manufacturing employment,subClassOf,green indicators
1,frameworks for gender,subClassOf,green indicators
2,has inclusive decision making,domain,peace justice and strong institutions
3,financial services access,subClassOf,green indicators
4,has science tech cooperation,domain,partnerships for the goals
...,...,...,...
22988,alkali metal,SimilarTo,group 1
22989,alkaline earth metal,SimilarTo,beryllium group
22990,alkaline earth metal,SimilarTo,group 2
22991,beryllium group,SimilarTo,alkaline earth metal


In [80]:
dataset.to_csv('datasetV2.csv')

In [81]:
pip freeze > requirements.txt