In [1]:
with open('TRAIN_FILE.TXT') as f:
    train_file = f.readlines()

In [2]:
train_file

['1\t"The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>."\n',
 'Component-Whole(e2,e1)\n',
 'Comment: Not a collection: there is structure here, organisation.\n',
 '\n',
 '2\t"The <e1>child</e1> was carefully wrapped and bound into the <e2>cradle</e2> by means of a cord."\n',
 'Other\n',
 'Comment:\n',
 '\n',
 '3\t"The <e1>author</e1> of a keygen uses a <e2>disassembler</e2> to look at the raw assembly code."\n',
 'Instrument-Agency(e2,e1)\n',
 'Comment:\n',
 '\n',
 '4\t"A misty <e1>ridge</e1> uprises from the <e2>surge</e2>."\n',
 'Other\n',
 'Comment:\n',
 '\n',
 '5\t"The <e1>student</e1> <e2>association</e2> is the voice of the undergraduate student population of the State University of New York at Buffalo."\n',
 'Member-Collection(e1,e2)\n',
 'Comment:\n',
 '\n',
 '6\t"This is the sprawling <e1>complex</e1> that is Peru\'s largest <e2>producer</e2> of silver."\n',
 'Other\n',
 'Comment:\n',
 '\n',
 '7\t"The 

In [3]:
def prepare_dataset(raw):
    sentences, relations = [], []
    to_replace = [("\"", ""), ("\n", ""), ("<", " <"), (">", "> ")]
    last_was_sentence = False
    for line in raw:
        sl = line.split("\t")
        if last_was_sentence:
            relations.append(sl[0].split("(")[0].replace("\n", ""))
            last_was_sentence = False
        if sl[0].isdigit():
            sent = sl[1]
            for rp in to_replace:
                sent = sent.replace(rp[0], rp[1])
            sentences.append(sent)
            last_was_sentence = True
    print("Found {} sentences".format(len(sentences)))
    return sentences, relations

In [4]:
sentences, relations = prepare_dataset(train_file)

Found 8000 sentences


In [5]:
import pandas as pd
final = pd.DataFrame({'SENTENCES':sentences, 'RELATIONS': relations})

causeeffect = final[final['RELATIONS']=='Cause-Effect']

In [6]:
len(causeeffect)

1003

In [7]:
def preprocess(raw_text):
    # keep only words
    #letters_only_text = re.sub("[^a-zA-Z]", " ", raw_text)

    # convert to lower case and split 
    #words = letters_only_text.lower()
    words = raw_text.lower()
    
    # transform cause-effects keywords
    text_1 = words.replace("<e1> ", "<e1>")
    text_2 = text_1.replace("<e2> ", "<e2>")
    text_3 = text_2.replace(" </e1> ", "</e1>")
    text_4 = text_3.replace(" </e2> ", "</e2>")
    
    # convert string into list of words
    list_of_words = text_4.split()
    
    return list_of_words

def identify_nature_of_words_for_a_sentence(raw_text):
    # clean my sentences
    list_of_words = preprocess(raw_text)
    
    # labelized words of my sentences
    nature_of_sentences = []
    for word in list_of_words:
        if '<e1>' in word:
            result_nature_of_word = 'cause'
        elif '<e2>' in word:
            result_nature_of_word = 'consequence'
        else:
            result_nature_of_word = 'neutral'
        nature_of_sentences.append(result_nature_of_word)
    
    return nature_of_sentences

def give_me_my_sentences_into_graph(raw_text):
    import re
    # keep only words
    letters_only_text = re.sub("[^a-zA-Z]", " ", raw_text)
    
    # clean my sentences
    list_of_words = preprocess(letters_only_text)
    list_of_words_cleaned = [word for word in list_of_words if word != 'e']
    
    # transform into dataframes of pair tokens
    word_1 = []; word_2 = []
    for num in range(len(list_of_words_cleaned)-1):
        word_1.append(list_of_words_cleaned[num]);  word_2.append(list_of_words_cleaned[num+1])
    
    try:
        graph_of_pair_of_words_dataframe = pd.DataFrame({'from':word_1, 'to': word_2})
    except:
        print('Sorry, there are more <from> than <to>')
    
    return graph_of_pair_of_words_dataframe

def give_me_nature_of_my_sentences_for_graph(raw_text):
    import re
    # keep only words
    letters_only_text = re.sub("[^a-zA-Z]", " ", raw_text)
    
    # clean my sentences
    list_of_words = preprocess(letters_only_text)
    list_of_words_cleaned = [word for word in list_of_words if word != 'e']
    
    # retrieve nature of my words
    nature_of_words = identify_nature_of_words_for_a_sentence(raw_text)
    
    try:
        labelisation_of_words = pd.DataFrame({'id':list_of_words_cleaned, 'group': nature_of_words})
        labelisation_of_words = labelisation_of_words.drop_duplicates()
        print('Done because : NATURE -> ', len(nature_of_words), '<- and WORDS ->', len(list_of_words_cleaned))
    except:
        print('!! FAILURE !! because : NATURE -> ', len(nature_of_words), '<- and WORDS ->', len(list_of_words_cleaned))
        labelisation_of_words = None
    
    return labelisation_of_words

In [8]:
text = causeeffect['SENTENCES'].iloc[10]

In [9]:
text

'A neoplastic  <e1> recurrence </e1>  arose from an extensive  <e2> radiation </e2>  induced ulceration.'

In [10]:
nature = give_me_nature_of_my_sentences_for_graph(text)
words = give_me_my_sentences_into_graph(text)

Done because : NATURE ->  10 <- and WORDS -> 10


In [11]:
words

Unnamed: 0,from,to
0,a,neoplastic
1,neoplastic,recurrence
2,recurrence,arose
3,arose,from
4,from,an
5,an,extensive
6,extensive,radiation
7,radiation,induced
8,induced,ulceration


In [12]:
id_word_total = []
group_total = []
id_sentence = []
index_nodes = []

for item in range(len(causeeffect)):
    try:
        id_word_total.append(give_me_nature_of_my_sentences_for_graph(causeeffect['SENTENCES'].iloc[item]).id.tolist())
        group_total.append(give_me_nature_of_my_sentences_for_graph(causeeffect['SENTENCES'].iloc[item]).group.tolist())
        index_nodes.append(item)
    except:
        print('No id')
        
id_sentence = []
for i in range(len(id_word_total)):
    id_sentence.append(['sentence_id_'+str(i)]*(len(id_word_total[i])))

!! FAILURE !! because : NATURE ->  35 <- and WORDS -> 36
No id
Done because : NATURE ->  9 <- and WORDS -> 9
Done because : NATURE ->  9 <- and WORDS -> 9
Done because : NATURE ->  17 <- and WORDS -> 17
Done because : NATURE ->  17 <- and WORDS -> 17
!! FAILURE !! because : NATURE ->  25 <- and WORDS -> 26
No id
Done because : NATURE ->  11 <- and WORDS -> 11
Done because : NATURE ->  11 <- and WORDS -> 11
Done because : NATURE ->  10 <- and WORDS -> 10
Done because : NATURE ->  10 <- and WORDS -> 10
Done because : NATURE ->  21 <- and WORDS -> 21
Done because : NATURE ->  21 <- and WORDS -> 21
Done because : NATURE ->  12 <- and WORDS -> 12
Done because : NATURE ->  12 <- and WORDS -> 12
!! FAILURE !! because : NATURE ->  25 <- and WORDS -> 26
No id
Done because : NATURE ->  12 <- and WORDS -> 12
Done because : NATURE ->  12 <- and WORDS -> 12
Done because : NATURE ->  10 <- and WORDS -> 10
Done because : NATURE ->  10 <- and WORDS -> 10
Done because : NATURE ->  16 <- and WORDS -> 16

Done because : NATURE ->  15 <- and WORDS -> 15
!! FAILURE !! because : NATURE ->  20 <- and WORDS -> 21
No id
Done because : NATURE ->  20 <- and WORDS -> 20
Done because : NATURE ->  20 <- and WORDS -> 20
Done because : NATURE ->  19 <- and WORDS -> 19
Done because : NATURE ->  19 <- and WORDS -> 19
!! FAILURE !! because : NATURE ->  24 <- and WORDS -> 26
No id
Done because : NATURE ->  14 <- and WORDS -> 14
Done because : NATURE ->  14 <- and WORDS -> 14
!! FAILURE !! because : NATURE ->  24 <- and WORDS -> 23
No id
Done because : NATURE ->  21 <- and WORDS -> 21
Done because : NATURE ->  21 <- and WORDS -> 21
!! FAILURE !! because : NATURE ->  24 <- and WORDS -> 25
No id
Done because : NATURE ->  14 <- and WORDS -> 14
Done because : NATURE ->  14 <- and WORDS -> 14
Done because : NATURE ->  16 <- and WORDS -> 16
Done because : NATURE ->  16 <- and WORDS -> 16
!! FAILURE !! because : NATURE ->  21 <- and WORDS -> 17
No id
Done because : NATURE ->  29 <- and WORDS -> 29
Done because 

Done because : NATURE ->  10 <- and WORDS -> 10
!! FAILURE !! because : NATURE ->  16 <- and WORDS -> 17
No id
Done because : NATURE ->  20 <- and WORDS -> 20
Done because : NATURE ->  20 <- and WORDS -> 20
!! FAILURE !! because : NATURE ->  22 <- and WORDS -> 21
No id
!! FAILURE !! because : NATURE ->  57 <- and WORDS -> 56
No id
Done because : NATURE ->  15 <- and WORDS -> 15
Done because : NATURE ->  15 <- and WORDS -> 15
!! FAILURE !! because : NATURE ->  14 <- and WORDS -> 15
No id
Done because : NATURE ->  16 <- and WORDS -> 16
Done because : NATURE ->  16 <- and WORDS -> 16
Done because : NATURE ->  17 <- and WORDS -> 17
Done because : NATURE ->  17 <- and WORDS -> 17
Done because : NATURE ->  16 <- and WORDS -> 16
Done because : NATURE ->  16 <- and WORDS -> 16
Done because : NATURE ->  38 <- and WORDS -> 38
Done because : NATURE ->  38 <- and WORDS -> 38
Done because : NATURE ->  15 <- and WORDS -> 15
Done because : NATURE ->  15 <- and WORDS -> 15
Done because : NATURE ->  16

Done because : NATURE ->  15 <- and WORDS -> 15
Done because : NATURE ->  15 <- and WORDS -> 15
Done because : NATURE ->  14 <- and WORDS -> 14
Done because : NATURE ->  14 <- and WORDS -> 14
!! FAILURE !! because : NATURE ->  17 <- and WORDS -> 18
No id
Done because : NATURE ->  17 <- and WORDS -> 17
Done because : NATURE ->  17 <- and WORDS -> 17
!! FAILURE !! because : NATURE ->  19 <- and WORDS -> 18
No id
Done because : NATURE ->  27 <- and WORDS -> 27
Done because : NATURE ->  27 <- and WORDS -> 27
Done because : NATURE ->  19 <- and WORDS -> 19
Done because : NATURE ->  19 <- and WORDS -> 19
Done because : NATURE ->  15 <- and WORDS -> 15
Done because : NATURE ->  15 <- and WORDS -> 15
Done because : NATURE ->  22 <- and WORDS -> 22
Done because : NATURE ->  22 <- and WORDS -> 22
Done because : NATURE ->  16 <- and WORDS -> 16
Done because : NATURE ->  16 <- and WORDS -> 16
Done because : NATURE ->  25 <- and WORDS -> 25
Done because : NATURE ->  25 <- and WORDS -> 25
Done becau

Done because : NATURE ->  13 <- and WORDS -> 13
Done because : NATURE ->  15 <- and WORDS -> 15
Done because : NATURE ->  15 <- and WORDS -> 15
Done because : NATURE ->  23 <- and WORDS -> 23
Done because : NATURE ->  23 <- and WORDS -> 23
Done because : NATURE ->  12 <- and WORDS -> 12
Done because : NATURE ->  12 <- and WORDS -> 12
Done because : NATURE ->  11 <- and WORDS -> 11
Done because : NATURE ->  11 <- and WORDS -> 11
Done because : NATURE ->  16 <- and WORDS -> 16
Done because : NATURE ->  16 <- and WORDS -> 16
Done because : NATURE ->  15 <- and WORDS -> 15
Done because : NATURE ->  15 <- and WORDS -> 15
Done because : NATURE ->  15 <- and WORDS -> 15
Done because : NATURE ->  15 <- and WORDS -> 15
Done because : NATURE ->  13 <- and WORDS -> 13
Done because : NATURE ->  13 <- and WORDS -> 13
Done because : NATURE ->  21 <- and WORDS -> 21
Done because : NATURE ->  21 <- and WORDS -> 21
!! FAILURE !! because : NATURE ->  22 <- and WORDS -> 23
No id
!! FAILURE !! because : N

Done because : NATURE ->  18 <- and WORDS -> 18
!! FAILURE !! because : NATURE ->  13 <- and WORDS -> 14
No id
Done because : NATURE ->  8 <- and WORDS -> 8
Done because : NATURE ->  8 <- and WORDS -> 8
Done because : NATURE ->  10 <- and WORDS -> 10
Done because : NATURE ->  10 <- and WORDS -> 10
Done because : NATURE ->  9 <- and WORDS -> 9
Done because : NATURE ->  9 <- and WORDS -> 9
Done because : NATURE ->  9 <- and WORDS -> 9
Done because : NATURE ->  9 <- and WORDS -> 9
Done because : NATURE ->  11 <- and WORDS -> 11
Done because : NATURE ->  11 <- and WORDS -> 11
Done because : NATURE ->  13 <- and WORDS -> 13
Done because : NATURE ->  13 <- and WORDS -> 13
Done because : NATURE ->  20 <- and WORDS -> 20
Done because : NATURE ->  20 <- and WORDS -> 20
Done because : NATURE ->  17 <- and WORDS -> 17
Done because : NATURE ->  17 <- and WORDS -> 17
Done because : NATURE ->  17 <- and WORDS -> 17
Done because : NATURE ->  17 <- and WORDS -> 17
Done because : NATURE ->  17 <- and W

Done because : NATURE ->  26 <- and WORDS -> 26
Done because : NATURE ->  22 <- and WORDS -> 22
Done because : NATURE ->  22 <- and WORDS -> 22
Done because : NATURE ->  8 <- and WORDS -> 8
Done because : NATURE ->  8 <- and WORDS -> 8
Done because : NATURE ->  25 <- and WORDS -> 25
Done because : NATURE ->  25 <- and WORDS -> 25
!! FAILURE !! because : NATURE ->  14 <- and WORDS -> 12
No id
!! FAILURE !! because : NATURE ->  19 <- and WORDS -> 18
No id
Done because : NATURE ->  45 <- and WORDS -> 45
Done because : NATURE ->  45 <- and WORDS -> 45
Done because : NATURE ->  20 <- and WORDS -> 20
Done because : NATURE ->  20 <- and WORDS -> 20
Done because : NATURE ->  15 <- and WORDS -> 15
Done because : NATURE ->  15 <- and WORDS -> 15
Done because : NATURE ->  17 <- and WORDS -> 17
Done because : NATURE ->  17 <- and WORDS -> 17
Done because : NATURE ->  17 <- and WORDS -> 17
Done because : NATURE ->  17 <- and WORDS -> 17
Done because : NATURE ->  22 <- and WORDS -> 22
Done because :

Done because : NATURE ->  4 <- and WORDS -> 4
Done because : NATURE ->  4 <- and WORDS -> 4
Done because : NATURE ->  4 <- and WORDS -> 4
Done because : NATURE ->  4 <- and WORDS -> 4
Done because : NATURE ->  8 <- and WORDS -> 8
Done because : NATURE ->  8 <- and WORDS -> 8
!! FAILURE !! because : NATURE ->  17 <- and WORDS -> 19
No id
!! FAILURE !! because : NATURE ->  27 <- and WORDS -> 26
No id
Done because : NATURE ->  26 <- and WORDS -> 26
Done because : NATURE ->  26 <- and WORDS -> 26
Done because : NATURE ->  17 <- and WORDS -> 17
Done because : NATURE ->  17 <- and WORDS -> 17
!! FAILURE !! because : NATURE ->  18 <- and WORDS -> 19
No id
!! FAILURE !! because : NATURE ->  11 <- and WORDS -> 9
No id
Done because : NATURE ->  16 <- and WORDS -> 16
Done because : NATURE ->  16 <- and WORDS -> 16
!! FAILURE !! because : NATURE ->  19 <- and WORDS -> 18
No id
Done because : NATURE ->  19 <- and WORDS -> 19
Done because : NATURE ->  19 <- and WORDS -> 19
Done because : NATURE ->  

In [13]:
def flatten_list(nested_list):
    from functools import reduce
    result = reduce(lambda x,y: x+y,nested_list)
    return result

In [14]:
id_word_total = flatten_list(id_word_total)
group_total = flatten_list(group_total)
id_sentence = flatten_list(id_sentence)

#nature = pd.DataFrame({'id':id_word_total, 'group': group_total, 'which_sentence':id_sentence})

In [15]:
print("id_word_total:", len(id_word_total), "||","group_total:", len(group_total), "||","id_sentence:", len(id_sentence))

id_word_total: 11602 || group_total: 11602 || id_sentence: 11602


In [16]:
nature = pd.DataFrame({'id':id_word_total, 'group': group_total, 'which_sentence':id_sentence})
nature.to_excel('nature.xlsx')

### Nodes

In [17]:
causeeffect = causeeffect.reset_index()
causeeffect['index'] = causeeffect.index
causeeffect = causeeffect[causeeffect['index'].isin(index_nodes)]

In [19]:
from_word_total = []
to_word_total = []

for item in range(len(causeeffect)):
    try:
        from_word_total.append(give_me_my_sentences_into_graph(causeeffect['SENTENCES'].iloc[item])['from'].tolist())
        print(from_word_total)
        to_word_total.append(give_me_my_sentences_into_graph(causeeffect['SENTENCES'].iloc[item])['to'].tolist())
    except:
        print('No id')
        
id_sentence_v2 = []
for i in range(len(from_word_total)):
    id_sentence_v2.append(['sentence_id_'+str(i)]*(len(from_word_total[i])))

[['the', 'burst', 'has', 'been', 'caused', 'by', 'water', 'hammer']]
[['the', 'burst', 'has', 'been', 'caused', 'by', 'water', 'hammer'], ['the', 'singer', 'who', 'performed', 'three', 'of', 'the', 'nominated', 'songs', 'also', 'caused', 'a', 'commotion', 'on', 'the', 'red']]
[['the', 'burst', 'has', 'been', 'caused', 'by', 'water', 'hammer'], ['the', 'singer', 'who', 'performed', 'three', 'of', 'the', 'nominated', 'songs', 'also', 'caused', 'a', 'commotion', 'on', 'the', 'red'], ['he', 'had', 'chest', 'pains', 'and', 'headaches', 'from', 'mold', 'in', 'the']]
[['the', 'burst', 'has', 'been', 'caused', 'by', 'water', 'hammer'], ['the', 'singer', 'who', 'performed', 'three', 'of', 'the', 'nominated', 'songs', 'also', 'caused', 'a', 'commotion', 'on', 'the', 'red'], ['he', 'had', 'chest', 'pains', 'and', 'headaches', 'from', 'mold', 'in', 'the'], ['financial', 'stress', 'is', 'one', 'of', 'the', 'main', 'causes', 'of']]
[['the', 'burst', 'has', 'been', 'caused', 'by', 'water', 'hammer'],

[['the', 'burst', 'has', 'been', 'caused', 'by', 'water', 'hammer'], ['the', 'singer', 'who', 'performed', 'three', 'of', 'the', 'nominated', 'songs', 'also', 'caused', 'a', 'commotion', 'on', 'the', 'red'], ['he', 'had', 'chest', 'pains', 'and', 'headaches', 'from', 'mold', 'in', 'the'], ['financial', 'stress', 'is', 'one', 'of', 'the', 'main', 'causes', 'of'], ['the', 'women', 'that', 'caused', 'the', 'accident', 'was', 'on', 'the', 'cell', 'phone', 'and', 'ran', 'thru', 'the', 'intersection', 'without', 'pausing', 'on', 'the'], ['calluses', 'are', 'caused', 'by', 'improperly', 'fitting', 'shoes', 'or', 'by', 'a', 'skin'], ['the', 'radiation', 'from', 'the', 'atomic', 'bomb', 'explosion', 'is', 'a', 'typical', 'acute'], ['a', 'neoplastic', 'recurrence', 'arose', 'from', 'an', 'extensive', 'radiation', 'induced'], ['he', 'has', 'a', 'tattoo', 'on', 'his', 'right', 'arm', 'and', 'scars', 'from', 'stitches', 'on', 'his', 'right'], ['the', 'continuing', 'nigerian', 'outbreak', 'is', 'the

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [20]:
from_word_total = flatten_list(from_word_total)
to_word_total = flatten_list(to_word_total)
id_sentence_v2 = flatten_list(id_sentence_v2)

In [21]:
print("from_word_total:", len(from_word_total), "||","to_word_total:", len(to_word_total), "||","id_sentence_v2:", len(id_sentence_v2))

from_word_total: 12296 || to_word_total: 12296 || id_sentence_v2: 12296


In [22]:
words = pd.DataFrame({'from':from_word_total, 'to': to_word_total, 'which_sentence':id_sentence_v2})
words.to_excel('word.xlsx')

In [None]:
words

# New Approach

In [132]:
from nltk import sent_tokenize, word_tokenize, pos_tag
import nltk

In [133]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\adsieg\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [134]:
def get_postag(string):
    string = nltk.word_tokenize(string)
    pos_string = nltk.pos_tag(string)
    return pos_string

def transform_get_postag_into_dataframe(string):
    import re
    # keep only words
    letters_only_text = re.sub("[^a-zA-Z]", " ", string)
    
    # clean my sentences
    list_of_words = preprocess(letters_only_text)
    list_of_words_cleaned = [word for word in list_of_words if word != 'e']
    list_of_words_cleaned = " ".join(list_of_words_cleaned)
    
    # new algo
    pos_string = get_postag(list_of_words_cleaned)
    word = [word[0] for word in pos_string]
    nature = [word[1] for word in pos_string]
    result_item_sentence = []
    for item in nature:
        if item in ['NN', 'JJ', 'NNS']:
            result_item = 'cause_consequence'
            result_item_sentence.append(result_item)
        else:
            result_item = 'neutral'
            result_item_sentence.append(result_item)
    result = pd.DataFrame({'id':word, 'group': nature, 'qualif':result_item_sentence})
    
    return result

def sentences_into_graph(raw_text):
    import re
    # keep only words
    letters_only_text = re.sub("[^a-zA-Z]", " ", raw_text)
    
    # clean my sentences
    list_of_words = preprocess(letters_only_text)
    list_of_words_cleaned = [word for word in list_of_words if word != 'e']
    
    # transform into dataframes of pair tokens
    word_1 = []; word_2 = []
    for num in range(len(list_of_words_cleaned)-1):
        word_1.append(list_of_words_cleaned[num]);  word_2.append(list_of_words_cleaned[num+1])
    
    try:
        graph_of_pair_of_words_dataframe = pd.DataFrame({'from':word_1, 'to': word_2})
    except:
        print('Sorry, there are more <from> than <to>')
    
    return graph_of_pair_of_words_dataframe

### Nodes

In [135]:
data_df = transform_get_postag_into_dataframe('A neoplastic recurrence arose from an extensive radiation induced ulceration')

In [136]:
text = causeeffect['SENTENCES'].iloc[10]

In [137]:
text

'The  <e1> fire </e1>  inside WTC was caused by exploding  <e2> fuel </e2> .'

In [138]:
transform_get_postag_into_dataframe(text)

Unnamed: 0,id,group,qualif
0,the,DT,neutral
1,fire,NN,cause_consequence
2,inside,IN,neutral
3,wtc,NN,cause_consequence
4,was,VBD,neutral
5,caused,VBN,neutral
6,by,IN,neutral
7,exploding,VBG,neutral
8,fuel,NN,cause_consequence


In [139]:
id_word_total = []
group_total = []
qualif_total = []

for item in range(len(causeeffect)):
    try:
        # computation
        buffer_df = transform_get_postag_into_dataframe(causeeffect['SENTENCES'].iloc[item])
        # stockage
        id_word_total.append(buffer_df['id'].tolist())
        group_total.append(buffer_df['group'].tolist())
        qualif_total.append(buffer_df['qualif'].tolist())
    except:
        print('No id')
        
id_sentence = []
for i in range(len(id_word_total)):
    id_sentence.append(['sentence_id_'+str(i)]*(len(id_word_total[i])))

In [140]:
def flatten_list(nested_list):
    from functools import reduce
    result = reduce(lambda x,y: x+y,nested_list)
    return result

id_word_total = flatten_list(id_word_total)
group_total = flatten_list(group_total)
qualif_total = flatten_list(qualif_total)
id_sentence = flatten_list(id_sentence)

In [141]:
print("id_word_total:", len(id_word_total), "||","group_total:", len(group_total), "||","qualif_total:", len(qualif_total), "||","id_sentence:", len(id_sentence))

id_word_total: 13034 || group_total: 13034 || qualif_total: 13034 || id_sentence: 13034


In [142]:
nodes = pd.DataFrame({'id':id_word_total, 'group': group_total, 'qualif':qualif_total,'which_sentence':id_sentence})

In [143]:
nodes

Unnamed: 0,id,group,qualif,which_sentence
0,the,DT,neutral,sentence_id_0
1,burst,NN,cause_consequence,sentence_id_0
2,has,VBZ,neutral,sentence_id_0
3,been,VBN,neutral,sentence_id_0
4,caused,VBN,neutral,sentence_id_0
5,by,IN,neutral,sentence_id_0
6,water,NN,cause_consequence,sentence_id_0
7,hammer,NN,cause_consequence,sentence_id_0
8,pressure,NN,cause_consequence,sentence_id_0
9,the,DT,neutral,sentence_id_1


In [144]:
nodes.to_excel('nodes.xlsx')

### Edges

In [117]:
sentences_into_graph(text)

Unnamed: 0,from,to
0,the,fire
1,fire,inside
2,inside,wtc
3,wtc,was
4,was,caused
5,caused,by
6,by,exploding
7,exploding,fuel


In [146]:
from_word_total = []
to_word_total = []

for item in range(len(causeeffect)):
    try:
        from_word_total.append(sentences_into_graph(causeeffect['SENTENCES'].iloc[item])['from'].tolist())
        to_word_total.append(sentences_into_graph(causeeffect['SENTENCES'].iloc[item])['to'].tolist())
    except:
        print('No id')
        
id_sentence_v2 = []
for i in range(len(from_word_total)):
    id_sentence_v2.append(['sentence_id_'+str(i)]*(len(from_word_total[i])))

In [149]:
def flatten_list(nested_list):
    from functools import reduce
    result = reduce(lambda x,y: x+y,nested_list)
    return result

from_word_total = flatten_list(from_word_total)
to_word_total = flatten_list(to_word_total)
id_sentence_v2 = flatten_list(id_sentence_v2)

In [150]:
print("from_word_total:", len(from_word_total), "||","to_word_total:", len(to_word_total), "||","id_sentence_v2:", len(id_sentence_v2))

from_word_total: 12296 || to_word_total: 12296 || id_sentence_v2: 12296


In [151]:
edges = pd.DataFrame({'from':from_word_total, 'to': to_word_total, 'which_sentence':id_sentence_v2})

In [152]:
edges

Unnamed: 0,from,to,which_sentence
0,the,burst,sentence_id_0
1,burst,has,sentence_id_0
2,has,been,sentence_id_0
3,been,caused,sentence_id_0
4,caused,by,sentence_id_0
5,by,water,sentence_id_0
6,water,hammer,sentence_id_0
7,hammer,pressure,sentence_id_0
8,the,singer,sentence_id_1
9,singer,who,sentence_id_1


In [154]:
edges.to_excel('edges.xlsx')