In [1]:
import PyPDF2

In [2]:
def pdf_to_txt(pdf_file_path, txt_file_path):
    with open(pdf_file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        
        text = ""
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            text += page.extract_text().rstrip()

    with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)

In [3]:
pdf_to_txt('/Users/Desktop/ACM CONFERENCE WORK/FINAL_LIST_DOCS/Oil_Natural_Gas_Corporation_Ltd_vs_Saw_Pipes_Ltd_on_17_April_2003.PDF', 'oil.txt')


In [4]:
import pandas as pd
import nltk
nltk.download('punkt')  # Download the required NLTK data

[nltk_data] Downloading package punkt to /Users/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
def find_related_sentences(sentences, keyword):
    related_sentences = []

    for i, sentence in enumerate(sentences):
        if keyword.lower() in sentence.lower():
            prev_sentence = sentences[i - 1].strip() if i > 0 else None
            next_sentence = sentences[i + 1].strip() if i < len(sentences) - 1 else None
            related_sentences.append({
                "Sentence": sentence.strip(),
                "Previous": prev_sentence,
                "Next": next_sentence
            })

    return related_sentences

In [6]:
with open('/Users/Desktop/ACM CONFERENCE WORK/codes/Oil Natural Gas Case/oil.txt', "r") as file:
    text = file.read()

# Tokenize into sentences using nltk.sent_tokenize()
sentences = nltk.sent_tokenize(text)

keyword = "arbitral"
result = []

related_sentences = find_related_sentences(sentences, keyword)
result.extend(related_sentences)

# Create a DataFrame with columns "Sentence," "Previous," and "Next"
output_df = pd.DataFrame(result, columns=["Sentence", "Previous", "Next"])

# Save results to a new CSV
output_df.to_csv("output_sentences_oil.csv", index=False)



In [7]:
encodings_to_try = ['utf-8', 'latin1', 'utf-16', 'ISO-8859-1']

# Try reading the CSV file with different encodings
for encoding in encodings_to_try:
    try:
        output_df = pd.read_csv("output_sentences_oil.csv", encoding=encoding)
        print("CSV file read successfully with encoding:", encoding)
        break  # Exit loop if successful
    except UnicodeDecodeError:
        print("Failed to read CSV file with encoding:", encoding)
        continue  # Try next encoding

CSV file read successfully with encoding: utf-8


In [8]:
output_df

Unnamed: 0,Sentence,Previous,Next
0,COURT'S JURISDICTION UNDER SECTION 34 OF THE A...,"JUDGMENT:\nJ U D G M E N T Shah, J.",In other words - whether the Court would have ...
1,In other words - whether the Court would have ...,COURT'S JURISDICTION UNDER SECTION 34 OF THE A...,"Mr. Dushyant Dave, learned senior counsel appe..."
2,Application for setting aside arbitral award -...,"For deciding this controversy, we would refer ...",(2) An arbitral award may be set aside by the ...
3,(2) An arbitral award may be set aside by the ...,Application for setting aside arbitral award -...,Explanation-Without prejudice to the generalit...
4,For proper\nadjudication of the question of ju...,"However,\nclause\n(v) of sub-section 2(a) and ...",'ARBITRAL PROCEDURE' The ingredients of clause...
...,...,...,...
81,"Hence, this part of the award passed by the\na...",It is to be reiterated that it is the primary ...,"CONCLUSIONS:-\nIn the result, it is held that:..."
82,(1) The Court can set aside the arbitral award...,"CONCLUSIONS:-\nIn the result, it is held that:...",(ii) if the arbitral procedure was not in acco...
83,(ii) if the arbitral procedure was not in acco...,(1) The Court can set aside the arbitral award...,"However, exception for setting aside the award..."
84,"However, exception for setting aside the award...",(ii) if the arbitral procedure was not in acco...,(c) If the award passed by the arbitral tribun...


In [9]:
concatenated_sen = pd.DataFrame(output_df['Previous'].astype(str) + ' ' + output_df['Sentence'].astype(str) + ' ' + output_df['Next'].astype(str), columns=['concat'])


In [10]:
print(concatenated_sen['concat'].iloc[0])


JUDGMENT:
J U D G M E N T Shah, J. COURT'S JURISDICTION UNDER SECTION 34 OF THE ARBITRATION AND CONCILIATION
ACT, 1966 Before dealing with the issues involved in this appeal, we would first decide the main
point in controversy, namely - the ambit and scope of Court's jurisdiction in case where award
passed by the Arbitral Tribunal is challenged under Section 34 of the Arbitration and Conciliation
Act, 1996 (hereinafter referred to as "the Act") as the decision in this appeal would depend upon the
said finding. In other words - whether the Court would have jurisdiction under Section 34 of the Act
to set aside an award passed by the Arbitral Tribunal which is patently illegal or in contravention of
the provisions of the Act or any other substantive law governing the parties or is against the terms of
the contract?Oil & Natural Gas Corporation Ltd vs Saw Pipes Ltd on 17 April, 2003
Indian Kanoon - http://indiankanoon.org/doc/919241/ 1Learned senior counsel Mr. Ashok Desai appearing for th

In [11]:
concatenated_sen.to_csv('sentences_final_concatenated_oil.csv', index=False)

In [12]:
import csv

In [13]:
def create_person_id_map(file_path):
    person_id_map = {}

    with open(file_path, 'r') as csvfile:
        reader = csv.reader(csvfile)
        header = next(reader)
        
        for row in reader:
            person_id_list = row[0]
            person = person_id_list.strip()
            person_id = row[1].strip()
            person_id_map[person] = person_id

    return person_id_map

def create_location_id_map(file_path):
    location_id_map = {}

    with open(file_path, 'r') as csvfile:
        reader = csv.reader(csvfile)
        header = next(reader)
        
        for row in reader:
            location_id_list = row[0]
            location = location_id_list.strip()
            location_id = row[1].strip()
            location_id_map[location] = location_id

    return location_id_map

def create_time_id_map(file_path):
    time_id_map = {}

    with open(file_path, 'r') as csvfile:
        reader = csv.reader(csvfile)
        header = next(reader)
        
        for row in reader:
            time_id_list = row[0].split(',')
            time = time_id_list.strip()
            time_id = row[1].strip()
            time_id_map[time] = time_id

    return time_id_map

def create_event_id_map(file_path):
    event_id_map = {}

    with open(file_path, 'r') as csvfile:
        reader = csv.reader(csvfile)
        header = next(reader)
        
        for row in reader:
            event_id_list = row[0].split(',')
            event = event_id_list.strip()
            event_id = row[1].strip()
            event_id_map[event] = event_id

    return event_id_map

def create_other_id_map(file_path):
    other_id_map = {}

    with open(file_path, 'r') as csvfile:
        reader = csv.reader(csvfile)
        header = next(reader)
        
        for row in reader:
            other_id_list = row[0].split(',')
            other = other_id_list.strip()
            other_id = row[1].strip()
            other_id_map[other] = other_id

    return other_id_map

In [14]:
file_path_person = '/Users/Desktop/ACM CONFERENCE WORK/Entities/Oil Natural Gas Case/person.csv'
file_path_location = '/Users/Desktop/ACM CONFERENCE WORK/Entities/Oil Natural Gas Case/location.csv'
file_path_time = '/Users/Desktop/ACM CONFERENCE WORK/Entities/Oil Natural Gas Case/time.csv'
file_path_event = '/Users/Desktop/ACM CONFERENCE WORK/Entities/Oil Natural Gas Case/event.csv'
file_path_other = '/Users/Desktop/ACM CONFERENCE WORK/Entities/Oil Natural Gas Case/activity.csv'
person_id_map = create_person_id_map(file_path_person)
location_id_map = create_location_id_map(file_path_location)
time_id_map = create_location_id_map(file_path_time)
event_id_map = create_location_id_map(file_path_event)
other_id_map = create_location_id_map(file_path_other)

In [15]:
def generate_ngrams(text, n=6):
    words = text.split()
    ngrams = []
    for j in range(n, 0, -1):
        for i in range(len(words)):
            if i + j <= len(words):
                ngrams.append(' '.join(words[i:i+j]))
    return ngrams

In [16]:
def preprocess_text(text):
    text = text.replace("'s", "")
    text = text.replace(".", "")
    text = text.replace(";", "")
#     text = text.replace("-", "")
    text = text.replace("!", "")
    text = text.replace("?", "")
#     text = text.replace("/", "")
    text = text.replace("@", "")
    text = text.replace("#", "")
    text = text.replace(",", "")
    return text

In [17]:
def replace_entities(text):
    text = text.lower()
    ngrams = generate_ngrams(text)
    replaced_text = text

    for ngram in ngrams:
        original = ngram.lower()
        
        if original in person_id_map:
            entity_id = person_id_map[original]
            replaced_text = replaced_text.replace(ngram, entity_id)
        else:
            preprocessed_ngram = preprocess_text(ngram.lower())
            if preprocessed_ngram in person_id_map:
                entity_id = person_id_map[preprocessed_ngram]
                replaced_text = replaced_text.replace(ngram, entity_id)
            else:
                if original in location_id_map:
                    entity_id = location_id_map[original]
                    replaced_text = replaced_text.replace(ngram, entity_id)
                elif preprocessed_ngram in location_id_map:
                    entity_id = location_id_map[preprocessed_ngram]
                    replaced_text = replaced_text.replace(ngram, entity_id)
                elif original in time_id_map:
                    entity_id = time_id_map[original]
                    replaced_text = replaced_text.replace(ngram, entity_id)
                elif preprocessed_ngram in time_id_map:
                    entity_id = time_id_map[preprocessed_ngram]
                    replaced_text = replaced_text.replace(ngram, entity_id)
                elif original in event_id_map:
                    entity_id = event_id_map[original]
                    replaced_text = replaced_text.replace(ngram, entity_id)
                elif preprocessed_ngram in event_id_map:
                    entity_id = event_id_map[preprocessed_ngram]
                    replaced_text = replaced_text.replace(ngram, entity_id)
                elif original in other_id_map:
                    entity_id = other_id_map[original]
                    replaced_text = replaced_text.replace(ngram, entity_id)
                elif preprocessed_ngram in other_id_map:
                    entity_id = other_id_map[preprocessed_ngram]
                    replaced_text = replaced_text.replace(ngram, entity_id)
                    
    return replaced_text

In [18]:
with open('sentences_final_concatenated_oil.csv', 'r', encoding='latin-1') as csvfile:
     reader = csv.reader(csvfile)
     next(reader)
     ref_sen = []
     for row in reader:
         sentence = row[0]
         ref_sen.append(sentence)

In [19]:
final_sentences = []

for sen in ref_sen:
     s = replace_entities(sen)
     final_sentences.append(s)

In [20]:
with open('cleaned_sentences_oil.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Sentences'])
#      for sen in final_sentences:
    for sen in final_sentences:
        writer.writerow([sen])

## Matrix Creation

In [21]:
# with the sentences
import pandas as pd
import csv
import re

# Load the CSV files
time_df = pd.read_csv(file_path_time)
location_df = pd.read_csv(file_path_location)
event_df = pd.read_csv(file_path_event)
person_df = pd.read_csv(file_path_person)
activity_df = pd.read_csv(file_path_other)

location_ID = location_df['Id'].tolist()
location_df = location_df['Location'].tolist()

time_ID = time_df['Id'].tolist()
time_df = time_df['Time'].tolist()

# event_ID = event_df['ID'].tolist()
# event_df = event_df['Event'].tolist()

# person_ID = person_df['ID'].tolist()
# person_df = person_df['Person'].tolist()

# activity_ID = activity_df['ID'].tolist()
# activity_df = activity_df['Others'].tolist()

event_ID_map = dict(zip(event_df['Id'], event_df['Event']))
person_ID_map = dict(zip(person_df['Id'], person_df['Person']))
activity_ID_map = dict(zip(activity_df['Id'], activity_df['Activity']))

matrix = {}

In [22]:
sentences = []
with open('cleaned_sentences_oil.csv', 'r', encoding='latin-1') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)
    for row in reader:
        sentence = row[0]
        sentences.append(sentence)

In [23]:
for s in sentences:
    pattern = r'([PTLEA]\d+)'
    matches = re.findall(pattern, s)

    persons = []
    time = []
    location = []
    event = []
    activity = []

    for match in matches:
        if match[0] == 'P':
            persons.append(person_ID_map.get(match, 'Unknown Person'))
        elif match[0] == 'L':
             location.append(location_ID.index(match))
        elif match[0] == 'E':
            event.append(event_ID_map.get(match, 'Unknown Event'))
        elif match[0] == 'A':
            activity.append(activity_ID_map.get(match, 'Unknown Activity'))
        elif match[0] == 'T':
            time.append(time_ID.index(match))

    if location and time:
        for loc_index in location:
            loc_key = location_df[loc_index]
            for time_index in time:
                time_key = time_df[time_index]
                if time_key not in matrix:
                    matrix[time_key] = {}
                if loc_key not in matrix[time_key]:
                    matrix[time_key][loc_key] = []
                matrix[time_key][loc_key].extend([(entity, s) for entity in (persons + event + activity)])
    elif location:
        for loc_index in location:
            loc_key = location_df[loc_index]
            if loc_key not in matrix:
                matrix[loc_key] = {}
            if 'NULL' not in matrix:
                matrix['NULL'] = {}
            if loc_key not in matrix["NULL"]:
                matrix["NULL"][loc_key] = []
            matrix["NULL"][loc_key].extend([(entity, s) for entity in (persons + event + activity)])
    elif time:
        for time_index in time:
            time_key = time_df[time_index]
            if time_key not in matrix:
                matrix[time_key] = {}
            if 'NULL' not in matrix[time_key]:
                matrix[time_key]['NULL'] = []
            matrix[time_key]["NULL"].extend([(entity, s) for entity in (persons + event + activity)])
    else:
        if 'NULL' not in matrix:
            matrix['NULL'] = {}
        if 'NULL' not in matrix['NULL']:
            matrix['NULL']['NULL'] = []
        matrix["NULL"]["NULL"].extend([(entity, s) for entity in (persons + event + activity)])

matrix_df = pd.DataFrame.from_dict(matrix, orient='index')

In [24]:
matrix_df.to_csv('matrix_with_sen_oil.csv')

In [25]:
import networkx as nx

In [26]:
df = pd.read_csv('matrix_with_sen_oil.csv')

In [27]:
df.rename(columns={'Unnamed: 0': 'Time'}, inplace=True)

In [28]:
G = nx.Graph()

for index, row in df.iterrows():
    time = row['Time']
    for location in df.columns:
        if location == 'Time':
            continue
        # Convert the tuple to a string and add it as a node
        G.add_node(f"{time}, {location}")

In [29]:
import ast

In [30]:
final = set()

# Update the final set with only the entities
for value in pd.unique(df.drop(columns='Time').values.ravel()):
    if pd.notnull(value):
        value_list = ast.literal_eval(value)
        entity_list = [entity for entity, sentence in value_list]
        final.update(entity_list)

In [31]:
G.add_nodes_from(final)

In [32]:
for index, row in df.iterrows():
    time = row['Time']
    for location in df.columns:
        if location == 'Time':
            continue
        if pd.notnull(row[location]):
            value_list = ast.literal_eval(row[location])
            entity_list = [entity for entity, sentence in value_list]
            for entity in entity_list:
                if entity in final:
                    # Convert the tuples to strings and add them as an edge
                    G.add_edge(f"{time}, {location}", entity)

In [33]:
for node in list(G.nodes):
    if not list(G.neighbors(node)):
        G.remove_node(node)
        

In [34]:
for node1 in G.nodes:
    # Check if node1 can be split into a pair
    if ', ' in node1:
        parts1 = node1.split(", ")
        if len(parts1) == 2:  # Ensure it splits into exactly two parts
            time1, location1 = parts1
            for node2 in G.nodes:
                # Check if node2 can be split into a pair
                if ', ' in node2:
                    parts2 = node2.split(", ")
                    if len(parts2) == 2:  # Ensure it splits into exactly two parts
                        time2, location2 = parts2
                        # Add edge if the time or location matches
                        if time1 == time2 or location1 == location2:
                            G.add_edge(node1, node2)

In [35]:
degree_dict = dict(G.degree(G.nodes()))
nx.set_node_attributes(G, degree_dict, 'degree')


In [36]:
# Create a dictionary to store the sentences for each node
sentences_dict = {}

# Create a dictionary to store the sentences for each edge i.e. between two nodes
edge_sentences_dict = {}

# Update the sentences_dict and edge_sentences_dict
for index, row in df.iterrows():
    time = row['Time']
    for location in df.columns:
        if location == 'Time':
            continue
        if pd.notnull(row[location]):
            value_list = ast.literal_eval(row[location])
            for entity, sentence in value_list:
                if entity in final:
                    # Add the sentence to the sentences_dict for the entity
                    if entity not in sentences_dict:
                        sentences_dict[entity] = set()
                    sentences_dict[entity].add(sentence)

                    # Add the sentence to the sentences_dict for the (time, location) node
                    time_location_node = f"{time}, {location}"
                    if time_location_node not in sentences_dict:
                        sentences_dict[time_location_node] = set()
                    sentences_dict[time_location_node].add(sentence)

                    # Add the sentence to the edge_sentences_dict for the edge
                    edge = (time_location_node, entity)
                    if edge not in edge_sentences_dict:
                        edge_sentences_dict[edge] = set()
                    edge_sentences_dict[edge].add(sentence)

# Convert the sets back to lists if needed
for node, sentences in sentences_dict.items():
    sentences_dict[node] = list(sentences)

for edge, sentences in edge_sentences_dict.items():
    edge_sentences_dict[edge] = list(sentences)


In [37]:
pip install node2vec


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [38]:
from node2vec import Node2Vec

# Create a Node2Vec instance
node2vec = Node2Vec(G, dimensions=64, walk_length=30, num_walks=200, workers=4)

# Generate embeddings
model = node2vec.fit(window=10, min_count=1, batch_words=4)


Computing transition probabilities:   0%|          | 0/6 [00:00<?, ?it/s]

Generating walks (CPU: 4): 100%|██████████| 50/50 [00:00<00:00, 1759.95it/s]
Generating walks (CPU: 2): 100%|██████████| 50/50 [00:00<00:00, 1748.56it/s]
Generating walks (CPU: 3): 100%|██████████| 50/50 [00:00<00:00, 1732.08it/s]
Generating walks (CPU: 1): 100%|██████████| 50/50 [00:00<00:00, 1786.47it/s]


In [39]:
from sklearn.metrics.pairwise import cosine_similarity

# Get the node embeddings
node_embeddings = {node: model.wv[node] for node in G.nodes}

# Compute cosine similarity matrix
similarity_matrix = cosine_similarity(list(node_embeddings.values()))


In [40]:
from sklearn.cluster import DBSCAN

# Initialize DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=3)  # Adjust parameters as needed

# Fit and predict clusters
labels = dbscan.fit_predict(similarity_matrix)

In [41]:
cluster_colors = [
    'rgb(31, 119, 180)',  # Blue
    'rgb(255, 127, 14)',  # Orange
    'rgb(44, 160, 44)',   # Green
    'rgb(214, 39, 40)',   # Red
    'rgb(148, 103, 189)', # Purple
    'rgb(140, 86, 75)',   # Brown
    'rgb(227, 119, 194)', # Pink
    'rgb(127, 127, 127)', # Gray
    'rgb(188, 189, 34)',  # Olive
    'rgb(23, 190, 207)',  # Teal
    'rgb(255, 187, 120)', # Peach
    'rgb(214, 39, 40)',   # Maroon
    'rgb(77, 175, 74)',   # Light Green
    'rgb(152, 78, 163)',  # Plum
    'rgb(255, 152, 150)'  # Salmon
]


In [42]:
import networkx as nx
import plotly.graph_objs as go

# Assuming 'G' is your graph and 'labels' is the list of labels
# Renumber clusters from 1 to 7 if needed, otherwise ensure you have clusters numbered 1 to 7
unique_labels = list(set(labels))
label_mapping = {label: idx+1 for idx, label in enumerate(unique_labels)}

def draw_subgraph_plotly(subgraph, cluster_id, cluster_color):
    pos = nx.spring_layout(subgraph, seed=42)  # Fixed seed for reproducibility
    
    # Extract the node positions
    x_nodes = [pos[node][0] for node in subgraph.nodes()]
    y_nodes = [pos[node][1] for node in subgraph.nodes()]

    # Extract the edges
    edge_x = []
    edge_y = []
    for edge in subgraph.edges():
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    # Edge trace
    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=1, color='gray'),
        hoverinfo='none',
        mode='lines')

    # Node trace
    node_trace = go.Scatter(
        x=x_nodes, y=y_nodes,
        mode='markers+text',
        text=[f'{node}' for node in subgraph.nodes()],
        textposition="top center",
        marker=dict(
            showscale=False,
            color=cluster_color,  # Assign the cluster color
            size=20,
            line=dict(width=2, color='black')
        ),
        hoverinfo='text'
    )

    # Create figure
    fig = go.Figure(data=[edge_trace, node_trace],
                    layout=go.Layout(
                        title=f'Cluster {cluster_id}',
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=20, l=5, r=5, t=40),
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        height=600,
                        width=600,
                        paper_bgcolor='white',
                        plot_bgcolor='white'
                    ))

    fig.show()

# Iterate over each cluster and draw the subgraph
for original_cluster_id in set(labels):
    cluster_id = label_mapping[original_cluster_id]
    cluster_color = cluster_colors[cluster_id - 1]  # Assign a unique color to each cluster
    cluster_nodes = [node for node, label in zip(G.nodes, labels) if label == original_cluster_id]
    subgraph = G.subgraph(cluster_nodes)
    draw_subgraph_plotly(subgraph, cluster_id, cluster_color)

In [43]:
# # Initialize lists for nodes and links
# nodes = []
# links = []

# # Add nodes to the list
# for node, sentences in sentences_dict.items():
#     nodes.append({
#         "id": node,
#         "group": 1,  # Update this as needed
#         "size": len(sentences)  # The size could be based on the number of sentences
#     })

# # Add links to the list
# for edge, sentences in edge_sentences_dict.items():
#     source, target = edge
#     links.append({
#         "source": source,
#         "target": target,
#         "value": len(sentences)  # The value could be based on the number of sentences
#     })

# # Combine nodes and links into a single dictionary
# graph_data = {
#     "nodes": nodes,
#     "links": links
# }


In [44]:
# import os

# os.environ['METIS_DLL'] = '/Users/.local/lib/libmetis.dylib'

In [45]:
# import metis

In [46]:
# edgecuts, parts = metis.part_graph(G)

In [47]:
import json

# Create a mapping from nodes to integers
node_to_int = {node: i for i, node in enumerate(G.nodes)}


In [48]:
degree_dict = dict(G.degree(G.nodes()))
nx.set_node_attributes(G, degree_dict, 'degree')

data = nx.node_link_data(G)




The default value will be `edges="edges" in NetworkX 3.6.


  nx.node_link_data(G, edges="links") to preserve current behavior, or
  nx.node_link_data(G, edges="edges") for forward compatibility.



In [49]:
import numpy as np

In [50]:
labels = labels.astype(np.int32)

In [51]:
graph_data = {
    "nodes": [{"name": str(node), "n": degree_dict[node], "grp": int(labels[i]), "id": str(node)} for i, node in enumerate(G.nodes())],
    "links": [{"source": str(link_data['source']), "target": str(link_data['target']), "value": 1} for link_data in data['links']]
}

In [52]:
json_data = json.dumps(graph_data)
with open('data_oil.json', 'w') as f:
    f.write(json_data)

In [53]:
# from collections import defaultdict

# # Create a dictionary where keys are part numbers and values are lists of nodes
# part_dict = defaultdict(list)
# for node, part in enumerate(parts):
#     part_dict[part].append(node)

# # Print the number of parts
# print("Number of parts:", len(part_dict))

# # Print the nodes in each part
# for part, nodes in part_dict.items():
#     print("Part", part, ":", nodes)

In [54]:
# Assuming you have already created clusters using DBSCAN and have 'labels' and 'G' available

# Initialize an empty dictionary to store sentences for each cluster
cluster_sentences_dict = {}

# Iterate over each cluster
for original_cluster_id in set(labels):
    cluster_id = label_mapping[original_cluster_id]
    cluster_nodes = [node for node, label in zip(G.nodes, labels) if label == original_cluster_id]
    
    # Initialize a set to store sentences for this cluster
    cluster_sentences = set()
    
    # Iterate over nodes in the cluster
    for node in cluster_nodes:
        # Assuming 'node' contains the relevant information (e.g., entity, time, location)
        # Extract sentences associated with this node and add them to the cluster_sentences set
        if node in sentences_dict:
            cluster_sentences.update(sentences_dict[node])
    
    # Store the cluster_sentences set in the cluster_sentences_dict
    cluster_sentences_dict[cluster_id] = cluster_sentences

In [55]:
sentences_dict

{'commencement of arbitral proceedings': ['this power includes the power\nto determine the admissibility, relevance, the materiality and weight of any evidence. sections 20, 21\nand 22 deal with place of arbitration, E17 and language\nrespectively. thereafter, sections 23, 24 and 25 deal with statements of claim and defence, hearings\nand written proceedings and procedure to be followed in case of default of a party.'],
 'nan, NULL': ['in response thereto, it was pointed out that it was not\nthe case of learned counsel mr. setalwad on behalf of the claimants that "these stipulations in the\ncontract for deduction of liquidated damages was by way of penalty". further, the arbitral tribunal\nobserved that in view of the decisions rendered in fateh chand and maula bux cases, "all that we\nare required to consider is whether the respondents have established their case of actual loss in\nmoney terms because of the delay in the supply of the casing pipes under the contract between the\nparti

In [56]:
with open("output.txt", "w") as f:
    f.write(str(sentences_dict))