In [1]:
import numpy as np
import pandas as pd # type: ignore
import csv
import os
import json
import scipy.spatial as sp
from uuid import uuid1
from functools import partial
from multiprocessing import Pool, Manager
import random

In [5]:
def giveUUID1():
    """Return a short identifier: the first 8 characters of a fresh UUID1."""
    fresh_id = uuid1()
    # The leading 8 hex digits are identical in str(uuid) and uuid.hex.
    return fresh_id.hex[:8]

def my_cosine_similarity(matrix_A, matrix_B):
    """Cosine similarity of two equally sized arrays, compared as flat vectors.

    Parameters
    ----------
    matrix_A, matrix_B : array-like
        Numeric arrays (or nested lists) holding the same number of elements.

    Returns
    -------
    float
        dot(a, b) / (|a| * |b|). Returns 0.0 when either input is all zeros,
        instead of the NaN (plus RuntimeWarning) that 0/0 would produce.
    """
    # Convert to float64 first: np.dot on integer arrays wraps around silently
    # when the products exceed the int64 range.
    a = np.asarray(matrix_A, dtype=float).ravel()
    b = np.asarray(matrix_B, dtype=float).ravel()
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # At least one vector has zero length, so the angle is undefined.
        return 0.0
    return float(np.dot(a, b) / denominator)

def compute_gcd(x, y):
    """Greatest common divisor of x and y via the Euclidean algorithm."""
    while y != 0:
        remainder = x % y
        x = y
        y = remainder
    return x

def compute_lcm(x, y):
    """Least common multiple of two integers.

    Returns 0 when either argument is 0. Without that guard, (0, 0) would
    divide by compute_gcd(0, 0), which is 0.
    """
    if x == 0 or y == 0:
        return 0
    # Divide before multiplying: the gcd divides x exactly, so the result is
    # unchanged while the intermediate value stays as small as possible.
    return x // compute_gcd(x, y) * y

def normalize_cosine_similarity(curr_sum, other_sum, curr_matrix, other_matrix):
    """Cosine distance (1 - cosine similarity) between two count matrices.

    Each matrix is scaled by lcm(curr_sum, other_sum) / its own sum (and by a
    constant 100) before the similarity is taken.

    Parameters
    ----------
    curr_sum, other_sum : int
        Totals of the entries of ``curr_matrix`` and ``other_matrix``.
    curr_matrix, other_matrix : array-like
        Count matrices with the same number of elements.

    Returns
    -------
    float
        1 - my_cosine_similarity(scaled_curr, scaled_other).

    Raises
    ------
    ValueError
        If either sum is zero, since no scaling factor exists for it.
    """
    if curr_sum == 0 or other_sum == 0:
        raise ValueError("cannot normalize a matrix whose entries sum to zero")

    lcm_matrix = compute_lcm(curr_sum, other_sum)
    # The lcm is a multiple of both sums, so floor division is exact and avoids
    # the precision loss of float division followed by int().
    normalize_factor_curr = lcm_matrix // curr_sum
    normalize_factor_other = lcm_matrix // other_sum

    # Scale in float64 so large factors cannot wrap around the int64 range.
    # Uniform scaling does not change a cosine similarity mathematically; the
    # factors (including the constant 100) are kept as in the original formula.
    np_curr_matrix = np.asarray(curr_matrix, dtype=float) * normalize_factor_curr * 100
    np_other_matrix = np.asarray(other_matrix, dtype=float) * normalize_factor_other * 100

    cosine_distance = 1 - my_cosine_similarity(np_curr_matrix, np_other_matrix)
    return cosine_distance

In [22]:
def create_train_test(file_name, csv_path, seen_graph_matrix, keywords, ncs_list, combinations):
    """Load one connectivity-matrix CSV and pair it with every file registered so far.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside ``csv_path``.
    csv_path : str
        Directory that contains the CSV files.
    seen_graph_matrix : dict-like
        Shared mapping ``file_name -> (edge_list, matrix, matrix_sum)``; this
        call adds the entry for ``file_name``.
    keywords : list or None
        Labels stored in every JSON payload. When None, the ``"Keywords"``
        column of this CSV is used.
    ncs_list : list-like
        Shared list; the score of every emitted pair is appended to it.
    combinations : dict-like
        Shared mapping ``file_name -> set of file names it was paired with``.

    Returns
    -------
    list of (dict, str)
        One ``(json_payload, json_filename)`` tuple per newly created pair.
    """
    import time  # local import: only needed for the polling wait below

    file_path = os.path.join(csv_path, file_name)
    data = pd.read_csv(file_path, header=0)

    if keywords is None:
        keywords = list(data["Keywords"])

    # Every column after the first holds the matrix values.
    curr_matrix = data.iloc[:, 1:].values
    # A count of k at (head, tail) becomes k copies of the edge [head, tail].
    curr_graph = [
        [head, tail]
        for head in range(curr_matrix.shape[0])
        for tail in range(curr_matrix.shape[1])
        for _ in range(curr_matrix[head, tail])
    ]
    curr_sum = np.sum(curr_matrix)
    seen_graph_matrix[file_name] = (curr_graph, curr_matrix, curr_sum)

    # Names this file has been paired with. It is kept locally and re-assigned
    # into the shared mapping after each change, because mutating a value read
    # from a manager dict does not propagate back to the manager.
    paired_with = set()
    combinations[file_name] = paired_with
    results = []

    # NOTE(review): two workers that register at the same moment can each see
    # the other with an empty set and both emit the pair (A-B and B-A). Fixing
    # that needs a shared lock, which this signature does not provide.
    for other_file_name in list(combinations.keys()):
        if other_file_name == file_name:
            continue
        # Poll with a short sleep instead of spinning at full CPU.
        while other_file_name not in seen_graph_matrix:
            time.sleep(0.01)
        other_graph, other_matrix, other_sum = seen_graph_matrix[other_file_name]
        if file_name in combinations[other_file_name]:
            # The other file already produced this pair.
            continue
        # Record the pairing by adding to the set. Overwriting the entry with
        # the bare file name would turn the membership test above into a
        # substring check and skip unrelated pairs.
        paired_with.add(other_file_name)
        combinations[file_name] = paired_with

        ncs = normalize_cosine_similarity(curr_sum, other_sum, curr_matrix, other_matrix)
        curr_json = {
            "graph_1": curr_graph,
            "graph_2": other_graph,
            "labels_1": keywords,
            "labels_2": keywords,
            "ged": ncs
        }
        # Each name contributes at most its first 20 characters to the file name.
        json_filename = f"{file_name[:20]}-{other_file_name[:20]}.json"

        results.append((curr_json, json_filename))
        ncs_list.append(ncs)

    return results

#     for file_name in tqdm(os.listdir(csv_path)):
# graph, matrix, sum
# seen_graph_matrix[file_name[:-4]] = (curr_graph, curr_matrix, curr_sum)

def parallelize(folder_name, csv_path="connMatrixCSV/AgricultureData-042124-8dd37ece/",
                parent_folder="dataset", train_fraction=0.8):
    """Build the pairwise JSON dataset from every CSV in ``csv_path`` using a process pool.

    Parameters
    ----------
    folder_name : str
        Suffix of the output folder; a short UUID prefix is prepended to it.
    csv_path : str, optional
        Directory holding the connectivity-matrix CSV files.
    parent_folder : str, optional
        Directory under which the output folder is created.
    train_fraction : float, optional
        Probability that a generated pair is written to ``train``; the rest
        go to ``test``.

    Returns
    -------
    None
        The JSON files are written to
        ``<parent_folder>/<uuid>-<folder_name>/{train,test}/`` and the list of
        scores is printed.
    """
    keywords = None

    folder_name = f"{giveUUID1()}-{folder_name}"
    output_root = os.path.join(parent_folder, folder_name)
    for split_name in ("test", "train"):
        os.makedirs(os.path.join(output_root, split_name), exist_ok=True)

    # Only hand CSV files to the workers; anything else in the directory would
    # fail inside pd.read_csv or when the "Keywords" column is looked up.
    csv_files = sorted(name for name in os.listdir(csv_path) if name.endswith(".csv"))

    # Both context managers are released on exit: the pool first, then the
    # manager process that owns the shared containers.
    with Manager() as manager, Pool() as pool:
        # pdfname : (graph, matrix, sum)
        seen_graph_matrix = manager.dict()
        combinations = manager.dict()
        ncs_list = manager.list()
        process_matrix = partial(
            create_train_test,
            csv_path=csv_path,
            seen_graph_matrix=seen_graph_matrix,
            keywords=keywords,
            ncs_list=ncs_list,
            combinations=combinations,
        )

        for results in pool.imap_unordered(process_matrix, csv_files):
            for curr_json, json_filename in results:
                # The majority share goes to "train" (the labels used to be swapped).
                model_train_or_test = "train" if random.random() < train_fraction else "test"
                json_name = os.path.join(output_root, model_train_or_test, json_filename)
                with open(json_name, "w") as file:
                    json.dump(curr_json, file)

        # Printed while the manager is still alive; the proxy is unusable afterwards.
        print(ncs_list)


if __name__ == "__main__":
    # Run the dataset build only when this file is executed directly, not when
    # it is imported.
    parallelize("test_set")

[1.0, 0.9976151236758831, 1.0, 0.9977198406751413, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9959965555560298, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9978025427448661, 1.0, 1.0, 0.9967916401864488, 0.9968812373793411, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9886149442350064, 1.0, 1.0, 1.0, 0.9845400418459687, 1.0, 0.9930008293525855, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9858917715553523, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9968547393447907, 1.0, 1.0, 0.9967617014990573, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9852188696787479, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9971263299993784, 1.0, 1.0, 0.9967475159831753, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9939807073457115, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9903263044522175, 1.0, 1.0, 1.0, 0.9903263044522175, 0.9510450606417166, 1.0, 0.9917882998048674, 1.0, 1.0, 1.0, 1.0, 1.0, 0.962823255795066, 1.0, 0.997

In [6]:
import math

# Presumably scores pasted from an earlier run of the pipeline (TODO confirm).
a = [0.8376273698859185, 1.0, 0.804134737724209, 0.9906350546024151, 1.0, 0.999379733041431, 1.0, 1.0, 0.9971692308603514, 1.0, 0.9954492005327866, 1.0, 1.0, 0.9942811908026784, 0.9829002202672691, 0.992127764637217, 0.9194248578278925, 1.0, 0.993123214543967, 1.0, 1.0, 1.0, 0.995463561725975, 0.9982135435768786, 0.9984512602209843, 0.9823086910413454, 0.9846982708499421, 0.9987551930444917, 0.9993047992944879, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
lena = len(a)
# math has no mean(); math.fsum gives an accurately rounded float sum, which
# replaces the manual accumulation loop.
suma = math.fsum(a)
print(suma / lena)

AttributeError: module 'math' has no attribute 'mean'

In [15]:
# Load CSV

csv_path = "adjMatrixCSV/041124-e699802c-f830-11ee-8afb-0adac10d6833.csv"
# The csv module documentation asks for files to be opened with newline=""
# so that newlines embedded in quoted fields are interpreted correctly.
with open(csv_path, newline="") as csv_file:
    csv_reader = csv.reader(csv_file, delimiter = ',')
    # Only the header row is needed; an empty file yields an empty list.
    list_of_column_names = next(csv_reader, [])
pandas_df = pd.read_csv(csv_path)
adjacency_matrix = np.array(pandas_df)
print(list_of_column_names)
print(adjacency_matrix)


['keyword', 'agriculture_usda_article_p1.pdf', 'agriculture_usda_article_p2.pdf', 'agriculture_usda_article_p3.pdf', 'agriculture_usda_article_p4.pdf']
[['erosion' 3 4 3 0]
 ['drainage' 2 0 0 0]
 ['drainage channel' 1 0 0 0]
 ['organic matter' 3 0 0 0]
 ['soil' 2 2 0 0]
 ['root' 4 0 0 0]
 ['soil organic matter' 1 0 0 0]
 ['macropore' 2 0 0 0]
 ['rill' 0 1 0 0]
 ['soil crusting' 0 2 0 0]
 ['rill erosion' 0 1 0 0]
 ['strip-till' 0 1 0 0]
 ['no-till' 0 1 0 0]
 ['ridge-till' 0 3 0 0]
 ['tillage' 0 2 0 3]
 ['till' 0 1 0 0]
 ['sediment' 0 0 3 0]
 ['runoff' 0 0 1 0]
 ['soil erosion' 0 0 3 0]
 ['erosion by wind' 0 0 3 0]
 ['water' 0 0 1 0]
 ['field' 0 0 1 0]
 ['windbreak' 0 0 3 0]
 ['wind barrier' 0 0 1 0]
 ['cross wind trap strips' 0 0 1 0]
 ['conservation tillage' 0 0 0 2]
 ['conservation' 0 0 0 1]
 ['efficiency' 0 0 0 4]
 ['cost' 0 0 0 1]
 ['burndown' 0 0 0 3]
 ['weed control' 0 0 0 2]
 ['herbicide' 0 0 0 1]
 ['profit' 0 0 0 3]
 ['pesticide' 0 0 0 2]
 ['fertilizer' 0 0 0 3]
 ['seed' 0 0 0 1

In [1]:
# GET ALL CONN MATRIX CSV
csv_path = "connMatrixCSV/AgricultureData-041424-b2d75432-fab8-11ee-aeaf-0adac10d6833/"
keywords = None
flattened_matricies = []
# os.listdir returns names in arbitrary order; sorting makes the row order of
# flattened_matricies (and any positional split of it) reproducible.
for file_name in sorted(os.listdir(csv_path)):
    if not file_name.endswith('.csv'):
        continue
    file_path = os.path.join(csv_path, file_name)
    data = pd.read_csv(file_path, header=0)
    if keywords is None:
        keywords = data["Keywords"]
    data = data.iloc[:, 1:].values
    # Upper triangle including the diagonal, read row by row. A single fancy
    # index replaces repeated np.append calls, which re-copy the growing array
    # on every row.
    rows, cols = np.triu_indices(data.shape[0], m=data.shape[1])
    flattened_data = data[rows, cols]
    maxVal = np.max(flattened_data)
    if maxVal != 0:
        flattened_data = flattened_data / maxVal
    else:
        # An all-zero matrix stays all zeros instead of becoming NaN via 0/0.
        flattened_data = flattened_data.astype(float)
    flattened_matricies.append(flattened_data)

flattened_matricies = np.array(flattened_matricies)

KeyboardInterrupt: 

In [24]:
import tensorflow as tf  
from tensorflow import keras
import cv2
import numpy as np
import math

ModuleNotFoundError: No module named 'cv2'

In [99]:
# Positional split: the first 80% of the rows train the model, the rest are held out.
TRAIN_TEST_SPLIT = math.floor(len(flattened_matricies) * 0.8)
INPUT_LENGTH = len(flattened_matricies[0])

# Add a length-1 axis at position 1 so each sample has the (1, INPUT_LENGTH)
# shape declared on the model input below.
train_data = flattened_matricies[:TRAIN_TEST_SPLIT][:, np.newaxis]
test_data = flattened_matricies[TRAIN_TEST_SPLIT:][:, np.newaxis]

# Encoder: one 32-unit dense layer on top of the input.
encoder_input = keras.Input(name="connectivityMatrix", shape=(1, INPUT_LENGTH))
encoder_output = keras.layers.Dense(units=32, activation="relu")(encoder_input)

encoder = keras.Model(inputs=encoder_input, outputs=encoder_output, name="encoder")

# Decoder: another 32-unit dense layer, then a layer back at the input width.
decoder_input = keras.layers.Dense(units=32, activation="relu")(encoder_output)
decoder_output = keras.layers.Dense(units=INPUT_LENGTH, activation="relu")(decoder_input)

optimizer = keras.optimizers.Adam(learning_rate=0.001)

# The full model maps the encoder input to the decoder output.
autoencoder = keras.Model(inputs=encoder_input, outputs=decoder_output, name="autoencoder")
autoencoder.summary()

In [100]:
# Mean squared error between the input and its reconstruction is the training loss.
autoencoder.compile(optimizer=optimizer, loss="mse")

In [101]:
# Input and target are both train_data: the model learns to reproduce its input.
autoencoder.fit(
    x=train_data,
    y=train_data,
    batch_size=3,
    epochs=3,
    validation_split=0.1,
)

Epoch 1/3
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - loss: 0.0011 - val_loss: 7.6258e-04
Epoch 2/3
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0011 - val_loss: 7.6213e-04
Epoch 3/3
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0011 - val_loss: 7.6203e-04


<keras.src.callbacks.history.History at 0x2e7b43990>

In [103]:
# NOTE(review): indexing with [0] appears to keep only the first test sample's
# reconstruction; confirm that the remaining predictions are not needed.
predictions = autoencoder.predict([test_data])
ae_out = predictions[0]
print(ae_out)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[[0.         0.00680224 0.         ... 0.         0.         0.        ]]


In [11]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import json

# Load the CSV data
data = pd.read_csv('connMatrixCSV/BigSample/1-s2.0-S0048969721043369-main.csv')
data = data.iloc[:, 1:]

# Load the JSON file
with open("SimGNN/dataset/test/1-s2.0-S004896972104-2104.14294v2.csv.json", mode="r") as file:
    jsonfile = json.load(file)
    keywords = jsonfile['labels_1']

# Label the rows with the loaded keywords (they were read but never used).
# Fall back to seaborn's automatic labels when the counts do not line up.
row_labels = keywords if len(keywords) == len(data) else "auto"

# Plot the heat map
# NOTE(review): annot=True draws a text label in every cell, which may be slow
# for a large matrix; confirm it is wanted for this file.
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(data, annot=True, cmap='coolwarm', yticklabels=row_labels, ax=ax)

# Add title and labels
ax.set_title('Keyword Frequency Heat Map')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Keywords')

# Show the heat map
plt.show()


KeyboardInterrupt: 

Error in callback <function flush_figures at 0x7fd5d8b45268> (for post_execute):


KeyboardInterrupt: 