In [1]:
"""
Imports
The core
"""
import MAG_Util as MAG_Util
import MAG_Manager_Dataframe as MAG_Manager_Dataframe
import MAG_Graph_Layered as MAG_Graph_Layered


Generate a sample citation graph and save it as a CSV edge list

In [2]:
"""
Generate a random directed graph that stands in for a citation network.
Each edge means "paper1 cites paper2" and becomes one CSV row:
- u = paper1 (the citing paper)
- v = paper2 (the cited paper)
- w = reference count; the weights are generated with
      argv_weight_type="destination", so the value follows paper2
"""

# generate the graph as a random one; the seed is fixed so reruns give the same edges
import Generator_Graph as Generator_Graph
current_edges_citations = Generator_Graph.generated_graph_simple(
    argv_count_vertices=100,
    argv_count_edges=300,
    argv_flag_directed=True,
    argv_weight_type="destination",
    argv_weight_range=[1,100],
    argv_random_seed=1234
    )

# write the edge list to a CSV file
# the context manager closes the file even when one of the writes raises
filename_graph = "Sample_Citations.csv"
with open(filename_graph, "w") as file_graph:
    # header
    file_graph.write("paper1,paper2,reference\n")
    # add in the rows 1 by 1 (each edge is expected to already be a CSV row string)
    for current_edge in current_edges_citations:
        file_graph.write(current_edge + "\n")

In [3]:
# map each dataset id to the CSV file that backs it
dict_filenames_dataset = {"Citations": "Sample_Citations.csv"}

# hand the mapping to the dataframe manager and let it read the files
manager_dataframe = MAG_Manager_Dataframe.Manager_Dataframe(
    argv_filename=dict_filenames_dataset
    )
manager_dataframe.load_dataframes()

In [4]:
# fetch the citation edge list that the manager loaded
df_citations = manager_dataframe.get_dataframe(argv_id="Citations")

# unit weight per citation, for the variant that ignores the reference counts
df_citations["cite"] = 1

# show the first 10 rows as a sanity check
preview_citations = df_citations.head(10)
print(preview_citations)

   paper1  paper2  reference  cite
0      49      97         60     1
1      44      63         91     1
2      43      74         72     1
3      59      60          1     1
4       5      60          1     1
5      76      21         79     1
6      56      93         20     1
7      99      66         49     1
8      56      37         77     1
9      35      15          4     1


In [5]:
# build a lookup from each cited paper (paper2) to its reference value
current_papers = df_citations["paper2"].to_arrow().to_pylist()
current_references = df_citations["reference"].to_arrow().to_pylist()
dict_references = {}
for cited_paper, reference_count in zip(current_papers, current_references):
    # only the first value seen for a paper is kept
    dict_references.setdefault(cited_paper, reference_count)
# order the papers by ascending reference value
dict_references_sorted = dict(sorted(dict_references.items(), key=lambda pair: pair[1]))
# list every paper with its value
for cited_paper, reference_count in dict_references_sorted.items():
    print(str(cited_paper) + " : " + str(reference_count))

60 : 1
3 : 1
28 : 2
16 : 3
14 : 3
57 : 3
15 : 4
77 : 4
6 : 5
42 : 6
86 : 6
98 : 6
68 : 7
32 : 9
36 : 9
45 : 9
73 : 10
59 : 10
69 : 10
9 : 11
24 : 12
39 : 12
10 : 13
27 : 15
80 : 15
2 : 15
62 : 19
76 : 19
93 : 20
23 : 20
95 : 20
72 : 21
67 : 22
91 : 23
25 : 24
75 : 28
13 : 31
51 : 32
31 : 32
92 : 35
50 : 35
43 : 35
88 : 36
94 : 39
47 : 39
84 : 40
61 : 43
17 : 45
12 : 46
66 : 49
87 : 54
1 : 57
54 : 59
85 : 59
97 : 60
22 : 60
35 : 60
64 : 61
48 : 62
20 : 62
30 : 63
71 : 64
29 : 65
40 : 66
58 : 66
55 : 68
81 : 68
34 : 70
74 : 72
44 : 72
99 : 73
41 : 75
5 : 75
70 : 76
37 : 77
21 : 79
19 : 80
18 : 83
52 : 84
89 : 85
65 : 85
7 : 86
83 : 86
56 : 86
33 : 86
38 : 87
8 : 89
63 : 91
26 : 92
11 : 99
79 : 99
0 : 100
96 : 100


With the reference counts in hand, we can now look at two possible variants:
- Shortest citation chain (every citation has weight 1)
- Citation impact (every citation has weight 1/reference)

In [6]:
# variant 1: shortest citation chain, every citation counts as a single hop
colname_source = "paper1"
colname_destination = "paper2"
colname_weight = "cite"

# generate the graph
current_graph = MAG_Graph_Layered.Graph_Layered(
    argv_dataframe=df_citations,
    argv_source=colname_source, argv_destination=colname_destination, argv_weight=[colname_weight]
    )

# what if we want the chain to go through specific papers?
# a 'landmark' paper acts as the intermediate between the 2 papers we query
# landmark papers are those with at least `landmark_min_references` references;
# they are derived from the lookup built earlier instead of being typed in by hand,
# so the list stays correct if the seed or the graph size changes
landmark_min_references = 80
current_landmark_papers = [
    current_paper
    for current_paper, current_reference in dict_references_sorted.items()
    if current_reference >= landmark_min_references
]
# one group of intermediate vertices, paired with one weight for that group
current_intermediate_vertices = [current_landmark_papers]
current_intermediate_weights = [0]

# generate the layer graph with this data
current_graph.generate_layer_graph(
    argv_intermediates_vertices=current_intermediate_vertices,
    argv_intermediates_weights=current_intermediate_weights
    )
print(current_graph.get_stat_weights())

# let say we want the relationship between these papers
starting_paper = 3
ending_paper = 20

# run dijkstra (the method name `run_dijsktra` is spelled as the library defines it)
# vertex ids are passed as strings
current_distance, current_path, timer_dijkstra, timer_pathbuilding = current_graph.run_dijsktra(
    argv_start=str(starting_paper), argv_end=str(ending_paper), argv_path=True
    )
# print out the information
# NOTE(review): in the recorded output the path runs from ending_paper back to
# starting_paper, and labels carry a "_<n>" suffix, presumably the layer index
print("Distance: " + str(current_distance))
print("Path: " + str(current_path))

{'min': 1, 'max': 1, 'mean': 1.0, 'median': 1.0, 'zero': 0}
Distance: 5.0
Path: ['20_1', '52_1', '52_0', '89_0', '33_0', '50_0', '3_0']


In [7]:
# same query with path building switched off; the result is kept in a single variable
# NOTE(review): presumably a distance/predecessor table, judging by the name;
# confirm against run_dijsktra
current_table_distance_pred = current_graph.run_dijsktra(
    argv_start=str(starting_paper),
    argv_end=str(ending_paper),
    argv_path=False,
)
# print(current_table_distance_pred)

In [8]:
# variant 2: citation impact
# same pipeline as the shortest-chain variant, except each edge weighs 1/reference
df_citations["impact"] = 1/df_citations["reference"]
print(df_citations.head())

# then we do everything the same as above except the weight
colname_source = "paper1"
colname_destination = "paper2"
colname_weight = "impact"

# generate the graph
current_graph = MAG_Graph_Layered.Graph_Layered(
    argv_dataframe=df_citations,
    argv_source=colname_source, argv_destination=colname_destination, argv_weight=[colname_weight]
    )
print(current_graph.get_stat_weights())

# what if we want the chain to go through specific papers?
# a 'landmark' paper acts as the intermediate between the 2 papers we query
# landmark papers are those with at least `landmark_min_references` references;
# they are derived from the lookup built earlier instead of being typed in by hand,
# so the list stays correct if the seed or the graph size changes
landmark_min_references = 80
current_landmark_papers = [
    current_paper
    for current_paper, current_reference in dict_references_sorted.items()
    if current_reference >= landmark_min_references
]
# one group of intermediate vertices, paired with one weight for that group
current_intermediate_vertices = [current_landmark_papers]
current_intermediate_weights = [0]

# generate the layer graph with this data
current_graph.generate_layer_graph(
    argv_intermediates_vertices=current_intermediate_vertices,
    argv_intermediates_weights=current_intermediate_weights
    )

# let say we want the relationship between these papers
starting_paper = 3
ending_paper = 20

# run dijkstra (the method name `run_dijsktra` is spelled as the library defines it)
# argv_path=True is spelled out because the result is unpacked into four values,
# matching the call in the shortest-chain variant
current_distance, current_path, timer_dijkstra, timer_pathbuilding = current_graph.run_dijsktra(
    argv_start=str(starting_paper), argv_end=str(ending_paper), argv_path=True
    )
# print out the information
print("Distance: " + str(current_distance))
print("Path: " + str(current_path))

  paper1 paper2  reference  cite    impact
0     49     97         60     1  0.016667
1     44     63         91     1  0.010989
2     43     74         72     1  0.013889
3     59     60          1     1  1.000000
4      5     60          1     1  1.000000
{'min': 0.01, 'max': 1.0, 'mean': 0.07942728006357762, 'median': 0.022222222222222223, 'zero': 0}
Distance: 0.07999783559335212
Path: ['20_1', '52_1', '52_0', '89_0', '33_0', '50_0', '3_0']
