In [None]:
# Mount Google Drive when running on Colab. Where the google.colab package is
# not importable (e.g. Kaggle) the mount is skipped automatically, so this cell
# no longer has to be commented out by hand.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [None]:
import json
import time

import faiss
import numpy as np
import pandas as pd

### Comment out the next two lines if using Kaggle
from google.colab import userdata
import google.generativeai as genai

# Load data

In [None]:
# Metadata fields kept for every paper (also the DataFrame column order).
cols = ['id', 'authors', 'doi', 'title', 'abstract']
file_name = '/content/drive/MyDrive/VT/Information Storage Project Files/arxiv-metadata-oai-snapshot.json'
### File name (Kaggle only)
#file_name = '/kaggle/working/arxiv-metadata-oai-snapshot.json'

# The snapshot is read as one JSON object per line. JSON text is UTF-8, so it
# is decoded as such (latin-1 never fails but silently turns every non-ASCII
# character into mojibake); errors='replace' keeps the load from aborting on a
# stray undecodable byte.
with open(file_name, encoding='utf-8', errors='replace') as f:
    data = [[doc[col] for col in cols] for doc in map(json.loads, f)]

df_data = pd.DataFrame(data=data, columns=cols)

print(df_data.shape)

df_data.head()

(2436004, 5)


Unnamed: 0,id,authors,doi,title,abstract
0,704.0001,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",10.1103/PhysRevD.76.013009,Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...
1,704.0002,Ileana Streinu and Louis Theran,,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-..."
2,704.0003,Hongjun Pan,,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...
3,704.0004,David Callan,,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...
4,704.0005,Wael Abu-Shammala and Alberto Torchinsky,,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...


In [None]:
# Text that gets embedded for each paper: the title and the abstract, each
# introduced by a literal "{title}: " / " {abstract}: " marker.
title_part = '{title}: ' + df_data['title']
abstract_part = ' {abstract}: ' + df_data['abstract']
df_data['prepared_text'] = title_part + abstract_part
df_data.head()

Unnamed: 0,id,authors,doi,title,abstract,prepared_text
0,704.0001,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",10.1103/PhysRevD.76.013009,Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...,{title}: Calculation of prompt diphoton produc...
1,704.0002,Ileana Streinu and Louis Theran,,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-...",{title}: Sparsity-certifying Graph Decompositi...
2,704.0003,Hongjun Pan,,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...,{title}: The evolution of the Earth-Moon syste...
3,704.0004,David Callan,,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...,{title}: A determinant of Stirling cycle numbe...
4,704.0005,Wael Abu-Shammala and Alberto Torchinsky,,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...,{title}: From dyadic $\Lambda_{\alpha}$ to $\L...


In [None]:
# Full prepared text of the first paper, as a sanity check of the format.
df_data.loc[0, 'prepared_text']

'{title}: Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies {abstract}:   A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with data from the Fermilab Tevatron, and predictions are made for\nmore detailed tests with CDF and DO data. Predictions are shown for\ndistributions of diphoton pairs produced at the energy of the Large Hadron\nCollider (LHC). Distributions of the diphoton pairs from the decay of a Higgs\nboson are contrasted with those produced from QCD processes at the 

# Embed data

In [None]:
# Never hard-code the API key in the notebook: it leaks with every copy of the
# file. It is read from Colab's secret store instead (add a secret named
# GOOGLE_API_KEY in the Colab "Secrets" panel and grant this notebook access).
# NOTE(review): the key that was previously written in this cell has been
# exposed and should be revoked.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)


In [None]:
# Number of papers (taken from the top of df_data) that are embedded.
N_DOCS = 100000

# perf_counter is monotonic, so the measured duration cannot be skewed by
# wall-clock adjustments during this long-running call.
start = time.perf_counter()

result = genai.embed_content(
    model="models/embedding-001",
    content=df_data['prepared_text'].iloc[:N_DOCS].tolist(),
    task_type="retrieval_document",
    title="Embedding of list of paper abstracts")

# Build the float32 matrix directly instead of materialising a float64 copy
# first; float32 is what the Faiss index is fed with.
embeds = np.asarray(result['embedding'], dtype='float32')

end = time.perf_counter()
print(end-start) #40mins

In [None]:
# (number of embedded documents, embedding dimension). The embedding cell
# stores its result in `embeds`; `embeds_annoy` is not defined anywhere in
# this notebook and raised a NameError on a fresh kernel.
embeds.shape

(100000, 768)

# Use Faiss to store the embeddings

In [None]:
start = time.time()

# Flat inner-product index wrapped in an IndexIDMap so every vector carries an
# explicit id. Both the dimension and the ids are derived from `embeds` itself,
# so the index always matches the matrix that is actually added.
n_vectors, dim = embeds.shape
index = faiss.IndexIDMap(faiss.IndexFlatIP(dim))
# The ids are the row positions in df_data (get_top_results passes them to
# .iloc); Faiss expects them as int64.
index.add_with_ids(embeds, np.arange(n_vectors, dtype='int64'))

path = '/content/drive/MyDrive/VT/Information Storage Project Files/indices/gemini_faiss_100000.index'

# The printed duration covers building the index only, not writing it to disk.
end = time.time()
print(end-start)
# Save index
faiss.write_index(index, path)

# Load Faiss indices

In [None]:
# Reload the index from the same location it was written to, so searches can
# run without recomputing the embeddings.
path = ('/content/drive/MyDrive/VT/Information Storage Project Files/'
        'indices/gemini_faiss_100000.index')
index = faiss.read_index(path)

# Get top k results

In [None]:
def get_top_results(query, k):
  """Return the indexed papers that best match a free-text query.

  The query is embedded with the "models/embedding-001" model (task type
  "retrieval_query") and looked up in the global Faiss `index`; the ids that
  come back are used as row positions into the global `df_data`.

  Args:
    query: Search text to embed.
    k: Number of neighbours requested from the index.

  Returns:
    A DataFrame with the columns 'id', 'texts', 'authors', 'doi' and
    'distance', in the order returned by the index search and labelled with
    the matching `df_data` row labels. 'distance' is the score reported by
    `index.search`; for the inner-product index built in this notebook a
    larger value means a closer match. Fewer than k rows are returned when
    the index cannot supply k results.
  """
  query_embed = genai.embed_content(
      model="models/embedding-001",
      content=query,
      task_type="retrieval_query")

  query_vector = np.asarray([query_embed['embedding']], dtype='float32')
  scores, ids = index.search(query_vector, k)
  # Only one query vector was searched, so keep the first (only) result row.
  scores, ids = scores[0], ids[0]

  # Faiss pads the result with id -1 when it finds fewer than k vectors; drop
  # those slots, otherwise .iloc[-1] would silently return the last paper.
  found = ids >= 0
  scores, ids = scores[found], ids[found]

  hits = df_data.iloc[ids]
  query_results = pd.DataFrame(data={'id': hits['id'],
                                     'texts': hits['prepared_text'],
                                     'authors': hits['authors'],
                                     'doi': hits['doi'],
                                     'distance': scores})

  return query_results

In [None]:
# Smoke test: the ten best matches for a short keyword query.
get_top_results(query="quantum computer", k=10)

Unnamed: 0,id,texts,authors,doi,distance
201,704.0202,{title}: Towards Minimal Resources of Measurem...,Simon Perdrix,10.1088/1367-2630/9/6/206,0.679549
322,704.0323,{title}: General sequential quantum cloning {a...,"Gui-Fang Dang, and Heng Fan",10.1088/1751-8113/41/15/155303,0.659433
41,704.0042,"{title}: General System theory, Like-Quantum S...",Ignazio Licata,,0.645335
481,704.0482,{title}: Implementation of holonomic quantum c...,"Zhang-qi Yin, Fu-li Li, Peng Peng",10.1103/PhysRevA.76.062311,0.643945
33,704.0034,{title}: Origin of adaptive mutants: a quantum...,Vasily Ogryzko,,0.643015
267,704.0268,{title}: Automated Generation of Layout and Co...,"Mark Whitney, Nemanja Isailovic, Yatish Patel,...",,0.642517
408,704.0409,{title}: On the over-barrier reflection in qua...,"D.G. Levkov, A.G. Panin, S.M. Sibiryakov",10.1103/PhysRevA.76.032114,0.623184
277,704.0278,{title}: q-Deformed spin foam models of quantu...,"Igor Khavkine, J. Daniel Christensen",10.1088/0264-9381/24/13/009,0.623143
419,704.042,{title}: The Hourglass - Consequences of Pure ...,Donald McCartor,,0.618922
45,704.0046,{title}: A limit relation for entropy and chan...,"I. Csiszar, F. Hiai and D. Petz",10.1063/1.2779138,0.612362


# Evaluation

In [None]:
# Evaluation set: one row per paper together with its two questions
# (columns 'question1' and 'question2').
eval_data = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/VT/Information Storage Project Files/evaluation_data_500.csv'
)

In [None]:
# Peek at the first five evaluation rows.
eval_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,id,authors,doi,title,abstract,question1,question2
0,0,704.0001,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",10.1103/PhysRevD.76.013009,Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...,- Calculation of diphoton production at hadron...,- Comparison of diphoton production from Higgs...
1,1,704.0002,Ileana Streinu and Louis Theran,,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-...","- How to characterize $(k,\ell)$-sparse graphs?","- How does the $(k,\ell)$-pebble game with col..."
2,2,704.0003,Hongjun Pan,,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...,- How has the Earth-Moon system evolved accord...,- What is the significance of the Roche's limi...
3,3,704.0004,David Callan,,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...,- How can acyclic single-source automata be en...,- What is the significance of Stirling cycle n...
4,4,704.0005,Wael Abu-Shammala and Alberto Torchinsky,,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...,- How to compute the $\Lambda_{\alpha}$ norm u...,- How are the Hardy spaces $H^p(R^N)$ describe...


In [None]:
# First question of the first evaluation row.
eval_data.loc[0, 'question1']

'- Calculation of diphoton production at hadron colliders'

In [None]:
# Ten best matches for that question; the paper from the same evaluation row
# is expected to show up among them.
get_top_results(query=eval_data.loc[0, 'question1'], k=10)

Unnamed: 0,id,texts,authors,doi,distance
0,704.0001,{title}: Calculation of prompt diphoton produc...,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",10.1103/PhysRevD.76.013009,0.793048
293,704.0294,{title}: QED x QCD Resummation and Shower/ME M...,B.F.L. Ward and S.A. Yost,,0.683034
105,704.0106,{title}: Multiple Parton Scattering in Nuclei:...,"Andreas Schafer, Xin-Nian Wang and Ben-Wei Zhang",10.1016/j.nuclphysa.2007.06.009,0.671896
317,704.0318,{title}: Effects of Dirac sea on pion propagat...,Subhrajyoti Biswas and Abhee K. Dutt-Mazumder,10.1103/PhysRevC.77.045201,0.671161
36,704.0037,{title}: The discrete dipole approximation for...,"Maxim A. Yurkin, Valeri P. Maltsev, Alfons G. ...",10.1016/j.jqsrt.2007.01.033,0.667122
425,704.0426,{title}: Feedback from first radiation sources...,"Leonid Chuzhoy, Michael Kuhlen, Paul R. Shapiro",10.1086/521438,0.662461
495,704.0496,{title}: Fusion process studied with preequili...,"C\'edric Simenel (SPhN, GANIL), Philippe Choma...",10.1103/PhysRevC.76.024609,0.661005
32,704.0033,{title}: Convergence of the discrete dipole ap...,"Maxim A. Yurkin, Valeri P. Maltsev, Alfons G. ...",10.1364/JOSAA.23.002578 10.1364/JOSAA.32.002407,0.660479
135,704.0136,{title}: Compounding Fields and Their Quantum ...,Zihua Weng,,0.658491
151,704.0152,{title}: Kinetic equation for finite systems o...,"V.I. Abrosimov, D.M. Brink, A.Dellafiore, F. M...",10.1016/j.nuclphysa.2007.11.009,0.657107


## top k = 1 results

In [None]:
k = 1
true_false = []
# Run both question columns of every evaluation row through the retriever. A
# query counts as a hit when the id stored in the same eval_data row is among
# the top-k result ids.
for question_col in ('question1', 'question2'):
  for i in range(len(eval_data)):
    try:
      question = eval_data[question_col][i]
      correct_answer = eval_data['id'][i]
      results = get_top_results(question, k)
      # Result ids are converted to float before the membership test --
      # presumably because read_csv parsed eval_data['id'] as floats; verify.
      result_ids = [float(j) for j in results['id'].values]
      true_false.append(correct_answer in result_ids)
      print(i)
    except Exception as exc:
      # A failing query is reported together with its cause and left out of
      # true_false. Catching Exception instead of using a bare except keeps
      # a kernel interrupt (KeyboardInterrupt) working.
      print(i, " - error", repr(exc))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [None]:
# True only when every scored query retrieved its paper.
False not in true_false

False

In [None]:
# Share of scored queries whose paper was retrieved.
true_false.count(True) / len(true_false)

0.9914529914529915

In [None]:
# Number of queries that were actually scored. Queries that raised inside the
# evaluation loop only print an error and append nothing, so this can be
# smaller than two questions per evaluation row.
len(true_false)

936

## top k = 3 results

In [None]:
k = 3
true_false = []
# Run both question columns of every evaluation row through the retriever. A
# query counts as a hit when the id stored in the same eval_data row is among
# the top-k result ids.
for question_col in ('question1', 'question2'):
  for i in range(len(eval_data)):
    try:
      question = eval_data[question_col][i]
      correct_answer = eval_data['id'][i]
      results = get_top_results(question, k)
      # Result ids are converted to float before the membership test --
      # presumably because read_csv parsed eval_data['id'] as floats; verify.
      result_ids = [float(j) for j in results['id'].values]
      true_false.append(correct_answer in result_ids)
      print(i)
    except Exception as exc:
      # A failing query is reported together with its cause and left out of
      # true_false. Catching Exception instead of using a bare except keeps
      # a kernel interrupt (KeyboardInterrupt) working.
      print(i, " - error", repr(exc))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [None]:
# True only when every scored query retrieved its paper.
False not in true_false

False

In [None]:
# Share of scored queries whose paper was retrieved.
true_false.count(True) / len(true_false)

0.9989316239316239

## top k = 5 results

In [None]:
k = 5
true_false = []
# Run both question columns of every evaluation row through the retriever. A
# query counts as a hit when the id stored in the same eval_data row is among
# the top-k result ids.
for question_col in ('question1', 'question2'):
  for i in range(len(eval_data)):
    try:
      question = eval_data[question_col][i]
      correct_answer = eval_data['id'][i]
      results = get_top_results(question, k)
      # Result ids are converted to float before the membership test --
      # presumably because read_csv parsed eval_data['id'] as floats; verify.
      result_ids = [float(j) for j in results['id'].values]
      true_false.append(correct_answer in result_ids)
      print(i)
    except Exception as exc:
      # A failing query is reported together with its cause and left out of
      # true_false. Catching Exception instead of using a bare except keeps
      # a kernel interrupt (KeyboardInterrupt) working.
      print(i, " - error", repr(exc))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [None]:
# True only when every scored query retrieved its paper.
False not in true_false

True

In [None]:
# Share of scored queries whose paper was retrieved.
true_false.count(True) / len(true_false)

1.0