## Read .trec file for queries

In [1]:
!pip install openai -q

In [9]:
import xml.etree.ElementTree as ElementTree
import pandas as pd
import pyarrow.parquet as pq
import openai
import numpy as np
from getpass import getpass
from tenacity import retry, wait_random_exponential, stop_after_attempt

In [10]:
with open('irg_queries.trec', 'r') as qr:   # Reading file
    xml = qr.read()

root = ElementTree.fromstring(xml)

# Extract recordid and query text using list comprehensions
data_q = [(record_id.text, query.text) for doc in root for record_id in doc.findall("recordId") for query in doc.findall("text")]

# Create a dataframe from the extracted data
df_queries = pd.DataFrame(data_q, columns=["query_id", "query_text"])

# Display the dataframe
# print(df_queries)

In [4]:
## Read .trec file for collections

In [11]:
with open('irg_collection.trec', 'r') as c:   # Reading file
    xml = c.read()

root = ElementTree.fromstring(xml)
    
# Extract recordid and query text using list comprehensions
data_c = [(doc_id.text, doc_text.text) for doc in root for doc_id in doc.findall("recordId") for doc_text in doc.findall("text")]

# Create a dataframe from the extracted data
df_collection = pd.DataFrame(data_c, columns=["document_id", "document_text"])

# Display the dataframe
# print(df_collection)

In [6]:
openai.api_key = getpass()

 ········


In [7]:
@retry(wait=wait_random_exponential(min=1, max=10), stop=stop_after_attempt(6))
def get_embedding(text: str, model="text-embedding-ada-002") -> list[float]:
    return openai.Embedding.create(input=[text], model=model)["data"][0]["embedding"]


##### Below line of code should be executed only when embedding file doesn't exist (i.e., only for the first time).

In [8]:
# df_queries['embedding'] = df_queries['query_text'].apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))
# df_queries.to_csv("queries_embedding.csv")

In [12]:
queries_embedding = pd.read_csv("queries_embedding.csv")

In [13]:
queries_embedding['embedding'] = queries_embedding['embedding'].apply(eval).apply(np.array)

In [14]:
queries_embedding.head()

Unnamed: 0.1,Unnamed: 0,query_id,query_text,embedding
0,0,245,transistor phase splitting circuits,"[-0.028369968757033348, -0.0001554530754219740..."
1,1,891,the determination of the orbits of individual ...,"[-0.009570655412971973, -0.0001979938097065314..."
2,2,1217,"articles on text formatting systems, including...","[0.0027503534220159054, 0.0276944562792778, 0...."
3,3,1741,i am interested in cognitive models of library...,"[-0.033059824258089066, 0.024907903745770454, ..."
4,4,2405,memory management aspects of operating systems,"[-0.0075305369682610035, -0.014398602768778801..."


##### Below lines of codes should be executed only when embedding file doesn't exist (i.e., only for the first time).

In [15]:
## Split the collection dataframe to smaller chunks (speed and limitation of openai)
#items = np.array_split(df_collection, 1000)

In [16]:
#for index, item in enumerate(items):
#   embedding = item['document_text'].apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))
#   items[index] = item.assign(embedding=embedding)
#   print(index) #This line can be commented out but it is there to monitor the progress


In [17]:
## Extract the 'embedding' column from each dataframe
#embedding_column = pd.concat([df['embedding'] for df in items], ignore_index=True)

## Create a new dataframe with the extracted 'embedding' column
#df_embedding = pd.DataFrame({'embedding': embedding_column})

## Join the dataframes side by side
#collection_embedding = pd.concat([df_collection, df_embedding], axis=1)

In [18]:
#collection_embedding.to_csv("collection_embedding.csv")

In [19]:
collection_embedding = pd.read_csv("collection_embedding.csv")

In [20]:
collection_embedding['embedding'] = collection_embedding['embedding'].apply(eval).apply(np.array)

In [21]:
collection_embedding.head()

Unnamed: 0.1,Unnamed: 0,document_id,document_text,embedding
0,0,0,digital computers in universities (part i) jul...,"[-0.029409123584628105, -0.006393287796527147,..."
1,1,2,using synthetic images to register real images...,"[-0.01402338594198227, 0.0006011170335114002, ..."
2,2,4,training library assistants in mauritius. the ...,"[-0.027914943173527718, 0.00800692941993475, 0..."
3,3,5,the electrostatic centre of a conductor,"[-0.015851134434342384, 0.010691668838262558, ..."
4,4,7,a note on the interpretation of transient echo...,"[-0.02111559733748436, 0.00931366067379713, 0...."


Below is the code to write and read the embedding file to a parquet file.

In [22]:
#collection_embedding.to_parquet('collection_embedding.parquet.gzip', compression='gzip')

In [23]:
# test = pd.read_parquet('collection_embedding.parquet.gzip')

In [24]:
from openai.embeddings_utils import cosine_similarity

In [25]:
for i in range(len(queries_embedding)):
    col = "similarities_"+ str(queries_embedding.loc[i, "query_id"])
    #print(col)
    collection_embedding[col] = collection_embedding['embedding'].apply(lambda x: cosine_similarity(x, queries_embedding.loc[i, "embedding"]))


In [26]:
#del collection_embedding['Unnamed']
collection_embedding = collection_embedding.drop(collection_embedding.columns[[0]],axis=1) 
#df.drop(df.columns[[0, 1, 3]], axis=1) 

In [27]:
collection_embedding.head()

Unnamed: 0,document_id,document_text,embedding,similarities_245,similarities_891,similarities_1217,similarities_1741,similarities_2405,similarities_3777,similarities_4233,...,similarities_35491,similarities_35645,similarities_35727,similarities_37026,similarities_37202,similarities_37294,similarities_37511,similarities_37845,similarities_38367,similarities_39228
0,0,digital computers in universities (part i) jul...,"[-0.029409123584628105, -0.006393287796527147,...",0.748288,0.720514,0.704694,0.719002,0.713533,0.711794,0.738003,...,0.71314,0.72865,0.699461,0.743287,0.744689,0.749145,0.713676,0.741792,0.713558,0.757579
1,2,using synthetic images to register real images...,"[-0.01402338594198227, 0.0006011170335114002, ...",0.681608,0.73036,0.704362,0.712026,0.688932,0.720852,0.709531,...,0.686716,0.698559,0.692996,0.684337,0.698705,0.709403,0.740025,0.784974,0.71172,0.715872
2,4,training library assistants in mauritius. the ...,"[-0.027914943173527718, 0.00800692941993475, 0...",0.66872,0.680273,0.68007,0.756269,0.698548,0.689073,0.674318,...,0.689742,0.695682,0.668917,0.651526,0.681287,0.7494,0.702661,0.714444,0.661176,0.782521
3,5,the electrostatic centre of a conductor,"[-0.015851134434342384, 0.010691668838262558, ...",0.746736,0.718,0.659011,0.677857,0.683307,0.748782,0.704684,...,0.71593,0.682788,0.665518,0.741172,0.726771,0.692009,0.671068,0.71617,0.698576,0.685551
4,7,a note on the interpretation of transient echo...,"[-0.02111559733748436, 0.00931366067379713, 0....",0.707846,0.801214,0.676377,0.668582,0.678044,0.792348,0.711596,...,0.654211,0.68128,0.669563,0.698202,0.697437,0.678795,0.691707,0.796794,0.735329,0.676001


In [28]:
collection_embedding.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20636 entries, 0 to 20635
Data columns (total 53 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   document_id         20636 non-null  int64  
 1   document_text       20636 non-null  object 
 2   embedding           20636 non-null  object 
 3   similarities_245    20636 non-null  float64
 4   similarities_891    20636 non-null  float64
 5   similarities_1217   20636 non-null  float64
 6   similarities_1741   20636 non-null  float64
 7   similarities_2405   20636 non-null  float64
 8   similarities_3777   20636 non-null  float64
 9   similarities_4233   20636 non-null  float64
 10  similarities_4767   20636 non-null  float64
 11  similarities_5899   20636 non-null  float64
 12  similarities_6324   20636 non-null  float64
 13  similarities_7328   20636 non-null  float64
 14  similarities_8141   20636 non-null  float64
 15  similarities_9125   20636 non-null  float64
 16  simi

In [52]:
collection_embedding.to_csv("collection_embedding_similarities.csv")

In [29]:
import pandas as pd

In [30]:
collection_embedding = pd.read_csv("collection_embedding_similarities.csv")

In [31]:
collection_embedding.head()

Unnamed: 0.1,Unnamed: 0,document_id,document_text,embedding,similarities_245,similarities_891,similarities_1217,similarities_1741,similarities_2405,similarities_3777,...,similarities_35491,similarities_35645,similarities_35727,similarities_37026,similarities_37202,similarities_37294,similarities_37511,similarities_37845,similarities_38367,similarities_39228
0,0,0,digital computers in universities (part i) jul...,[-0.02940912 -0.00639329 -0.01865172 ... -0.02...,0.748288,0.720514,0.704694,0.719002,0.713533,0.711794,...,0.71314,0.72865,0.699461,0.743287,0.744689,0.749145,0.713676,0.741792,0.713558,0.757579
1,1,2,using synthetic images to register real images...,[-0.01402339 0.00060112 -0.00229915 ... -0.01...,0.681608,0.73036,0.704362,0.712026,0.688932,0.720852,...,0.686716,0.698559,0.692996,0.684337,0.698705,0.709403,0.740025,0.784974,0.71172,0.715872
2,2,4,training library assistants in mauritius. the ...,[-0.02791494 0.00800693 0.00613159 ... -0.02...,0.66872,0.680273,0.68007,0.756269,0.698548,0.689073,...,0.689742,0.695682,0.668917,0.651526,0.681287,0.7494,0.702661,0.714444,0.661176,0.782521
3,3,5,the electrostatic centre of a conductor,[-0.01585113 0.01069167 0.00420147 ... 0.02...,0.746736,0.718,0.659011,0.677857,0.683307,0.748782,...,0.71593,0.682788,0.665518,0.741172,0.726771,0.692009,0.671068,0.71617,0.698576,0.685551
4,4,7,a note on the interpretation of transient echo...,[-0.0211156 0.00931366 0.00497308 ... 0.00...,0.707846,0.801214,0.676377,0.668582,0.678044,0.792348,...,0.654211,0.68128,0.669563,0.698202,0.697437,0.678795,0.691707,0.796794,0.735329,0.676001


##### Below code is used to create a dataframe in the format that is request for trec_eval format.

In [32]:
# Create a list to store the temporary dataframes
temp_dfs = []

# Iterate over the columns starting with "similarities"
for column in collection_embedding.columns:
    if column.startswith('similarities'):
        # Extract the query_id from the column name
        query_id = int(column.split('_')[1])

        # Get the rank of the documents for each query_id
        rank = collection_embedding[column].rank(ascending=False).astype(int)

        # Create a temporary dataframe with query_id, document_id, rank, score, and name (mine)
        temp_df = pd.DataFrame({
            'Query.ID': query_id,
            'Iteration': str('Q0'),
            'Dok.Nummer': collection_embedding['document_id'],
            'Rang': rank-1,
            'Score': collection_embedding[column],
            'System': str('AK')
        })

        # Filter the temporary dataframe to include only the first 1000 document_id values
        #temp_df = temp_df[temp_df['Dok.Nummer'].isin(temp_df['Dok.Nummer'].unique()[:1000])]

        # Append the temporary dataframe to the list
        temp_dfs.append(temp_df)

# Concatenate the temporary dataframes into a single dataframe
trec_eval = pd.concat(temp_dfs, ignore_index=True)

# Sort the dataframe by 'Rang' in ascending order and Group by query id
df_sorted_grouped = trec_eval.sort_values(by='Rang').groupby('Query.ID')

# Initialize an empty list to store the results
result_frames = []

# Iterate over each group
for name, group in df_sorted_grouped:
    # Get the first 1000 rows and append them to the result list
    result_frames.append(group.iloc[:1000])

# Concatenate the result frames into a single dataframe
result_df = pd.concat(result_frames)

# Sort the result dataframe by 'Query.ID' and 'Rang'
result_df = result_df.sort_values(by=['Query.ID', 'Rang'])

#Reset the index of the results dataframe
trec_eval_final = result_df.reset_index(drop=True)

# Print the resulting dataframe
print(trec_eval_final)


       Query.ID Iteration  Dok.Nummer  Rang     Score System
0           245        Q0       34694     0  0.887807     AK
1           245        Q0       30014     1  0.879237     AK
2           245        Q0       38008     2  0.875611     AK
3           245        Q0       34877     3  0.868441     AK
4           245        Q0        9618     4  0.861605     AK
...         ...       ...         ...   ...       ...    ...
49995     39228        Q0        8378   995  0.770225     AK
49996     39228        Q0       33341   996  0.770224     AK
49997     39228        Q0       38560   997  0.770186     AK
49998     39228        Q0       16414   998  0.770161     AK
49999     39228        Q0       23291   999  0.770113     AK

[50000 rows x 6 columns]


In [33]:
#trec_eval.to_csv("trec_eval.csv", sep=" ")
trec_eval_final.to_csv('Rangliste.txt', 
          sep=" ",
          float_format='%3.3f',
          escapechar=" ",
          index=False)

In [34]:
# Below code makes the output text final cleaner and left alignment for each column
# Set the desired column width
#column_width = 10

# Convert the DataFrame to a string with left-aligned columns
#df_string = trec_eval_final.to_string(index=False, justify='left')

# Split the string by newline character to get each row
#rows = df_string.split('\n')

# Adjust the width and alignment for each column in each row
#formatted_rows = []
#for row in rows:
#    formatted_row = [column.ljust(column_width) for column in row.split()]
#    formatted_rows.append(' '.join(formatted_row))

# Join the formatted rows back into a single string
#formatted_df_string = '\n'.join(formatted_rows)

# Write the formatted DataFrame string to a text file
#with open('trec_eval_clean_format.txt', 'w') as file:
#    file.write(formatted_df_string)
