In [1]:
import json
import pandas as pd
import pyterrier as pt

In [2]:
# Paths to JSONL files
jsonl_path = "text_data_QVH/highlight_train_release.jsonl"
subs_path = "text_data_QVH/subs_train.jsonl"

In [3]:
subtitles_dict = {}

with open(subs_path, 'r') as subs_file:
    for line in subs_file:
        sub_data = json.loads(line)
        triple = sub_data['vid'].split("_")
        name = triple[0:-2]
        #turn the list name into a string
        name = "".join(name)
        if name not in subtitles_dict:
            subtitles_dict[name] = [(float(triple[-2]) + sub_data["relevant_windows"][0][0], float(triple[-2]) + sub_data["relevant_windows"][0][1], sub_data['query'])]
        else:
            subtitles_dict[name].append((float(triple[-2]) + sub_data["relevant_windows"][0][0], float(triple[-2]) + sub_data["relevant_windows"][0][1], sub_data['query']))

In [4]:
def parse_jsonl(jsonl_path):
    queries_data = []
    documents_data = []
    query_rankings_data = []
    with open(jsonl_path, 'r') as file:
        for idx,line in enumerate(file):

            # Load the JSON object from the line
            data = json.loads(line)

            triple = data["vid"].split("_")
            document_name = triple[0:-2]
            document_name = "".join(document_name)
            start_time = float(triple[-2])
            end_time = float(triple[-1])

            if document_name not in subtitles_dict:
                #print("Document not found in subtitles: ", document_name)
                continue

            queries_data.append({"qid" : data["qid"], "query": data["query"]})

            all_scores = []
            momentaneus_rank =[]
            for id,relevant_window in enumerate(data["relevant_windows"]):
                ts = [start_time+relevant_window[0], start_time+relevant_window[1]]
                subs = [sub for sub in subtitles_dict[document_name] if sub[0] <= ts[1] and ts[0] <= sub[1]]
                documents_data.append({"docno" : str(idx) +"_"+str(ts[0]) + "_" + str(ts[1]), "vid_name" : document_name, "ts": ts, "duration": data["duration"], "text": "".join([sub[2] for sub in subs])})
                scores = [data["saliency_scores"][i]  for i,clip_id in enumerate(data["relevant_clip_ids"]) if clip_id*2 >= relevant_window[0] and clip_id*2 <= relevant_window[1]]
                #each entry of scores is a triple of integers. Create a variable score which is the average of all the scores
                score = 0 if len(scores) ==0 else sum(sum(scores[i]) for i in range(len(scores)))/(3*len(scores))

                momentaneus_rank.append({"qid" : data["qid"], "query": data["query"] , "docno" : str(idx) +"_"+str(ts[0]) + "_" + str(ts[1]), "score": score, "rank":1})
            
            #adjust the rank of the momentaneus_rank based on the score
            momentaneus_rank = sorted(momentaneus_rank, key=lambda x: x["score"], reverse=True)
            for i in range(len(momentaneus_rank)):
                momentaneus_rank[i]["rank"] = i+1
        
            query_rankings_data.extend(momentaneus_rank)

    return queries_data, documents_data, query_rankings_data

In [5]:
queries_data, documents_data, query_rankings_data = parse_jsonl(jsonl_path)

# Convert lists to DataFrames (training)
query_set_df = pd.DataFrame(queries_data)
documents_set_df = pd.DataFrame(documents_data)
query_rankings_df = pd.DataFrame(query_rankings_data)

train_query_set_df = query_set_df.sample(frac=0.8)
val_query_set_df = query_set_df.drop(train_query_set_df.index)

train_query_rankings_df = query_rankings_df[query_rankings_df["qid"].isin(train_query_set_df["qid"])]
val_query_rankings_df = query_rankings_df[query_rankings_df["qid"].isin(val_query_set_df["qid"])]

train_documents_set_df = documents_set_df[documents_set_df["docno"].isin(train_query_rankings_df["docno"])]
val_documents_set_df = documents_set_df[documents_set_df["docno"].isin(val_query_rankings_df["docno"])]


In [6]:
#print the length of the dataframes
print(len(train_query_set_df))
print(len(train_documents_set_df))
print(len(train_query_rankings_df))

print(len(val_query_set_df))
print(len(val_documents_set_df))
print(len(val_query_rankings_df))


4445
7942
7942
1111
1940
1940


In [7]:
if not pt.started():
    pt.init()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [8]:
# Create an index

indexer = pt.IterDictIndexer(
    "./indexQVH_path/",
    meta={
        "docno": 16,
        "vid_name": 64,
        "text": 131072,
    },
    stemmer="porter",
    stopwords="terrier",
    overwrite=True,
    type=pt.index.IndexingType.MEMORY,
)

In [9]:
indexed = indexer.index(pd.concat([train_documents_set_df, val_documents_set_df]).to_dict('records'))

In [10]:
# Initialize BatchRetrieve with the created index and specify BM25 as the weighting model
first_stage_bm25 = pt.BatchRetrieve(
    indexed,
    wmodel="BM25",
    num_results=3,
    metadata=["docno", "vid_name", "text"]
)

In [11]:
# features, can use any of the features in the list

pl2_retriever = pt.BatchRetrieve(indexed, wmodel="PL2")
dph_retriever = pt.BatchRetrieve(indexed, wmodel="DPH")
#tf_idf_retriever = pt.BatchRetrieve(indexed, wmodel="TF_IDF")
#bb2_retriever = pt.BatchRetrieve(indexed, wmodel="BB2")

In [12]:
# build a pipeline with the features
pipeline_with_features = ~first_stage_bm25 >> (
    pl2_retriever ** dph_retriever
)

In [13]:
# Prepare the queries for the pipeline, remove special characters and extra spaces
prepared_queries = train_query_set_df
prepared_queries['qid'] = prepared_queries['qid'].astype(str)
prepared_queries['query'] = prepared_queries['query'].str.replace('[\'"?!]', ' ', regex=True)
prepared_queries['query'] = prepared_queries['query'].str.replace(r'[^\w\s]', ' ', regex=True)
prepared_queries['query'] = prepared_queries['query'].str.replace(r'\s+', ' ', regex=True).str.strip()

prepared_rankings = train_query_rankings_df.reset_index()
prepared_rankings['qid'] = prepared_rankings['qid'].astype(str)
prepared_rankings['docno'] = prepared_rankings['docno'].astype(str)
prepared_rankings['query'] = prepared_rankings['query'].str.replace('[\'"?!]', ' ', regex=True)
prepared_rankings['query'] = prepared_rankings['query'].str.replace(r'[^\w\s]', ' ', regex=True)
prepared_rankings['query'] = prepared_rankings['query'].str.replace(r'\s+', ' ', regex=True).str.strip()

prepared_rankings['label'] = prepared_rankings['score']

prepared_rankings['label'] = prepared_rankings['label'].astype(int)

prepared_valqueries = val_query_set_df.reset_index()
prepared_valqueries['qid'] = prepared_valqueries['qid'].astype(str)
prepared_valqueries['query'] = prepared_valqueries['query'].str.replace('[\'"?!]', ' ', regex=True)
prepared_valqueries['query'] = prepared_valqueries['query'].str.replace(r'[^\w\s]', ' ', regex=True)

prepared_valqueries['query'] = prepared_valqueries['query'].str.replace(r'\s+', ' ', regex=True).str.strip()

prepared_valrankings = val_query_rankings_df.reset_index()
prepared_valrankings['qid'] = prepared_valrankings['qid'].astype(str)
prepared_valrankings['docno'] = prepared_valrankings['docno'].astype(str)
prepared_valrankings['query'] = prepared_valrankings['query'].str.replace('[\'"?!]', ' ', regex=True)
prepared_valrankings['query'] = prepared_valrankings['query'].str.replace(r'[^\w\s]', ' ', regex=True)
prepared_valrankings['query'] = prepared_valrankings['query'].str.replace(r'\s+', ' ', regex=True).str.strip()

prepared_valrankings['label'] = prepared_valrankings['score']
# make the label an int
prepared_valrankings['label'] = prepared_valrankings['label'].astype(int)

In [14]:
from sklearn.svm import SVR

index = pt.IndexFactory.of("./indexQVH_path/")

# load results from csv
# results_with_features = pd.read_csv('/content/drive/MyDrive/IR/results_with_features.csv')
fsr_pipelines = [
    pipeline_with_features
]

learned_models = [
    SVR()
]

In [15]:
trained_models = [first_stage_bm25]
names = ['BM25']

# ltr_svm = ~pipeline_with_features >> pt.ltr.apply_learned_model(SVR())

# ltr_svm.fit(
#     prepared_queries,
#     train_query_rankings_df,
#     # train_documents_set_df
# )

for fsr_pipeline in fsr_pipelines:
    for model in learned_models:
        names.append(f"prova_{model.__class__.__name__}")
        pipe = fsr_pipeline >> pt.ltr.apply_learned_model(model)
        pipe.fit(
            prepared_queries,
            prepared_rankings
        )
        trained_models.append(pipe)




In [16]:
from pyterrier.measures import nDCG, RR, MAP

pt.Experiment(
    trained_models,
    prepared_valqueries,
    prepared_valrankings,
    names=['BM25', 'BM25 > SVR'],
    eval_metrics=[nDCG @ 3, RR @ 3, MAP],
)

Unnamed: 0,name,nDCG@3,RR@3,AP
0,BM25,0.05078,0.057006,0.046424
1,BM25 > SVR,0.044434,0.047255,0.037773


In [17]:

pt.Experiment(
    trained_models,
    prepared_queries,
    prepared_rankings,
    names=['BM25', 'BM25 > SVR'],
    eval_metrics=[nDCG @ 3, RR @ 3, MAP],
)

Unnamed: 0,name,nDCG@3,RR@3,AP
0,BM25,0.056367,0.06333,0.049986
1,BM25 > SVR,0.050684,0.054443,0.042648
