In [1]:
#%pip install numpy==1.25 pandas scikit-learn matplotlib seaborn jsonlines pydot tqdm jupyter ipywidgets widgetsnbextension pandas-profiling umap-learn

In [1]:
import pandas as pd
import json

DATA_DIR = 'data'


def _read_id_to_text(path):
    """Read a JSON Lines file into a ``{_id: text}`` dictionary.

    Every non-blank line must hold a JSON object with ``_id`` and ``text``
    keys. Blank lines (e.g. a trailing newline at the end of the file) are
    skipped instead of crashing ``json.loads``.

    The file is decoded as UTF-8 explicitly so that the result does not depend
    on the platform's default encoding.
    """
    with open(path, 'r', encoding='utf-8') as f:
        records = (json.loads(line) for line in f if line.strip())
        return {record['_id']: record['text'] for record in records}


# Load the data from files
corpus_data = _read_id_to_text(f'{DATA_DIR}/corpus.jsonl')
queries_data = _read_id_to_text(f'{DATA_DIR}/queries.jsonl')

train_data = pd.read_csv(f'{DATA_DIR}/task1_train.tsv', delimiter='\t')
test_data = pd.read_csv(f'{DATA_DIR}/task1_test.tsv', delimiter='\t')

# Rename corpus-id to document_id and query-id to query_id in both train and test data
ID_COLUMNS = {'corpus-id': 'document_id', 'query-id': 'query_id'}
train_data = train_data.rename(columns=ID_COLUMNS)
test_data = test_data.rename(columns=ID_COLUMNS)

# Make sure that the document_id and query_id are int64 in BOTH frames, so
# train and test join against the id columns of other frames the same way.
for frame in (train_data, test_data):
    for id_column in ID_COLUMNS.values():
        frame[id_column] = frame[id_column].astype('int64')

In [2]:
# Turn the {document_id: text} mapping into a two-column frame: the dict keys
# become the index, which is named and then moved out into a regular column.
corpus_df = (
    pd.DataFrame.from_dict(corpus_data, orient='index', columns=['text'])
    .rename_axis('document_id')
    .reset_index()
)

In [3]:
# Preview the first rows of the corpus frame (document_id, text).
corpus_df.head()

Unnamed: 0,document_id,text
0,1867825,"After the invention of the cotton gin, cotton ..."
1,419610,"Timer has separate night and day outlets, whic..."
2,4614226,The rose-buying public still encounters a wide...
3,4108603,Map of Wendover (Aut) Airport. A detailed map ...
4,3744854,And as the poems Reapers and Cotton Song indic...


In [4]:
# Turn the {query_id: text} mapping into a two-column frame: the dict keys
# become the index, which is named and then moved out into a regular column.
queries_df = (
    pd.DataFrame.from_dict(queries_data, orient='index', columns=['text'])
    .rename_axis('query_id')
    .reset_index()
)

In [5]:
# Preview the first rows of the queries frame (query_id, text).
queries_df.head()

Unnamed: 0,query_id,text
0,1185869,)what was the immediate impact of the success ...
1,1185868,_________ justice is designed to repair the ha...
2,597651,what color is amber urine
3,403613,is autoimmune hepatitis a bile acid synthesis ...
4,1183785,elegxo meaning


In [6]:
# Fit a TF-IDF vectorizer (English stop words removed) on the corpus texts and
# keep the resulting sparse document-term matrix.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')
corpus_texts = corpus_df['text']
corpus_tfidf = tfidf.fit_transform(corpus_texts)

# Wrap the sparse matrix in a DataFrame (one integer-labelled column per
# feature) and append the document ids, cast to int64, as the last column.
corpus_doc_ids = corpus_df['document_id'].astype('int64')
corpus_tfidf_df = pd.DataFrame.sparse.from_spmatrix(corpus_tfidf)
corpus_tfidf_df['document_id'] = corpus_doc_ids
corpus_tfidf_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,597615,597616,597617,597618,597619,597620,597621,597622,597623,document_id
0,0.0,0.10364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1867825
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,419610
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4614226
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4108603
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3744854


In [7]:
# Project the queries with the vectorizer fitted on the corpus (transform only,
# so queries and documents share the same feature columns).
queries_texts = queries_df['text']
queries_tfidf = tfidf.transform(queries_texts)

# Wrap the sparse matrix in a DataFrame (one integer-labelled column per
# feature) and append the query ids, cast to int64, as the last column.
queries_int_ids = queries_df['query_id'].astype('int64')
queries_tfidf_df = pd.DataFrame.sparse.from_spmatrix(queries_tfidf)
queries_tfidf_df['query_id'] = queries_int_ids
queries_tfidf_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,597615,597616,597617,597618,597619,597620,597621,597622,597623,query_id
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1185869
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1185868
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,597651
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,403613
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1183785


In [19]:
# Attach the document TF-IDF columns (joined on document_id) and then the query
# TF-IDF columns (joined on query_id) to every training pair.
# NOTE(review): both feature frames carry one column per TF-IDF feature, so this
# join is extremely wide; the saved run of this cell ended in a
# KeyboardInterrupt. Re-running it also merges into the already-merged
# train_data, because the result is stored back under the same name.
train_data = (
    train_data
    .merge(corpus_tfidf_df, on='document_id')
    .merge(queries_tfidf_df, on='query_id')
)
train_data.head()

KeyboardInterrupt: 

In [5]:
import lightgbm as lgb
import numpy as np
from scipy import sparse
from tqdm.notebook import tqdm
# train_test_split is no longer used here (the split must keep each query's
# candidates together); the import is kept in case a later cell relies on it.
from sklearn.model_selection import GroupShuffleSplit, train_test_split

# Row position of every id inside the TF-IDF matrices. The matrices were built
# from frames that follow the insertion order of queries_data / corpus_data, so
# enumerating the dict keys gives the matching row numbers. Building the lookup
# once replaces the per-row `list(keys()).index(...)` scan.
query_row_of = {str(key): row for row, key in enumerate(queries_data)}
doc_row_of = {str(key): row for row, key in enumerate(corpus_data)}

# LightGBM's ranker needs all rows of one query to be contiguous, so sort the
# (query, document, score) pairs by query id before building features.
# The id columns are named query_id / document_id after the renaming done when
# the TSV files were loaded.
pairs = (
    train_data[['query_id', 'document_id', 'score']]
    .sort_values('query_id', kind='stable')
    .reset_index(drop=True)
)

# A missing id raises KeyError here instead of silently producing a bad row.
query_rows = [
    query_row_of[str(query_id)]
    for query_id in tqdm(pairs['query_id'], total=len(pairs), desc='query rows')
]
doc_rows = [
    doc_row_of[str(document_id)]
    for document_id in tqdm(pairs['document_id'], total=len(pairs), desc='document rows')
]

# Feature vector = [query TF-IDF | document TF-IDF], kept sparse. Densifying
# every pair (as `.toarray()` did) needs one float per feature for each side of
# every training row and does not fit in memory for this vocabulary size.
features = sparse.hstack(
    [queries_tfidf[query_rows], corpus_tfidf[doc_rows]], format='csr'
)
# NOTE(review): LGBMRanker expects integer relevance labels — confirm that the
# `score` column of the training TSV is integral.
labels = pairs['score'].to_numpy()
query_ids = pairs['query_id'].to_numpy()

# Split the data into training and validation sets by query, so that no query
# has its candidates spread over both sets.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, valid_idx = next(splitter.split(features, labels, groups=query_ids))
# Keep the sorted-by-query order inside each subset.
train_idx, valid_idx = np.sort(train_idx), np.sort(valid_idx)

X_train, X_valid = features[train_idx], features[valid_idx]
y_train, y_valid = labels[train_idx], labels[valid_idx]

# Number of rows per query, in the same (ascending query id) order as the rows.
train_group_sizes = np.unique(query_ids[train_idx], return_counts=True)[1]
valid_group_sizes = np.unique(query_ids[valid_idx], return_counts=True)[1]

# Initialize and train a ranking model using LightGBM
ranker = lgb.LGBMRanker()
ranker.fit(
    X_train,
    y_train,
    group=train_group_sizes,
    eval_set=[(X_valid, y_valid)],
    eval_group=[valid_group_sizes],
)



0it [00:00, ?it/s]

: 