In [1]:
# Input: raw Wikipedia CSV that the build loop streams row by row.
# NOTE(review): all three paths are absolute and machine-specific.
RAW_FILE = "/home/uyen/workspace/nlp_project/data/zac-data/wikipedia_20220620_cleaned_v2.csv"
# Passed to tqdm as `total`; it only affects the progress bar.
# NOTE(review): the recorded run ends at 1944406/1944407, so this looks one
# too high (possibly a header line was counted) -- confirm.
TOTAL_CHUNK = 1944407
# Output: JSONL file of {'id', 'contents'} records written by the build loop.
BM25_FILE = "/home/uyen/workspace/nlp_project/data/cache/bm25_collection/wiki.jsonl"
# Output: SQLite database holding the `wiki` table (id, title, text).
COLLECTION_FILE = "/home/uyen/workspace/nlp_project/data/cache/database/wiki.db"

In [2]:
from datasets import load_dataset
# Load the raw CSV with streaming=True so rows are iterated lazily instead of
# being loaded up front; the single data file is read from the "train" split.
collection_dataset = load_dataset('csv', data_files=RAW_FILE, streaming=True)["train"]

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import sqlite3

class WikiDataset:
    """SQLite-backed store of wiki passages, looked up by integer id.

    Every row of the ``wiki`` table holds ``id``, ``title`` and ``text``.
    Writes are NOT committed automatically: call ``self.conn.commit()`` after
    a batch of ``add_row`` calls, otherwise pending rows are discarded when
    the connection is closed.
    """

    def __init__(self, db_path='wiki.db'):
        """Open the database at ``db_path`` and make sure the table exists.

        Args:
            db_path: Path of the SQLite file (created if missing), or
                ``':memory:'`` for a throwaway in-memory database.
        """
        self.conn = sqlite3.connect(db_path)
        self.cursor = self.conn.cursor()
        self._create_table()

    def _create_table(self):
        """Create the ``wiki`` table; does nothing if it already exists."""
        self.cursor.execute('''
            CREATE TABLE IF NOT EXISTS wiki (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT NOT NULL,
                text TEXT NOT NULL
            )
        ''')
        self.conn.commit()

    def add_row(self, index, title, text):
        """Store a passage under ``index``, replacing any row with that id.

        Using ``INSERT OR REPLACE`` keeps the build re-runnable: inserting an
        id that is already present overwrites the row instead of raising
        ``sqlite3.IntegrityError`` on the primary key.

        Args:
            index: Integer id of the row.
            title: Passage title (must not be ``None``).
            text: Passage body (must not be ``None``).

        Raises:
            sqlite3.IntegrityError: If ``title`` or ``text`` is ``None``.
        """
        self.cursor.execute('''
            INSERT OR REPLACE INTO wiki (id, title, text) VALUES (?, ?, ?)
        ''', (index, title, text))
        # Deliberately no commit here: committing per row is slow for bulk
        # loads, so the caller commits in batches.

    def __getitem__(self, index):
        """Return the row whose id equals ``index``.

        Ids are whatever ``add_row`` was given, so they need not be
        contiguous and ``len(self)`` is not an upper bound on valid ids.

        Returns:
            A dict with keys ``'index'``, ``'title'`` and ``'text'``.

        Raises:
            IndexError: If no row has that id.
        """
        self.cursor.execute('''
            SELECT id, title, text FROM wiki WHERE id = ?
        ''', (index,))
        row = self.cursor.fetchone()
        if row is None:
            raise IndexError("Index out of range")
        return {'index': row[0], 'title': row[1], 'text': row[2]}

    def __len__(self):
        """Return the number of rows currently in the ``wiki`` table."""
        self.cursor.execute('SELECT COUNT(*) FROM wiki')
        return self.cursor.fetchone()[0]

    def close(self):
        """Close the connection; commit first, pending writes are not saved."""
        self.conn.close()

# Open (or create) the SQLite collection at COLLECTION_FILE; the constructor
# also creates the `wiki` table if it does not exist yet.
wikidataset = WikiDataset(COLLECTION_FILE)

In [4]:
import jsonlines
from tqdm import tqdm

def valid_text(text):
    """Tell whether ``text`` holds usable content.

    Returns:
        ``False`` for ``None`` and for the empty string, ``True`` otherwise.
        Whitespace-only strings count as valid.
    """
    if text is None:
        return False
    return text != ""

# Build both artifacts in a single streaming pass over the raw CSV:
#   * the SQLite `wiki` table (title/text keyed by id), and
#   * the JSONL file of {'id', 'contents'} records.
# Rows lacking any required field are left out of both outputs, yet they still
# consume an `index`, so ids follow the raw row position and may have gaps.
with jsonlines.open(BM25_FILE, mode="w") as writer:
    for index, item in enumerate(tqdm(collection_dataset, total=TOTAL_CHUNK)):
        # Fields are checked lazily in this order; the first invalid one
        # short-circuits and the row is skipped.
        if not all(valid_text(item[field]) for field in ("title", "text", "bm25_text")):
            continue

        wikidataset.add_row(index=index, title=item["title"], text=item["text"])

        # Checkpoint the transaction whenever a kept row lands on a multiple
        # of 10,000 (index 0 included).
        if not index % 10000:
            wikidataset.conn.commit()

        writer.write({'id': index, 'contents': item['bm25_text']})

# Persist whatever was inserted after the last checkpoint.
wikidataset.conn.commit()

100%|█████████▉| 1944406/1944407 [01:25<00:00, 22612.25it/s]
