In [1]:
import json

with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [2]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

documents[1]

{'text': 'GitHub - DataTalksClub/data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [3]:
# Create coontent based ids 

import hashlib

def generate_document_id(doc):
    # combined = f"{doc['course']}-{doc['question']}"
    combined = f"{doc['course']}-{doc['question']}-{doc['text'][:10]}"
    # creates an MD5 hash object using the hashlib.md5() function; 
    # combined.encode() method converts the combined string into bytes, which is required by the md5 function.
    hash_object = hashlib.md5(combined.encode())
    # generates the hexadecimal representation of the MD5 hash.
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex[:8]
    return document_id


In [4]:
for doc in documents:
    doc['id'] = generate_document_id(doc)

In [5]:
documents[1]

{'text': 'GitHub - DataTalksClub/data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp',
 'id': '1f6520ca'}

If we don’t include text in our hashes we can see that they are not unique 

In [6]:
from collections import defaultdict

In [7]:
def generate_document_id_reduced(doc):
   
    combined = f"{doc['course']}-{doc['question']}"

    # creates an MD5 hash object using the hashlib.md5() function; 
    # combined.encode() method converts the combined string into bytes, which is required by the md5 function.
    hash_object = hashlib.md5(combined.encode())
    # generates the hexadecimal representation of the MD5 hash.
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex[:8]
    return document_id

for doc in documents:
    doc['id'] = generate_document_id_reduced(doc)

hashes = defaultdict(list)

for doc in documents:
    doc_id = doc['id']
    hashes[doc_id].append(doc)


In [8]:
len(doc_id) == len(hashes)

False

In [9]:

for k, values in hashes.items():
    if len(values) > 1:
        print(k, len(values))

ca3dc12d 2
960fb254 2
67d2f21c 2
297f443c 2


We have collisions due to duplicate questions

In [10]:
hashes["ca3dc12d"]

[{'text': 'This error comes up on the Spark video 5.3.1 - First Look at Spark/PySpark,\nbecause as at the creation of the video, 2021 data was the most recent which utilised csv files but as at now its parquet.\nSo when you run the command spark.createDataFrame(df1_pandas).show(),\nYou get the Attribute error. This is caused by the pandas version 2.0.0 which seems incompatible with Spark 3.3.2, so to fix it you have to downgrade pandas to 1.5.3 using the command pip install -U pandas==1.5.3\nAnother option is adding the following after importing pandas, if one does not want to downgrade pandas version (source) :\npd.DataFrame.iteritems = pd.DataFrame.items\nNote that this problem is solved with Spark versions from 3.4.1',
  'section': 'Module 5: pyspark',
  'question': "AttributeError: 'DataFrame' object has no attribute 'iteritems'",
  'course': 'data-engineering-zoomcamp',
  'id': 'ca3dc12d'},
 {'text': 'Another alternative is to install pandas 2.0.1 (it worked well as at the time of

Save the indexed docs 