# Read and index Cran files

### This notebook reads in the Cran raw data and prepares the document and query files for processing. It is a one-off process; its output files are used for indexing and ranking in subsequent notebooks for different IR models.

STEPS:

##### 1. Read Cran documents raw data - "**cran.all.1400.xml**": This is the XML file containing all sample documents. 
##### 2. Split this file (of all documents) into individual XML files, one for each document, and save to a new folder.
##### 3. Retrieve all document titles, indexed by document number and save to CSV file (to be used for subsequent query/document indexation and ranking).
##### 4. Retrieve all document contents (main doc body), indexed by document number and save to CSV file (to be used for subsequent query/document indexation and ranking).
##### 5. Read Cran queries raw data - "**cran.qry.xml**": This is the XML file containing all sample queries. 
##### 6. Split this file (of all queries) into individual XML files, one for each query, and save to a new folder.
##### 7. Retrieve all query titles, indexed by query number and save to CSV file (to be used for subsequent query/document indexation and ranking).

### Imports and setup

In [None]:
import csv
import xml.etree.ElementTree as ET

In [None]:
# Mount Google Drive at /content/drive (Google Colab only) so that the
# Drive-hosted files can be reached through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

import os

Mounted at /content/drive


### Documents processing

##### Split master document XML file into individual document XML files

In [None]:
# Switch to the folder holding the original Cran files so that the input
# file can be opened by its bare file name.
os.chdir("/content/drive/MyDrive/CA6005I - Mechanics of Search/Assignment1/Files_Cran_Original")

In [None]:
# Master XML file containing every Cran document (resolved against the
# current working directory)
input_file = "cran.all.1400.xml"

# Closing tag that terminates each document; used as the split delimiter
split_tag = "</doc>"

# Load the whole file into one string
with open(input_file) as source:
    data = source.read()

In [None]:
# Split the raw text on the closing tag and trim the whitespace around each piece
documents = [piece.strip() for piece in data.split(split_tag)]

# str.split always returns one more piece than there are delimiters, so the
# text following the final closing tag (normally just a newline) comes back as
# an extra, empty entry. Drop empty pieces so they are not mistaken for
# documents: an empty piece would otherwise be written out as a file holding
# only the closing tag, which is not parseable XML.
documents = [doc for doc in documents if doc]

In [None]:
# Folder for output individual documents (a relative path, so it is resolved
# against the working directory set on the next statement)
output_dir = "Files_Individual_Docs"

# Move to the assignment root folder, the parent against which output_dir resolves
os.chdir("/content/drive/MyDrive/CA6005I - Mechanics of Search/Assignment1/")

In [None]:
# Make sure the destination folder exists: open(..., "w") creates the file
# but not any missing parent directory.
os.makedirs(output_dir, exist_ok=True)

# Write each document to a separate file, numbered from 1 in input order
for i, doc in enumerate(documents, start=1):
    # Create a filename for the output file
    output_filename = os.path.join(output_dir, f"document_{i}.xml")
    # Re-append the closing tag that split() consumed so the file holds a
    # complete element
    with open(output_filename, "w") as f:
        f.write(doc + split_tag)

##### Retrieve all document titles and write to a single CSV file indexed by document number: "**Indexed_Titles.csv**"

In [None]:
# Work inside the folder of per-document XML files, then list its contents
# (IPython shell escape) as a quick visual check.
os.chdir("/content/drive/MyDrive/CA6005I - Mechanics of Search/Assignment1/Files_Individual_Docs")
!ls

In [None]:
def natural_file_order(filename):
    """Return a sort key that orders file names by the number they contain.

    Plain string sorting would put ``document_10.xml`` before
    ``document_2.xml``; comparing the embedded number first avoids that.
    Names without any digit sort first; the name itself breaks ties.
    """
    digits = "".join(ch for ch in filename if ch.isdecimal())
    return (int(digits) if digits else -1, filename)


def clean_field(text):
    """Return *text* flattened to one line, or ``**NODATA**`` when it is empty.

    Newlines become spaces and every " ." sequence (the space-separated
    full stops used in the Cran text) is removed.
    """
    if not text:
        return "**NODATA**"
    return text.replace("\n", " ").replace(" .", "")


array_titles = []

# os.listdir() returns entries in arbitrary order, so sort them to make the
# row index reproducible between runs. Only .xml entries are parsed so that
# any other file in the folder cannot break the loop.
xml_files = sorted(
    (name for name in os.listdir() if name.lower().endswith(".xml")),
    key=natural_file_order,
)

for i, filename in enumerate(xml_files):
    print("Processing file... " + str(i + 1))
    root = ET.parse(filename).getroot()

    # findtext() returns None when the element is missing instead of raising
    docno = root.findtext('docno')
    title = clean_field(root.findtext('title'))

    # Row layout: [running index, document number, cleaned title]
    array_titles.append([i, docno, title])

In [None]:
# Display every collected row, one per line, for a visual check
for row in array_titles:
    print(row)

In [None]:
# Switch to the folder that collects the indexed CSV outputs
os.chdir("/content/drive/MyDrive/CA6005I - Mechanics of Search/Assignment1/Files_Indexed")

titles_csv_file = "Indexed_Titles.csv"

# Write all rows in one call; newline="" leaves line endings to the csv module
with open(titles_csv_file, mode="w", newline="") as out_handle:
    csv.writer(out_handle).writerows(array_titles)

##### Retrieve all document contents and write to a single CSV file indexed by document number: "**Indexed_Contents.csv**"

In [None]:
# Work inside the folder of per-document XML files, then list its contents
# (IPython shell escape) as a quick visual check.
os.chdir("/content/drive/MyDrive/CA6005I - Mechanics of Search/Assignment1/Files_Individual_Docs")
!ls

In [None]:
def natural_file_order(filename):
    """Return a sort key that orders file names by the number they contain.

    Plain string sorting would put ``document_10.xml`` before
    ``document_2.xml``; comparing the embedded number first avoids that.
    Names without any digit sort first; the name itself breaks ties.
    """
    digits = "".join(ch for ch in filename if ch.isdecimal())
    return (int(digits) if digits else -1, filename)


def clean_field(text):
    """Return *text* flattened to one line, or ``**NODATA**`` when it is empty.

    Newlines become spaces and every " ." sequence (the space-separated
    full stops used in the Cran text) is removed.
    """
    if not text:
        return "**NODATA**"
    return text.replace("\n", " ").replace(" .", "")


array_content = []

# os.listdir() returns entries in arbitrary order, so sort them to make the
# row index reproducible between runs. Only .xml entries are parsed so that
# any other file in the folder cannot break the loop.
xml_files = sorted(
    (name for name in os.listdir() if name.lower().endswith(".xml")),
    key=natural_file_order,
)

for i, filename in enumerate(xml_files):
    root = ET.parse(filename).getroot()

    # findtext() returns None when the element is missing instead of raising
    docno = root.findtext('docno')
    content = clean_field(root.findtext('text'))

    # Row layout: [running index, document number, cleaned body text]
    array_content.append([i, docno, content])

In [None]:
# Switch to the folder that collects the indexed CSV outputs
os.chdir("/content/drive/MyDrive/CA6005I - Mechanics of Search/Assignment1/Files_Indexed")

contents_csv_file = "Indexed_Contents.csv"

# Write all rows in one call; newline="" leaves line endings to the csv module
with open(contents_csv_file, mode="w", newline="") as out_handle:
    csv.writer(out_handle).writerows(array_content)

### Queries processing

##### Split master query XML file into individual query XML files

In [None]:
# Switch to the folder holding the original Cran files so that the query
# file can be opened by its bare file name.
os.chdir("/content/drive/MyDrive/CA6005I - Mechanics of Search/Assignment1/Files_Cran_Original")

In [None]:
# Master XML file containing every Cran query (resolved against the current
# working directory)
input_file = "cran.qry.xml"

# Closing tag that terminates each query; used as the split delimiter
split_tag = "</top>"

# Load the whole file into one string
with open(input_file) as source:
    data = source.read()

In [None]:
# Split the raw text on the closing tag and trim the whitespace around each piece
documents = [piece.strip() for piece in data.split(split_tag)]

# str.split always returns one more piece than there are delimiters, so the
# text following the final closing tag (normally just a newline) comes back as
# an extra, empty entry. Drop empty pieces so they are not mistaken for
# queries: an empty piece would otherwise be written out as a file holding
# only the closing tag, which is not parseable XML.
documents = [doc for doc in documents if doc]

In [None]:
# Make the folder for the individual query files the current working directory
os.chdir("/content/drive/MyDrive/CA6005I - Mechanics of Search/Assignment1/Files_Individual_Queries")

In [None]:
# Write each query to a separate file, numbered from 1 in input order.
# The files go straight into the current working directory, which is the
# individual-queries folder. The documents folder name must not be joined in
# here: it would send the query files to the wrong (or a missing) sub-folder.
for i, doc in enumerate(documents, start=1):
    # Create a filename for the output file
    output_filename = f"query_{i}.xml"
    # Re-append the closing tag that split() consumed so the file holds a
    # complete element
    with open(output_filename, "w") as f:
        f.write(doc + split_tag)

##### Retrieve all search queries and write to a single CSV file indexed by query number: "**Indexed_Queries.csv**"

In [None]:
# Work inside the folder of per-query XML files so they can be listed and
# parsed by bare file name.
os.chdir("/content/drive/MyDrive/CA6005I - Mechanics of Search/Assignment1/Files_Individual_Queries")

In [None]:
import xml.etree.ElementTree as ET

In [None]:
def natural_file_order(filename):
    """Return a sort key that orders file names by the number they contain.

    Plain string sorting would put ``query_10.xml`` before ``query_2.xml``;
    comparing the embedded number first avoids that. Names without any
    digit sort first; the name itself breaks ties.
    """
    digits = "".join(ch for ch in filename if ch.isdecimal())
    return (int(digits) if digits else -1, filename)


def clean_query_text(text):
    """Return *text* trimmed and flattened to a single line.

    Surrounding whitespace is stripped, newlines become spaces and every
    " ." sequence (the space-separated full stops used in the Cran text)
    is removed.
    """
    return text.strip().replace("\n", " ").replace(" .", "")


array_titles = []

# os.listdir() returns entries in arbitrary order. The query number is taken
# from the position in this list, so the files must be visited in numeric
# order for the number to match the query file. Only .xml entries are parsed.
query_files = sorted(
    (name for name in os.listdir() if name.lower().endswith(".xml")),
    key=natural_file_order,
)

for i, filename in enumerate(query_files):
    root = ET.parse(filename).getroot()

    # Query number is the 1-based position; it is an int, so it needs no
    # whitespace stripping.
    querynum = i + 1

    # findtext() returns None when the element is missing instead of raising
    title = root.findtext('title')
    if not title:
        title = "**NODATA**"
        print("Found no data for doc no: " + str(querynum))

    # Row layout: [running index, query number, cleaned query text]
    array_titles.append([i, querynum, clean_query_text(title)])

In [None]:
# Display every collected row, one per line, for a visual check
for row in array_titles:
    print(row)

In [None]:
# Switch to the folder that collects the indexed CSV outputs
os.chdir("/content/drive/MyDrive/CA6005I - Mechanics of Search/Assignment1/Files_Indexed")

titles_csv_file = "Indexed_Queries.csv"

# Write all rows in one call; newline="" leaves line endings to the csv module
with open(titles_csv_file, mode="w", newline="") as out_handle:
    csv.writer(out_handle).writerows(array_titles)