In [3]:
# ==========================================
# NEW CELLS - Add these before "Run the app"
# ==========================================

# Cell: Check if index files exist
import os
import glob

index_exists = os.path.exists('index.pkl') and os.path.exists('DL.pkl') and os.path.exists('id_to_title.pkl')

if index_exists:
    print("⚠️ Index files already exist!")
    print("If you want to regenerate them, delete the files first.")
    print("\nExisting files:")
    for f in ['index.pkl', 'DL.pkl', 'id_to_title.pkl']:
        if os.path.exists(f):
            print(f"  ✓ {f}")
else:
    print("No index files found. Run the next cell to generate them.")

# Cell: Install PySpark and dependencies (if not already installed)
!pip install -q pyspark
!pip install -q graphframes

# Cell: Upload your Python files
from google.colab import files

print("Please upload: main.py, preprocessing.py, config.py, inverted_index_colab.py, BM25.py, search_frontend.py")
uploaded = files.upload()

required_files = ['main.py', 'preprocessing.py', 'config.py', 'inverted_index_colab.py']
for f in required_files:
    if f in uploaded:
        print(f"✓ {f} uploaded")
    else:
        print(f"✗ {f} MISSING!")

# Cell: Run main.py to generate index
print("Starting index generation...")
print("This will take 5-10 minutes. Please be patient!\n")

%run main.py

print("\n" + "="*50)
print("Index generation complete!")
print("="*50)

# Cell: Verify generated files
required_files = {
    'index.pkl': 'Inverted index metadata',
    'DL.pkl': 'Document lengths',
    'id_to_title.pkl': 'Document ID to title mapping'
}

print("Checking generated files:\n")
all_good = True
for filename, description in required_files.items():
    if os.path.exists(filename):
        size = os.path.getsize(filename)
        print(f"✓ {filename} ({size:,} bytes) - {description}")
    else:
        print(f"✗ {filename} MISSING - {description}")
        all_good = False

bin_files = glob.glob('*_*.bin')
if bin_files:
    print(f"\n✓ Found {len(bin_files)} binary posting list files")
else:
    print(f"\n✗ No binary posting list files found!")
    all_good = False

if all_good:
    print("\n" + "="*50)
    print("✓ All index files ready!")
    print("="*50)
else:
    print("\n✗ Some files missing! Check errors above.")

No index files found. Run the next cell to generate them.
Please upload: main.py, preprocessing.py, config.py, inverted_index_colab.py, BM25.py, search_frontend.py


Saving BM25.py to BM25.py
Saving config.py to config.py
Saving inverted_index_colab.py to inverted_index_colab.py
Saving main.py to main.py
Saving pagerank.py to pagerank.py
Saving preprocessing.py to preprocessing.py
Saving queries_train.json to queries_train.json
Saving search_frontend.py to search_frontend.py
✓ main.py uploaded
✓ preprocessing.py uploaded
✓ config.py uploaded
✓ inverted_index_colab.py uploaded
Starting index generation...
This will take 5-10 minutes. Please be patient!

Spark: 4.0.1 | GraphFrames Python wrapper imported OK
Downloading data from bucket...
Reading data from: wikidumps/*
Saving DL.pkl...
Saving id_to_title.pkl...
Writing Index to Disk...
Index created and written successfully.

Index generation complete!
Checking generated files:

✓ index.pkl (349,206 bytes) - Inverted index metadata
✓ DL.pkl (5,895 bytes) - Document lengths
✓ id_to_title.pkl (19,634 bytes) - Document ID to title mapping

✓ Found 124 binary posting list files

✓ All index files ready!


In [4]:
# download nltk stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Install a particular version of `google-cloud-storage` because (oddly enough)
# the  version on Colab and GCP is old. A dependency error below is okay.
!pip install -q google-cloud-storage==1.43.0

In [6]:
# authenticate below for Google Storage access as needed
from google.colab import auth
auth.authenticate_user()

# Run the app

In [7]:
# you need to upload your implementation of search_app.py
import search_frontend as se

Inverted Index loaded successfully.
Successfully loaded DL.pkl
Successfully loaded id_to_title.pkl


In [8]:
# uncomment the code below and execute to reload the module when you make
# changes to search_frontend.py (after you upload again).
import importlib
importlib.reload(se)

Inverted Index loaded successfully.
Successfully loaded DL.pkl
Successfully loaded id_to_title.pkl


<module 'search_frontend' from '/content/search_frontend.py'>

In [31]:
# find Colab's public URL
from google.colab.output import eval_js
server_url = eval_js("google.colab.kernel.proxyPort(5000)")
print(f"""Test your search engine by navigating to
{server_url}search?query=hello+world
This URL is only accessible from the same browser session. In other words, this
will not be accessible from a different machine, browser, or incognito session.
""")

# Uncomment the following line of code to run the frontend in the main
# process and wait for HTTP requests (colab will hang). The debug parameter
# lets you see incoming requests and get debug print outs if exceptions occur.
se.run(debug=False, use_reloader=False)

# Alternatively, the next few lines run the frontend in a background process.
# Just don't forget to terminate the process when you update your search engine
# or want to reload it.
import multiprocessing, time
proc = multiprocessing.Process(target=se.run,
                               kwargs={"debug": True, "use_reloader": False,
                                       "host": "0.0.0.0", "port": 5000})
proc.start()

time.sleep(1) # give Flask time to boot

from google.colab.output import eval_js
server_url = eval_js("google.colab.kernel.proxyPort(5000)")

print(f"Open this URL:\n{server_url}/search?query=hello+world")

#correct url - https://5000-m-s-2m5wjknpsaj7p-c.us-central1-0.prod.colab.dev/search?query=hello+world

# Use proc.terminate() to stop the process

Test your search engine by navigating to
https://5000-m-s-2m5wjknpsaj7p-c.us-central1-0.prod.colab.devsearch?query=hello+world
This URL is only accessible from the same browser session. In other words, this
will not be accessible from a different machine, browser, or incognito session.

 * Serving Flask app 'search_frontend'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


 * Serving Flask app 'search_frontend'
 * Debug mode: on


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


Open this URL:
https://5000-m-s-2m5wjknpsaj7p-c.us-central1-0.prod.colab.dev/search?query=hello+world


In [30]:
proc.terminate()

# Testing your app

Once your app is running you can query it. You can simply do that by clicking on the URL printed above (the one looking like https://XXXXX-5000-colab.googleusercontent.com/search?query=hello+world or by issuing an HTTP request through code (from colab).

The code below shows how to issue a query from python. This is also how our testing code will issue queries to your search engine, so make sure to test your search engine this way after you deploy it to GCP and before submission. Command line instructions for deploying your search engine to GCP are available at `run_frontend_in_gcp.sh`. Note that we will not only issue training queries to your search engine, but also test queries, i.e. queries that you've never seen before.

In [32]:
import json

with open('queries_train.json', 'rt') as f:
  queries = json.load(f)

In [33]:
def average_precision(true_list, predicted_list, k=40):
    true_set = frozenset(true_list)
    predicted_list = predicted_list[:k]
    precisions = []
    for i,doc_id in enumerate(predicted_list):
        if doc_id in true_set:
            prec = (len(precisions)+1) / (i+1)
            precisions.append(prec)
    if len(precisions) == 0:
        return 0.0
    return round(sum(precisions)/len(precisions),3)

In [34]:
def precision_at_k(true_list, predicted_list, k):
    true_set = frozenset(true_list)
    predicted_list = predicted_list[:k]
    if len(predicted_list) == 0:
        return 0.0
    return round(len([1 for doc_id in predicted_list if doc_id in true_set]) / len(predicted_list), 3)
def recall_at_k(true_list, predicted_list, k):
    true_set = frozenset(true_list)
    predicted_list = predicted_list[:k]
    if len(true_set) < 1:
        return 1.0
    return round(len([1 for doc_id in predicted_list if doc_id in true_set]) / len(true_set), 3)
def f1_at_k(true_list, predicted_list, k):
    p = precision_at_k(true_list, predicted_list, k)
    r = recall_at_k(true_list, predicted_list, k)
    if p == 0.0 or r == 0.0:
        return 0.0
    return round(2.0 / (1.0/p + 1.0/r), 3)
def results_quality(true_list, predicted_list):
    p5 = precision_at_k(true_list, predicted_list, 5)
    f1_30 = f1_at_k(true_list, predicted_list, 30)
    if p5 == 0.0 or f1_30 == 0.0:
        return 0.0
    return round(2.0 / (1.0/p5 + 1.0/f1_30), 3)

assert precision_at_k(range(10), [1,2,3] , 2) == 1.0
assert recall_at_k(   range(10), [10,5,3], 2) == 0.1
assert precision_at_k(range(10), []      , 2) == 0.0
assert precision_at_k([],        [1,2,3],  5) == 0.0
assert recall_at_k(   [],        [10,5,3], 2) == 1.0
assert recall_at_k(   range(10), [],       2) == 0.0
assert f1_at_k(       [],        [1,2,3],  5) == 0.0
assert f1_at_k(       range(10), [],       2) == 0.0
assert f1_at_k(       range(10), [0,1,2],  2) == 0.333
assert f1_at_k(       range(50), range(5), 30) == 0.182
assert f1_at_k(       range(50), range(10), 30) == 0.333
assert f1_at_k(       range(50), range(30), 30) == 0.75
assert results_quality(range(50), range(5))  == 0.308
assert results_quality(range(50), range(10)) == 0.5
assert results_quality(range(50), range(30)) == 0.857
assert results_quality(range(50), [-1]*5 + list(range(5,30))) == 0.0


In [39]:
import requests
from time import time
# In GCP the public URL for your engine should look like this:
# url = 'http://35.232.59.3:8080'
# In colab, we are going to send HTTP requests to localhost (127.0.0.1)
# and direct them to port where the server is listening (5000).
url = 'http://127.0.0.1:5000'

qs_res = []
for q, true_wids in queries.items():
  duration, rq = None, None
  t_start = time()
  try:
    res = requests.get(url + '/search', {'query': q}, timeout=35)
    print("1")
    duration = time() - t_start
    if res.status_code == 200:
      print("2")
      pred_wids, _ = zip(*res.json())
      rq = results_quality(true_wids, pred_wids)
      print("3")
  except:
    print("failed")
    pass

  qs_res.append((q, duration, rq))

1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
failed
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3


In [38]:

# Print summary by Shay
print(f"\nProcessed {len(qs_res)} queries")
successful = sum(1 for _, d, r in qs_res if d is not None and r is not None)
print(f"Successful: {successful}")
if successful > 0:
    avg_duration = sum(d for _, d, _ in qs_res if d is not None) / successful
    avg_quality = sum(r for _, _, r in qs_res if r is not None) / successful
    print(f"Average duration: {avg_duration:.3f}s")
    print(f"Average quality: {avg_quality:.3f}")


Processed 30 queries
Successful: 29
Average duration: 0.007s
Average quality: 0.004
