# Part 1 : Index Construction

## Data Crawling

In [1]:
from googlesearch import search 
import tldextract  
import requests 
from bs4 import BeautifulSoup 
import csv   
from time import sleep 

def format_data(doc_id, query, website, content):
    return {
        "doc_id" : doc_id,
        "query" : query,
        "website" : website,
        "content" : content
    }

# to search 
queries = ["Computer Science", "IITs in India", "Cities of Chattisgarh"]

docs = []

#Loop for each query
for i in range(len(queries)):
    query = queries[i]
    # Loop for each search result
    count_files = 20
    urls = []
    for url in search(query,num=25, start= 1, stop=50, pause=2): 
        urls.append(url)
        
    for url in urls:
        print(url)
        
        # pdf url are not useful for us as we can not extract data 
        #from it and sometimes they can create problem for us that's why we
        #are removing them
        if url[-3:] == 'pdf':
            continue
        
        try:
            respons = requests.get(url, timeout=5) #Response for the current URL
        except:
            continue
        soup = BeautifulSoup(respons.content, 'html5lib')
        paras=soup.find_all('p')  #Finding all the paragraph within the response
        para = "" #Initializing a empty paragraph
        
        count = 2 #To insure that file must contains 2 paragraph
        flag = 0
        
        for p in paras:   # Loop though each paragraph in response and append the contnet of that paragraph to  "para" so that we can save it as a continue paragraph
            if p!="":
                para += p.get_text()
                count-=1
            if (count==0):
                flag=1
                break
                
        if(flag):  #Perfom writing to file onlw when there is something in para
            ext_url = tldextract.extract(url)  #parsing the URL 
            domin_name = ext_url.domain   #Get the domain name from the parsed URL so that we can use it as a fine name
            
            docs.append(format_data(doc_id = f'd{21-count_files}_q{i+1}',
                                            query = query,
                                            website=url,
                                            content=para
                                           )
                               )
            count_files-=1
        
        if count_files==0:
            break

https://www.britannica.com/science/computer-science
https://www.edx.org/course/subject/computer-science
https://www.khanacademy.org/computing/computer-science
https://www.coursera.org/browse/computer-science
https://www.internationalstudent.com/study-computer-science/what-is-computer-science/
https://www.topuniversities.com/courses/computer-science-information-systems/guide
https://www.youtube.com/watch/Tzl0ELY_TiM
https://www.youtube.com/watch?v=nDDRliaoswg
https://www.timeshighereducation.com/student/what-to-study/computer-science
https://www.timeshighereducation.com/student/subjects/what-can-you-do-computer-science-degree
https://arxiv.org/archive/cs
https://www.computerscience.org/
https://www.seas.harvard.edu/computer-science
https://engineering.buffalo.edu/computer-science-engineering.html
https://www.codecademy.com/learn/paths/computer-science
https://www.stevens.edu/schaefer-school-engineering-science/departments/computer-science
https://github.com/ossu/computer-science
https:/

## Inverted Index

Associate a collection of terms (lexicon) with the documents that contain those terms.
The data structure is much more dense than a Document Term Matrix.

In [2]:
# Collection of documents (corpus)

docs

[{'doc_id': 'd1_q1',
  'query': 'Computer Science',
  'website': 'https://www.britannica.com/science/computer-science',
  'content': 'Our editors will review what you’ve submitted and determine whether to revise the article.Computer science,  the study of computers and computing, including their theoretical and algorithmic foundations, hardware and software, and their uses for processing information. The discipline of computer science includes the study of algorithms and data structures, computer and network design, modeling data and information processes, and artificial intelligence. Computer science draws some of its foundations from mathematics and engineering and therefore incorporates techniques from areas such as queueing theory, probability and statistics, and electronic circuit design. Computer science also makes heavy use of hypothesis testing and experimentation during the conceptualization, design, measurement, and refinement of new algorithms, information structures, and co

In [3]:
# Gather the set of all unique terms

unique_terms = {term for doc in docs for term in doc["content"].split()}
unique_terms

{'process',
 '(local',
 'people',
 'headquarters',
 'lifeline',
 'review',
 'Mahanadi',
 'more',
 'systems',
 'thinking',
 'last',
 'purpose',
 'N.',
 'Sir',
 'harness',
 'MIT,',
 'successful',
 'care',
 'announced',
 'art,',
 'Plain',
 'University',
 'possibly',
 'Harvard,',
 'Obviously,',
 'train',
 'Watch',
 'His',
 'Act',
 'numbering',
 'Area',
 'bachelor’s',
 'immense',
 'Northern',
 'envisioned',
 'Via',
 'faculty',
 'UNESCO,',
 'System',
 'cities.',
 'situated',
 'evolving',
 'including',
 'encouraged',
 'Tourism',
 'population.',
 'November',
 'towns',
 'PIN',
 'Council),',
 'present',
 'world',
 'Raigarh,',
 'present,',
 'mandir,',
 'government',
 'airport',
 'Pandit',
 'algorithms,',
 'free',
 'Katni,',
 'learning',
 'respectively.',
 'COVID-19',
 'approximately',
 'human-computer',
 'dams,',
 'personnel',
 'twenty-three',
 'Building.',
 'Importance.',
 'while',
 'Hardware',
 'member',
 'International',
 'theories',
 'experts',
 'new',
 'Allahabad,',
 'artificial',
 'sophomor

In [4]:
# Construct an inverted index
# here as a Python dictionary for ease of interpretability

inverted_index = {}
for doc in docs:
    for term in doc["content"].split():
        if term in inverted_index:
            inverted_index[term].add(doc["doc_id"])
        else: inverted_index[term] = {doc["doc_id"]}

inverted_index

{'Our': {'d10_q3', 'd1_q1', 'd9_q3'},
 'editors': {'d10_q3', 'd1_q1', 'd9_q3'},
 'will': {'d10_q3',
  'd11_q1',
  'd15_q2',
  'd18_q2',
  'd1_q1',
  'd20_q2',
  'd2_q2',
  'd3_q1',
  'd6_q1',
  'd9_q3'},
 'review': {'d10_q3', 'd1_q1', 'd9_q3'},
 'what': {'d10_q3', 'd1_q1', 'd3_q1', 'd9_q3'},
 'you’ve': {'d10_q3', 'd1_q1', 'd9_q3'},
 'submitted': {'d10_q3', 'd1_q1', 'd9_q3'},
 'and': {'d10_q1',
  'd10_q3',
  'd11_q1',
  'd13_q1',
  'd14_q1',
  'd14_q2',
  'd15_q1',
  'd15_q2',
  'd16_q1',
  'd17_q3',
  'd18_q1',
  'd18_q2',
  'd18_q3',
  'd19_q1',
  'd19_q2',
  'd1_q1',
  'd1_q2',
  'd1_q3',
  'd20_q2',
  'd20_q3',
  'd2_q1',
  'd3_q1',
  'd3_q2',
  'd3_q3',
  'd4_q2',
  'd4_q3',
  'd5_q3',
  'd6_q1',
  'd6_q2',
  'd7_q1',
  'd7_q2',
  'd7_q3',
  'd8_q1',
  'd8_q2',
  'd9_q1',
  'd9_q2',
  'd9_q3'},
 'determine': {'d10_q3', 'd1_q1', 'd9_q3'},
 'whether': {'d10_q3', 'd1_q1', 'd9_q3'},
 'to': {'d10_q1',
  'd10_q2',
  'd10_q3',
  'd11_q1',
  'd12_q1',
  'd13_q1',
  'd14_q1',
  'd14_q2',
  

In [5]:
# saving the file containing the document ids and corresponding contents (along with the webpage link)
import json

with open("./docs.json", "w") as f:
    f.write(json.dumps(docs, indent=4))

In [6]:
# word with thier frequency in documents

freq = {}
for word in inverted_index:
    if len(inverted_index[word]) in freq:
        freq[len(inverted_index[word])].append(word)
    else:
        freq[len(inverted_index[word])] = [word]
freq

{3: ['Our',
  'editors',
  'review',
  'you’ve',
  'submitted',
  'determine',
  'whether',
  'revise',
  'including',
  'information',
  'its',
  'such',
  'theory,',
  'also',
  'new',
  'more',
  'Courses',
  'help',
  'but',
  'programs',
  'School',
  'systems',
  'computing',
  'others',
  'COVID-19',
  'learn',
  'code',
  'up',
  'find',
  'we',
  'University',
  'offered',
  'under',
  'research,',
  'into',
  'along',
  '6',
  'Education',
  'As',
  'year',
  'autonomous',
  'oversees',
  'seats',
  'all',
  '2021',
  'Council',
  'now',
  'were',
  'development',
  'after',
  'November',
  'Ministry',
  'Human',
  'Resource',
  'NIRF',
  'colleges',
  'country.',
  'formed',
  'least',
  'over',
  'towards',
  'out',
  'these',
  'Raipur',
  'largest'],
 10: ['will', 'science', 'India'],
 4: ['what',
  'science,',
  'study',
  'data',
  'college',
  'our',
  'it',
  'These',
  'education',
  'education,',
  'students',
  'technology',
  'linked',
  'through',
  'administrati

In [7]:
available_freq = sorted(freq)
available_freq

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 16,
 17,
 20,
 21,
 29,
 30,
 36,
 37,
 40]

In [8]:
mostly_used = []
count = 3
for i in range(len(available_freq)-1,-1,-1):
    for item in freq[available_freq[i]]:
        if count==0:
            break
        mostly_used.append(item)
        count-=1
    if count==0:
        break

mostly_used

['the', 'and', 'of']

In [9]:
least_used = []
count = 3
for i in range(0,len(available_freq)):
    for item in freq[available_freq[i]]:
        if count==0:
            break
        least_used.append(item)
        count-=1
    if count==0:
        break

least_used

['article.Computer', 'computers', 'computing,']