# HW4 - Implement an Inverted Index#

### Before You Start
For this problem set, you should download INF0202-HW4.ipynb from bCourses. Create a local copy of the notebook and rename it LASTNAME_FIRSTNAME-HW4.ipynb. Then edit your renamed file directly in your browser by typing:
```
jupyter notebook <name_of_downloaded_file>
```

In [260]:
import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt
from sklearn import feature_extraction

from nltk import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
from itertools import chain
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split

In [289]:
courses= pd.read_csv("courses_list_final.csv")

#courses['Title'] = courses['Title'].dropna()

courses.head()
len(courses)

2215

In [290]:
title_list= courses['Title'].tolist()
title_list[:5]

['General Game Playing',
 'Big Data Integration and Processing',
 'Windows Server Management and Security',
 'Computational Photography',
 'Localization Essentials']

In [263]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

ps = PorterStemmer() #for stemming - taking care of mistakes etc.
stopWords = set(stopwords.words('english')) #set of stop words 

#making inverted index for titles - we can't apply cosine similarity due to less number of items- 
#cosine similarity does not gave very low similarity indexes
def makeInvertedIndex(strlist):
    inverted_index = {}
    for index, item in enumerate(strlist):
        words = item.split(' ') #['hello','world']
        for word in words:
            if word not in stopWords:#taking care of stop words
                word = ps.stem(word)#Doing stemming
                doc_set = inverted_index.get(word.lower(), set())
                doc_set.add(index)
                inverted_index[word.lower()] = doc_set
            
    return inverted_index

inverted_index = makeInvertedIndex(title_list)

In [264]:
def orSearch(invertedIndex, query):
    result_set= set()
    for item in query:
        if item not in stopWords:
            item = ps.stem(item)
            doc_set = invertedIndex.get(item.lower(), set())
            result_set = result_set.union(doc_set)
        
    return result_set

orSearch(inverted_index, ['machine', 'structure'])

{47,
 71,
 125,
 162,
 200,
 221,
 277,
 303,
 309,
 322,
 353,
 356,
 412,
 434,
 436,
 447,
 486,
 511,
 515,
 577,
 605,
 624,
 643,
 651,
 655,
 659,
 664,
 684,
 722,
 734,
 750,
 792,
 824,
 874,
 893,
 896,
 906,
 942,
 956,
 989,
 1015,
 1041,
 1110,
 1153,
 1154,
 1178,
 1207,
 1236,
 1249,
 1253,
 1313,
 1328,
 1338,
 1342,
 1359,
 1374,
 1418,
 1448,
 1462,
 1504,
 1506,
 1521,
 1537,
 1588,
 1614,
 1635,
 1646,
 1711,
 1731,
 1798,
 1807,
 1823,
 1834,
 1902,
 1950,
 1951,
 1963,
 1973,
 2012,
 2034,
 2055,
 2062,
 2085,
 2116,
 2139,
 2144,
 2148,
 2178,
 2201}

In [265]:
def andSearch(invertedIndex, query):
    result_set= set()
    for item in query:
        if item in stopWords:
            query.remove(item)
    
    for index,item in enumerate(query):
        item = ps.stem(item)
        doc_set = invertedIndex.get(item.lower(), set())
        if index == 0:
            result_set = doc_set
        else:
            result_set = result_set.intersection(doc_set)
        
    return result_set


andSearch(inverted_index, ['machine', 'structure'])
#courses.loc[684]

{684}

In [266]:
def search_input(inverted_index, query):
    words = word_tokenize(query)
    
    doc_ids = list(andSearch(inverted_index, words))
    doc_ids.extend(list(orSearch(inverted_index, words)))
    
    search_results = []
    for id in doc_ids:
        course_info = courses.loc[id]
        course_dict = course_info.to_dict()
        course_converted_dict = {}
        for key, value in course_dict.items():
            try:
                course_converted_dict[key] = np.asscalar(value)
            except AttributeError:
                course_converted_dict[key] = value
        search_results.append(course_converted_dict)
    #right now- our results is dispalyed in a manner that lists 'and' queries first followed by "or" queries 
    #this is useful for presenting ranked response
        
    return search_results
    
search_results = search_input(inverted_index, 'machine structures')
print(len(search_results))

90


In [267]:
# Filter search_results by ShortDuration
def filter_result_by_shortduration(search_results):
    filtered_search_results = []
    for search_result in search_results:
        if search_result['f:ShortDuration']:
            filtered_search_results.append(search_result)
    return filtered_search_results

#filtered_search_results = filter_result_by_shortduration(search_results)
#print(filtered_search_results)


In [268]:
# Filter search_results by free courses
def filter_result_by_free_courses(search_results):
    filtered_search_results = []
    for search_result in search_results:
        if search_result['f:Free']:
            filtered_search_results.append(search_result)
    return filtered_search_results
#filtered_search_results = filter_result_by_shortduration(search_results)
#print(len(filtered_search_results))

In [269]:
# Filter search_results by ShortDuration
def filter_result_by_highworkload(search_results):
    filtered_search_results = []
    for search_result in search_results:
        if search_result['f:HighWorkload']:
            filtered_search_results.append(search_result)
    return filtered_search_results


In [270]:
# Filter search_results by enrollment
def filter_result_by_enrollment(search_results):
    filtered_search_results = []
    for search_result in search_results:
        if search_result['f:Enroll']:
            filtered_search_results.append(search_result)
    return filtered_search_results


In [271]:
# Filter search_results by self pace
def filter_result_by_selfpace(search_results):
    filtered_search_results = []
    for search_result in search_results:
        if search_result['f:Self Paced Courses']:
            filtered_search_results.append(search_result)
    return filtered_search_results


In [272]:
# Filter search_results by rating
def filter_result_by_highrating(search_results):
    filtered_search_results = []
    for search_result in search_results:
        if search_result['f:HighRating']:
            filtered_search_results.append(search_result)
    return filtered_search_results


In [273]:
# Filter search_results by enrollment
def filter_result_by_language(search_results, language='English'):
    filtered_search_results = []
    for search_result in search_results:
        if search_result['f:%s' %(language)]:
            filtered_search_results.append(search_result)
    return filtered_search_results


In [274]:
# Filter search_results by institute and provider
def filter_result_by_institute(search_results, institute='Coursera'):
    filtered_search_results = []
    for search_result in search_results:
        if search_result['f:Offered by %s' %(institute)]:
            filtered_search_results.append(search_result)
    return filtered_search_results

In [275]:
#Filter search_results by paid certifcation
def filter_result_by_paid_certification(search_results):
    filtered_search_results = []
    for search_result in search_results:
        if search_result['f:PaidCertification']:
            filtered_search_results.append(search_result)
    return filtered_search_results

In [276]:
search_results = search_input(inverted_index, 'github machine')
search_results = filter_result_by_highworkload(search_results)
print(len(search_results))
print(search_results)


21
[{'Title': 'Machine Learning Fundamentals', 'Subject': 'Machine Learning', 'Summary': 'Do you want to build systems that learn from experience? Or exploit data to create simple predictive models of the world? In this course, part of the Data Science MicroMasters program, you will learn a variety of supervised and unsupervised learning algorithms, and the theory behind those algorithms. Using real-world case studies, you will learn how to classify images, identify salient topics in a corpus of documents, partition people according to personality profiles, and automatically capture the semantic structure of words and use it to categorize documents. Armed with the knowledge from this course, you will be able to analyze many different types of data and to build descriptive and predictive models. All programming examples and assignments will be in Python, using Jupyter notebooks.', 'Description': 'Understand machine learning’s role in data-driven modeling, prediction, and decision-making

In [277]:
search_results = filter_result_by_paid_certification(search_results)
print(len(search_results))
print(search_results)


7
[{'Title': 'Neural Networks for Machine Learning', 'Subject': 'Machine Learning', 'Summary': nan, 'Description': "Learn about artificial neural networks and how they're being used for machine learning, as applied to speech and object recognition, image segmentation, modeling language and human motion, etc. We'll emphasize both the basic algorithms and the practical tricks needed to get them to work well. This course contains the same content presented on Coursera beginning in 2013. It is not a continuation or update of the original course. It has been adapted for the new platform. Please be advised tha...", 'Provider': 'Coursera', 'Instructors': 'Geoffrey Hinton', 'Institute': 'University of Toronto', 'Reviews': "This is one of those chance in a lifetime courses you have to get to learn from the greats. Geoffrey Hinton was one of the most important and influential researchers to work on artificial intelligence and neural nets back in the 80's. Currently he is working with Google in t

In [278]:
search_results = filter_result_by_institute(search_results, 'Coursera')
print(len(search_results))
print(search_results)
#To query with - edX, Udacity, FutureLearn,Stanford University,NPTEL,gacco,Microsoft,Georgia Institute of Technology,Massachusetts Institute of Technology
#To query with -'University of California, Berkeley', 'University of Washington' , 'University of Michigan','Google', 

6
[{'Title': 'Neural Networks for Machine Learning', 'Subject': 'Machine Learning', 'Summary': nan, 'Description': "Learn about artificial neural networks and how they're being used for machine learning, as applied to speech and object recognition, image segmentation, modeling language and human motion, etc. We'll emphasize both the basic algorithms and the practical tricks needed to get them to work well. This course contains the same content presented on Coursera beginning in 2013. It is not a continuation or update of the original course. It has been adapted for the new platform. Please be advised tha...", 'Provider': 'Coursera', 'Instructors': 'Geoffrey Hinton', 'Institute': 'University of Toronto', 'Reviews': "This is one of those chance in a lifetime courses you have to get to learn from the greats. Geoffrey Hinton was one of the most important and influential researchers to work on artificial intelligence and neural nets back in the 80's. Currently he is working with Google in t

In [279]:
search_results = filter_result_by_shortduration(search_results)
print(len(search_results))
print(search_results)


2
[{'Title': 'Applying Machine Learning to your Data with GCP', 'Subject': 'Machine Learning', 'Summary': nan, 'Description': 'Want to know how to query and process petabytes of data in seconds? Curious about data analysis that scales automatically as your data grows? Welcome to the Data Insights course! This 1-week, accelerated online course teaches participants how to derive insights through data analysis and visualization using the Google Cloud Platform. The course features interactive scenarios and hands-on labs where participants explore, mine, load, visualize, and extract insights from diverse Google BigQuery ...', 'Provider': 'Coursera', 'Instructors': 'Google Cloud Training', 'Institute': 'Google Cloud', 'Reviews': nan, 'ReviewCount': 0, 'Rating': 0.0, 'Session Start Date': '1st Jan, 2018', 'Course Pace': 'Upcoming', 'Language': 'English', 'Duration (weeks)': 0, 'Commitment': 7, 'Certification': 'Paid Certificate Available', 'Price': ' Free Online Course (Audit) ', 'URL': 'http

In [280]:
search_results = filter_result_by_free_courses(search_results)
print(len(search_results))
print(search_results)

2
[{'Title': 'Applying Machine Learning to your Data with GCP', 'Subject': 'Machine Learning', 'Summary': nan, 'Description': 'Want to know how to query and process petabytes of data in seconds? Curious about data analysis that scales automatically as your data grows? Welcome to the Data Insights course! This 1-week, accelerated online course teaches participants how to derive insights through data analysis and visualization using the Google Cloud Platform. The course features interactive scenarios and hands-on labs where participants explore, mine, load, visualize, and extract insights from diverse Google BigQuery ...', 'Provider': 'Coursera', 'Instructors': 'Google Cloud Training', 'Institute': 'Google Cloud', 'Reviews': nan, 'ReviewCount': 0, 'Rating': 0.0, 'Session Start Date': '1st Jan, 2018', 'Course Pace': 'Upcoming', 'Language': 'English', 'Duration (weeks)': 0, 'Commitment': 7, 'Certification': 'Paid Certificate Available', 'Price': ' Free Online Course (Audit) ', 'URL': 'http

In [281]:
search_results = filter_result_by_language(search_results,'English')
print(len(search_results))
print(search_results)

2
[{'Title': 'Applying Machine Learning to your Data with GCP', 'Subject': 'Machine Learning', 'Summary': nan, 'Description': 'Want to know how to query and process petabytes of data in seconds? Curious about data analysis that scales automatically as your data grows? Welcome to the Data Insights course! This 1-week, accelerated online course teaches participants how to derive insights through data analysis and visualization using the Google Cloud Platform. The course features interactive scenarios and hands-on labs where participants explore, mine, load, visualize, and extract insights from diverse Google BigQuery ...', 'Provider': 'Coursera', 'Instructors': 'Google Cloud Training', 'Institute': 'Google Cloud', 'Reviews': nan, 'ReviewCount': 0, 'Rating': 0.0, 'Session Start Date': '1st Jan, 2018', 'Course Pace': 'Upcoming', 'Language': 'English', 'Duration (weeks)': 0, 'Commitment': 7, 'Certification': 'Paid Certificate Available', 'Price': ' Free Online Course (Audit) ', 'URL': 'http

In [282]:
search_results = filter_result_by_enrollment(search_results)
print(len(search_results))
print(search_results)

0
[]


In [283]:
search_results = filter_result_by_selfpace(search_results)
print(len(search_results))
print(search_results)

0
[]


In [284]:
search_results = filter_result_by_highrating(search_results)
print(len(search_results))
print(search_results)

0
[]


In [285]:
import json
def clean_json(search_results):
    # converting our output to json format

    # Removing f: keys for cleaner display to %%HTML
    clean_search_results = []
    for item_dict in search_results:
        new_item_dict = {}
        for key,value in item_dict.items():
            if 'f:' not in key:
                new_item_dict[key] = value
            
        clean_search_results.append(new_item_dict)

    json_str = json.dumps(clean_search_results)
    return json_str
    

In [286]:
print(clean_json(search_results))

[]


In [287]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def cosine_similarity_description(doc_id=0, total_ranks=5):
    tfidf = TfidfVectorizer().fit_transform(courses['Description'].values.astype('U'))
    cosine_similarities = cosine_similarity(tfidf[doc_id], tfidf).flatten()
    
    total_ranks = int(0 - total_ranks - 1)
    most_similar_courses = cosine_similarities.argsort()[:total_ranks:-1]

    search_results=[]
    for id in most_similar_courses:
        course_info = courses.loc[id]
        course_dict = course_info.to_dict()
        course_converted_dict = {}
        for key, value in course_dict.items():
            try:
                course_converted_dict[key] = np.asscalar(value)
            except AttributeError:
                course_converted_dict[key] = value
        search_results.append(course_converted_dict)

    return search_results

search_results = cosine_similarity_description(0,5)
print(search_results)
print(len(search_results))

[{'Title': 'General Game Playing', 'Subject': 'Algorithms and Data Structures', 'Summary': nan, 'Description': 'Learn about General Game Playing (GGP) and develop GGP programs capable of competing against humans and other programs in GGP competitions .', 'Provider': 'Coursera', 'Instructors': 'Michael Genesereth', 'Institute': 'Stanford University', 'Reviews': nan, 'ReviewCount': 2, 'Rating': 5.0, 'Session Start Date': '28th Mar, 2016', 'Course Pace': 'Finished', 'Language': 'English', 'Duration (weeks)': 8, 'Commitment': 12, 'Certification': 'Certificate Available', 'Price': ' Free Online Course (Audit) ', 'URL': 'https://www.coursera.org/browse?source=deprecated_spark_cdp?siteID=SAyYsTvLiGQ-diCC35.6s0AbxQpugorlhg&utm_content=10&utm_medium=partners&utm_source=linkshare&utm_campaign=SAyYsTvLiGQ', 'f:ShortDuration': 0, 'f:HighWorkload': 1, 'f:PaidCertification': False, 'f:HighRating': 1, 'f:Self Paced Courses': False, 'f:Free': True, 'f:Enroll': False, 'f:Italian': False, 'f:Spanish': F

In [288]:
print(clean_json(search_results))

[{"Title": "General Game Playing", "Subject": "Algorithms and Data Structures", "Summary": NaN, "Description": "Learn about General Game Playing (GGP) and develop GGP programs capable of competing against humans and other programs in GGP competitions .", "Provider": "Coursera", "Instructors": "Michael Genesereth", "Institute": "Stanford University", "Reviews": NaN, "ReviewCount": 2, "Rating": 5.0, "Session Start Date": "28th Mar, 2016", "Course Pace": "Finished", "Language": "English", "Duration (weeks)": 8, "Commitment": 12, "Certification": "Certificate Available", "Price": " Free Online Course (Audit) ", "URL": "https://www.coursera.org/browse?source=deprecated_spark_cdp?siteID=SAyYsTvLiGQ-diCC35.6s0AbxQpugorlhg&utm_content=10&utm_medium=partners&utm_source=linkshare&utm_campaign=SAyYsTvLiGQ"}, {"Title": "Learn to Program: Crafting Quality Code", "Subject": "Programming", "Summary": NaN, "Description": "Not all programs are created equal. In this course, we'll focus on writing quali