In [None]:
from bs4 import BeautifulSoup
import requests
from bs4.element import Comment
import re
from nltk.tokenize import word_tokenize
import csv
import json
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pickle
import joblib

In [None]:
# Site root of the English Wikipedia; page paths such as '/wiki/<title>' are appended to it.
base_url = 'https://en.wikipedia.org'

In [None]:
def get_links_for_url(url):
    """Return the internal wiki links found inside the paragraphs of a page.

    url: str, absolute URL of the page to fetch.

    Returns: list of href strings that start with '/wiki/', taken from the
    <a> tags inside <p> tags in document order (duplicates are kept), or
    None when the page could not be fetched or parsed.
    """
    url_title = url[url.rfind('wiki')+5:]
    print("reading page: " + url_title)
    try:
        # The request is made inside the try block so that connection errors
        # and timeouts produce None, exactly like parse errors do.
        page = requests.get(url, timeout=10)
        # An HTTP error (e.g. 404 for a missing article) counts as a failure;
        # callers treat None as "the url is not valid".
        page.raise_for_status()
        soup = BeautifulSoup(page.text, "html5lib")
        wiki_href = re.compile("^/wiki/")
        res_links = []
        for tag in soup.find_all('p'):
            for link in tag.find_all('a', attrs={'href': wiki_href}):
                res_links.append(link.get('href'))
        return res_links
    except Exception as e:
        print("Error loading page", e)
        return None
        

In [None]:
def get_text_for_url(url):
    """Return the cleaned text of the leading paragraphs of a page.

    url: str, absolute URL of the page to fetch.

    Returns: str made of at most six <p> paragraphs, each followed by one
    space. Paragraphs containing an <img> are skipped, newlines are removed
    and every span that opens with '(' or '[' and closes with the nearest
    ')' or ']' is dropped. Returns None when the page could not be fetched
    or parsed.
    """
    url_title = url[url.rfind('wiki')+5:]
    print("reading page: " + url_title)
    try:
        # The request is made inside the try block so that connection errors
        # and timeouts produce None instead of an uncaught exception.
        page = requests.get(url, timeout=10)
        soup = BeautifulSoup(page.text, "html5lib")
        full_text = ""
        num = 0
        # find_all('p') yields only <p> tags, so no further tag-name test is needed.
        for t in soup.find_all('p'):
            if num > 5:
                break
            if not t.find('img'):
                paragraph = str(t.getText().replace('\n', ''))
                # Raw string: the pattern is unchanged but no longer relies on
                # invalid string escapes such as "\(".
                full_text += re.sub(r"[\(\[].*?[\)\]]", "", paragraph) + " "
                num += 1
        return full_text
    except Exception as e:
        print("Error loading page", e)
        return None

In [None]:
# Example call: fetch the paragraph links of one article and display them.
links = get_links_for_url(base_url + '/wiki/Computer_science')
links

In [None]:
# Example call: fetch the cleaned paragraph text of the same article and display it.
text = get_text_for_url(base_url + '/wiki/Computer_science')
text

In [None]:
def reduce(u, v):
    """Merge two (set, str) pairs: union of the sets, concatenation of the strings."""
    left_links, left_text = u[0], u[1]
    right_links, right_text = v[0], v[1]
    merged_links = left_links.union(right_links)
    merged_text = left_text + right_text
    return (merged_links, merged_text)

def clean_text(text):
    """Tokenize text into lowercase words.

    Every character other than an ASCII letter or a space is first replaced
    by a space; the lowercased result is split with nltk's word_tokenize.
    Returns None when text is None.
    """
    if text is not None:
        letters_only = re.sub('[^a-zA-Z ]', ' ', text)
        return word_tokenize(letters_only.lower())
    return None


def clean_text_for_vectorizer(text):
    """Return text lowercased, with every character other than an ASCII
    letter or a space replaced by a space. Returns None when text is None.
    """
    if text is not None:
        return re.sub('[^a-zA-Z ]', ' ', text).lower()
    return None

In [None]:
def generate_representation_for_course_description(course_description, branching_factor, search_depth, representation):
    '''
    Build one combined representation for a course by crawling the wikipedia
    article of every term of its description.

    course_description: [str], terms of the description; each term is used as a
    wikipedia article title (e.g. "Computer_science")
    branching_factor: float, in range(0, 1), what percentage of the links we want to traverse at each step
    search_depth: the maximum depth of link traversals starting from an original word in the course_description
    representation: unused; kept so that existing calls keep working

    Returns: (links, representation)
    links: set of the '/wiki/...' urls that were explored
    representation: str, the text collected from those pages
    '''
    res = (set(), "")
    # A single shared set, so an article reachable from several terms is crawled only once.
    explored_urls = set()
    for word in course_description:
        url = '/wiki/' + word
        res = reduce(res, generate_representation_for_word(explored_urls, url, branching_factor, search_depth, ""))
    return res
    
    

def generate_representation_for_word(explored_urls, url, branching_factor, search_depth, representation):
    '''
    Crawl one wikipedia article and, recursively, a share of the articles it links to.

    explored_urls: set, of urls that have already been explored; it is updated in place
    url: the url we wish to find the wikipedia article on,
    if word is multiple words, it will appear as /wiki/computer_science, /wiki/computer_graphics, etc
    branching_factor: float, in range(0, 1), which percentage of the links we want to traverse at each step
    (at most 8 links are followed per page regardless of this value)
    search_depth: int, the maximum depth of link traversals starting from word
    representation: str, text collected so far; the text of this page is appended to it

    return: (links, representation)
    links: set of explored urls (an empty set when nothing was crawled)
    representation: string
    '''
    # "<= 0" rather than "== 0": a negative depth must stop the recursion too.
    if search_depth <= 0 or url in explored_urls:
        return (set(), "")
    explored_urls.add(url)
    links = get_links_for_url(base_url + url)
    if links is None: # Only if the url is not valid
        return (explored_urls, representation)
    text = get_text_for_url(base_url + url)
    # get_text_for_url returns None when the page fails to load; skip the text
    # in that case instead of raising TypeError on str + None.
    if text is not None:
        representation += " " + text
    next_links = links[:min(8, int(branching_factor * len(links)))]

    res = (explored_urls, representation)
    for link in next_links:
        res = reduce(res, generate_representation_for_word(explored_urls, link, branching_factor, search_depth - 1, ""))

    return res

In [None]:
# Example crawl for two terms: branching factor 0.05, search depth 2 (performs network requests).
rep_for_cs_course = generate_representation_for_course_description(["Computer_science", "biology"], 0.05, 2, "")

In [None]:
# Display the (explored urls, collected text) pair produced by the example crawl.
rep_for_cs_course

In [None]:
#rep_for_Computer_science = generate_representation_for_word(set(), '/wiki/Computer_science', 0.05, 2, "")
# rep_for_Computer_science

In [None]:
# Load the course roster JSON; the cells below index it by keys such as 'CS',
# each holding a list of course records.
with open("../../../data/courseroster/full_json.txt") as f:
    cornell_course_descriptions = json.load(f)

In [None]:
# Display the top-level keys of the roster.
cornell_course_descriptions.keys()

In [None]:
# Display the first record stored under 'CS' to inspect its fields.
cornell_course_descriptions['CS'][0]

In [None]:
# Build the TF-IDF corpus: one cleaned description per course, with the matching
# "<subject> <number>" code stored at the same index in course_codes.
corpus = []
course_codes = []
all_major_courses = [cornell_course_descriptions[key] for key in cornell_course_descriptions.keys()]
# Print a size only; dumping the raw records floods the notebook output.
print(len(all_major_courses))
all_courses = []
for major_courses in all_major_courses:
    all_courses += major_courses

for course_data in all_courses:
    course_number = course_data['courseNumber']
    course_desc = course_data['description']
    cleaned_course_desc = clean_text_for_vectorizer(course_desc)
    # clean_text_for_vectorizer returns a str or None, so the former "!= []" comparison
    # was always true; only courses without a description are skipped.
    # NOTE(review): blank descriptions are still added, as before, so that indices stay
    # aligned with any previously saved artefacts - confirm whether they should be dropped.
    if cleaned_course_desc is not None:
        corpus.append(cleaned_course_desc)
        course_codes.append(course_data['subject'] + ' ' + course_number)

In [None]:
# corpus and course_codes are appended to together, so the two lengths should match.
print(len(corpus))
# NOTE(review): the index 4000 is hard-coded and raises IndexError when the corpus has 4000 entries or fewer.
print(corpus[4000])
print(len(course_codes))

In [None]:
# Fit TF-IDF on the cleaned descriptions; X is the document-term matrix (one row per corpus entry).
vectorizer = TfidfVectorizer(lowercase= True, stop_words='english', max_df=0.7, min_df = 2, smooth_idf=True)
X = vectorizer.fit_transform(corpus)
print(X.shape)
# Dense copy of the same matrix; reuses X instead of fitting the vectorizer a second time.
doc_by_vocab = X.toarray()

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [None]:
def get_top_n_tfidf_terms_from_text(vectorizer, text, n):
    """Return the highest-scoring TF-IDF terms of a text.

    vectorizer: fitted vectorizer exposing transform() and either
    get_feature_names_out() (newer scikit-learn) or get_feature_names().
    text: str or None, the (already cleaned) text to score.
    n: int, maximum number of terms to return.

    Returns: [] when text is None; otherwise a numpy array of terms sorted by
    decreasing score, holding at most min(n, number of whitespace-separated
    words in text) entries. Only terms with a positive score are returned, so
    vocabulary words that do not occur in the text never pad the result.
    """
    if text is None:
        return []
    # get_feature_names() was removed from newer scikit-learn releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    feature_array = np.asarray(feature_names)
    scores = vectorizer.transform([text]).toarray().ravel()
    ranked = np.argsort(scores)[::-1]
    # Zero-score entries are terms absent from the text; ranking them would
    # return arbitrary vocabulary words.
    ranked = ranked[scores[ranked] > 0]
    limit = min(len(text.split()), n)
    return feature_array[ranked[:limit]]

def get_top_n_important_terms_from_text(vectorizer, text, n):
    """Placeholder that is not implemented yet: ignores its arguments and returns None."""
    return None
    

In [None]:
# Sample course descriptions used to try out the fitted vectorizer.
# math2940 is already lowercase and free of punctuation; the others are raw text.
cs4110 = "Introduction to the design of systems programs, with emphasis on multiprogrammed operating systems. Topics include concurrency, synchronization, deadlocks, memory management, protection, input-output methods, networking, file systems and security. The impact of network and distributed computing environments on operating systems is also discussed."
cs2300 = "Web programming requires the cooperation of two machines: the one in front of the viewer (client) and the one delivering the content (server). INFO 1300 concentrates almost exclusively on the client side. The main emphasis in INFO 2300 is learning about server side processing. Students begin with a short overview of the PHP server-side scripting language, then look at interactions with databases, learning about querying via the database language SQL. Through a succession of projects, students learn how to apply this understanding to the creation of an interactive, data-driven site via PHP and the MYSQL database. Also considered are technologies such as Javascript and Ajax and techniques to enhance security and privacy. Design and usability issues are emphasized. A major component of the course is the creation of a substantial web site."
cs2110 = "Intermediate programming in a high-level language and introduction to computer science. Topics include object-oriented programming (classes, objects, subclasses, types), graphical user interfaces, algorithm analysis (asymptotic complexity, big O notation), recursion, testing, program correctness (loop invariants), searching/sorting, data structures (lists, trees, stacks, queues, heaps, search trees, hash tables, graphs), graph algorithms. Java is the principal programming language."
psych1101 = "This course provides an introduction to the science of the mind.  Everyone knows what it's like to think and perceive, but this subjective experience provides little insight into how minds emerge from physical intities like brains.  To address this issue, cognitive science integrates work from at least five disciplines: Psychology, Neuroscience, Computer Science, Linguistics, and Philosophy.  This course introduces students to the insights these disciplines offer into the workings of the mind by exploring visual perception, attention, memory, learning, problem solving, language, and consciousness."
math2940 = "linear algebra and its applications  topics include matrices  determinants  vector spaces  eigenvalues and eigenvectors  orthogonality and inner product spaces  applications include brief introductions to difference equations  markov chains  and systems of linear ordinary differential equations  may include computer use in solving problems"

# Show up to 15 top TF-IDF terms of the cleaned cs2300 description.
print(get_top_n_tfidf_terms_from_text(vectorizer, clean_text_for_vectorizer(cs2300), 15))

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Rank every corpus document by cosine similarity to the cs2300 sample description.
test_x = vectorizer.transform([cs2300])
print(test_x.shape)

res = cosine_similarity(X, test_x).flatten()
top_res = np.argsort(res)[::-1]
print(top_res[0:15])
# A dedicated loop variable: reusing "res" here would overwrite the similarity
# array with an int and leave the cell's state misleading after it has run.
for doc_index in top_res[0:15]:
    print(corpus[doc_index])
    print("\n")

In [None]:
# pickle.dump(vectorizer, open("vectorizer.pkl", "wb"))
# pickle.dump(X, open("tdm.pkl", "wb"))
#pickle.dump(corpus, open("corpus.pkl", "wb"))
# Persist the course codes; the with-block flushes and closes the file deterministically.
with open("course_codes.pkl", "wb") as f:
    pickle.dump(course_codes, f)

In [None]:
# Flatten the per-key course lists of the roster into a single list, in key order.
major_codes = list(cornell_course_descriptions.keys())
print(major_codes)
all_courses = []
for major_code in major_codes:
    all_courses.extend(cornell_course_descriptions[major_code])
print(len(all_courses))

In [None]:
# Represent every course by up to 20 of its top TF-IDF terms joined with spaces,
# keyed by "<subject> <number>". Courses without a description map to "".
course_representations = {}
for course_data in all_courses:
    course_number = course_data['courseNumber']
    dept = course_data['subject']
    course_desc = course_data['description']
    cleaned_course_desc = clean_text_for_vectorizer(course_desc)
    top_tfidf_words = get_top_n_tfidf_terms_from_text(vectorizer, cleaned_course_desc, 20)
    course_representation = ' '.join(top_tfidf_words)
    course_representations[dept + ' ' +str(course_number)] = course_representation
# One summary line instead of several debug prints per course.
print(len(course_representations))

In [None]:
# Display the representation stored for one course as a spot check.
course_representations['CS 3410']

In [None]:
# Persist the representations; the with-block flushes and closes the file deterministically.
with open("all_courses_20_tfidf_representations.p", "wb") as f:
    pickle.dump(course_representations, f)

In [None]:
# Collect tags: for every roster key, take up to 50 top TF-IDF terms of all its course
# descriptions joined together, and keep each distinct term once, in first-seen order.
tag_json = {'tags': []}
# A set gives constant-time membership tests; the order is carried by tag_json['tags'].
added_tags = set()
for major in cornell_course_descriptions.keys():
    major_desc = "".join(
        " " + clean_text_for_vectorizer(course_data['description'])
        for course_data in cornell_course_descriptions[major]
        if course_data['description'] is not None
    )
    top_tfidf_words = get_top_n_tfidf_terms_from_text(vectorizer, major_desc, 50)
    for word in top_tfidf_words:
        # Store plain str values rather than numpy string scalars in the JSON payload.
        word = str(word)
        if word not in added_tags:
            tag_json['tags'].append({'tag': word})
            added_tags.add(word)
print (tag_json['tags'])

In [None]:
# Write the tag list to disk; the with-block guarantees the data is flushed and the file closed.
with open("tag_json.txt", "w") as f:
    json.dump(tag_json, f)