In [None]:
import pandas as pd
import ast
from tabulate import tabulate
import re
import numpy as np
import math

In [None]:
# Create a Pandas dataframe to act as our "table" for now

# Import csv file of scraped data from Class Roster
# Each row is a Cornell class
classes_df = pd.read_csv("./roster_api_data.csv", na_filter= False)

# Find all classes with Subject ENGRD and number 2110

subject = "ENGRD"
number = 2110

# Result is a table with one row

result = classes_df[(classes_df["subject"] == subject) & (classes_df["number"] == number)]

result.loc[result.index[0], "subject"] = "INFO, ECON, SOC, CS"
result.loc[result.index[0], "number"] = "2040, 2090, 2850, 2040"

result = result[(result["subject"].str.contains("ECON")) & (result["number"].str.contains("2040"))]

#result = classes_df[(classes_df["id"] == 366268)]

# Display that table
display(result)

# Since result is a table with one row
#   we need .iloc[0] to choose only that first row
#   then we get the "professors" column and "cast" it to a list
#   using ast.literal_eval
professor_list = ast.literal_eval(result.iloc[0]["professors"])

print(professor_list)



In [None]:
# Import csv file of scraped data from RateMyProfessor.com
ratemyprof_df = pd.read_csv("./ratemyprofessor_api_data.csv", na_filter= False)

# Get only the rows for professors that have reviews and tags
reviewed_professors = ratemyprof_df[ratemyprof_df["review"] != ""]

display(reviewed_professors)

In [None]:
np_dat = pd.read_csv("./roster_api_data.csv", na_filter= False).to_numpy()
# COLUMNS id,subject,number,title,description,outcomes,professors

#headers = ["id", "subject", "number", "title", "description", "outcomes", "professors"]

# tabulate data
#table = tabulate(np_dat, headers, tablefmt="fancy_grid")
#print(table)

In [None]:

#get individual columns as np.arrays

np_ids = np.array(np_dat[:, 0])
np_subject = np.array(np_dat[:, 1])
np_number = np.array(np_dat[:, 2])
np_title = np.array(np_dat[:, 3])
np_descriptions = np.array(np_dat[:, 4])
np_outcomes = np.array(np_dat[:, 5])
np_professors = np.array(np_dat[:, 6])

In [None]:
print(np_outcomes[100])

In [None]:
id_to_index = {}

for i in range(len(np_ids)):
    id_to_index[np_ids[i]] = i

In [None]:
inverted_dict = {}
regex = r'\w+' # regular expression to find words (MAY NEED TO REVISE/EDIT THIS)
    
for i in range(len(np_descriptions)):
#for i in range(len(np_outcomes)):
    #if len(np_outcomes[i]) > 0:
     #   match = ''.join(np_outcomes[i])
    #else:
   #     match = np_descriptions[i]
    toks = re.findall(regex, np_descriptions[i].lower())
    seen = set()
    for t in toks:
        if t in inverted_dict.keys():
            if t not in seen:
                count = toks.count(t)
                inverted_dict[t].append((id_to_index[np_ids[i]], count))
                seen.add(t)
        else:
            count = toks.count(t)
            inverted_dict[t] = [(id_to_index[np_ids[i]], count)]
            seen.add(t)
    seen.clear()        


In [None]:
inverted_dict['the'] #gives list of tuples (doc_index, frequency of 'the' in doc)

In [None]:
min_df = 0
max_df_ratio = 0.17 #tuned this down until it printed out only very common words for a course description
num_docs = np.shape(np_dat)[0]

idf_dict = {}

for t in inverted_dict.keys():
    df = len(inverted_dict[t])
    if float(df/num_docs) < max_df_ratio:
        idf_dict[t] = math.log(num_docs/(1 + df), 2)
    else:
        print(t)

In [None]:
norms = np.zeros(num_docs)

for i in idf_dict:
    for tup in inverted_dict[i]:
        desc_idx = tup[0]
        desc_term_freq = tup[1]
        norms[desc_idx] += (desc_term_freq * idf_dict[i]) ** 2
        
norms = np.sqrt(norms)

In [None]:
original_query = "computer programming"

query = original_query.split()
tuples = list()
    
query_norm_sum = 0

for q in query:
    if q in idf_dict.keys():
        q_count = query.count(q)
        q_idf = idf_dict[q]
        query_norm_sum += (q_count*q_idf) ** 2
            
query_norm = math.sqrt(query_norm_sum)
    
doc_scores = {}

for q in query: #iterate over each query term
    if q in idf_dict.keys(): #if q has inverted doc frequency val
        for (doc_idx, value) in inverted_dict[q]: #iterate over each tuple in inverted_index[query_term]
            if doc_idx not in doc_scores.keys():
                doc_scores[doc_idx] = query.count(q) * idf_dict[q] * value #begin accumulator
            else:
                doc_scores[doc_idx] += query.count(q) * idf_dict[q] * value #add to accumulator
            #Additional score for query term in title
           # score_boost = 0.1
            #if q in np_title[doc_idx]:
             #   print(q, np_title[doc_idx])
              #  doc_scores[doc_idx] += score_boost
    
    #GET FROM DICT TO LIST OF TUPLES WHILE DIVIDING BY NORMS
    
for doc_idx, value in doc_scores.items():
    tuples.append((value/(query_norm*norms[doc_idx]), doc_idx)) 
        
tuples = sorted(tuples, key=lambda x: x[0], reverse=True)


##### THIS WHOLE BLOCK IS COPIED IN THE NEXT ONE DOWN AS A FUNCITON cosine_sim

In [None]:
def cosine_sim(original_query):
    query = original_query.split()
    tuples = list()
    
    query_norm_sum = 0

    for q in query:
        if q in idf_dict.keys():
            q_count = query.count(q)
            q_idf = idf_dict[q]
            query_norm_sum += (q_count*q_idf) ** 2
            
    query_norm = math.sqrt(query_norm_sum)
    
    doc_scores = {}

    for q in query: #iterate over each query term
        if q in idf_dict.keys(): #if q has inverted doc frequency val
            for (doc_idx, value) in inverted_dict[q]: #iterate over each tuple in inverted_index[query_term]
                if doc_idx not in doc_scores.keys():
                    doc_scores[doc_idx] = query.count(q) * idf_dict[q] * value #begin accumulator
                else:
                    doc_scores[doc_idx] += query.count(q) * idf_dict[q] * value #add to accumulator
                #Additional score for query term in title
               # score_boost = 0.1
                #if q in np_title[doc_idx]:
                 #   print(q, np_title[doc_idx])
                  #  doc_scores[doc_idx] += score_boost
    
        #GET FROM DICT TO LIST OF TUPLES WHILE DIVIDING BY NORMS
    
    for doc_idx, value in doc_scores.items():
        tuples.append((value/(query_norm*norms[doc_idx]), doc_idx)) 
        
    tuples = sorted(tuples, key=lambda x: x[0], reverse=True)
    return tuples

In [67]:
def cosine_sim_class(class_tag): #input is of the form 'INFO 4300' or 'INFO4300'
    
    subject = "".join(re.split("[^a-zA-Z]*", class_tag)) 
    number = int("".join(re.split("[^0-9]*", class_tag)))

    result = classes_df[(classes_df["subject"] == subject) & (classes_df["number"] == number)]
    
    
    
    original_query = result['description'].item()
    
    print(original_query)
    
    query = original_query.split()
    tuples = list()
    
    query_norm_sum = 0

    for q in query:
        if q in idf_dict.keys():
            q_count = query.count(q)
            q_idf = idf_dict[q]
            query_norm_sum += (q_count*q_idf) ** 2
            
    query_norm = math.sqrt(query_norm_sum)
    
    doc_scores = {}

    for q in query: #iterate over each query term
        if q in idf_dict.keys(): #if q has inverted doc frequency val
            for (doc_idx, value) in inverted_dict[q]: #iterate over each tuple in inverted_index[query_term]
                if doc_idx not in doc_scores.keys():
                    doc_scores[doc_idx] = query.count(q) * idf_dict[q] * value #begin accumulator
                else:
                    doc_scores[doc_idx] += query.count(q) * idf_dict[q] * value #add to accumulator
                #Additional score for query term in title
               # score_boost = 0.1
                #if q in np_title[doc_idx]:
                 #   print(q, np_title[doc_idx])
                  #  doc_scores[doc_idx] += score_boost
    
        #GET FROM DICT TO LIST OF TUPLES WHILE DIVIDING BY NORMS
    
    for doc_idx, value in doc_scores.items():
        tuples.append((value/(query_norm*norms[doc_idx]), doc_idx)) 
        
    tuples = sorted(tuples, key=lambda x: x[0], reverse=True)
    return tuples

In [69]:
cosine_sim_class("INFO 4300")

How to make sense of the vast amounts of information available online, and how to relate it and to the social context in which it appears? This course introduces basic tools for retrieving and analyzing unstructured textual information from the web and social media. Applications include information retrieval (with human feedback), sentiment analysis and social analysis of text. The coursework will include programming projects that play on the interaction between knowledge and social factors.


[(0.2430884969009331, 956),
 (0.1538012704074627, 2706),
 (0.13258361531523577, 4164),
 (0.13192279688331224, 1132),
 (0.12848188880802422, 6553),
 (0.12304136566202581, 1801),
 (0.11600441958326378, 2702),
 (0.11420795624545162, 4055),
 (0.11343238324623284, 3619),
 (0.10766986359962369, 6153),
 (0.10487463483927247, 5760),
 (0.10438980269450861, 2197),
 (0.10107560879559316, 6935),
 (0.09944406103991851, 4422),
 (0.09845275518100138, 3567),
 (0.09697177628482306, 5166),
 (0.09515122588767835, 1109),
 (0.09249632810791074, 5769),
 (0.09238093913120059, 1901),
 (0.09132890593843741, 6790),
 (0.0883499568520541, 2704),
 (0.0880741042551672, 6897),
 (0.0879341459392197, 978),
 (0.08638474053327358, 7035),
 (0.08296170424619226, 5645),
 (0.08279343374153658, 566),
 (0.0821608748170901, 4061),
 (0.08123042479687351, 2717),
 (0.0809342735167813, 963),
 (0.08002748978452369, 6694),
 (0.07998162070210978, 5676),
 (0.07973665303457798, 4154),
 (0.07973665303457798, 4162),
 (0.07922249604451782

In [47]:
#original_query = "computer programming"
#tuples = cosine_sim(original_query)

original_query = "ENGRD2110"

tuples = cosine_sim_class(original_query)



print("#" * len(original_query))
print(original_query)
print("#" * len(original_query))

for score, doc_idx in tuples[:10]:
    print("\n\n")
    print("Score: %s \n" % (score))
    print("Class: %s %s %s \n" % (np_subject[doc_idx], np_number[doc_idx], np_title[doc_idx]))
    print("Description: %s \n" % np_descriptions[doc_idx])
    print("\n\n")

Intermediate programming in a high-level language and introduction to computer science. Topics include object-oriented programming (classes, objects, subclasses, types), graphical user interfaces, algorithm analysis (asymptotic complexity, big "O" notation), recursion, testing, program correctness (loop invariants), searching/sorting, data structures (lists, trees, stacks, queues, heaps, search trees, hash tables, graphs), graph algorithms. Java is the principal programming language.
#########
ENGRD2110
#########



Score: 0.12783628062946864 

Class: ORIE 7390 Selected Topics in Mathematical Programming 

Description: Current research topics in mathematical programming. 







Score: 0.12783628062946864 

Class: ORIE 7391 Selected Topics in Mathematical Programming 

Description: Current research topics in mathematical programming. 







Score: 0.12439170344633148 

Class: ECE 2400 Computer Systems Programming 

Description: Computer systems programming involves developing software

In [60]:
#SVD for query expansion

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse.linalg import svds
from sklearn.preprocessing import normalize


In [61]:
vectorizer = TfidfVectorizer(stop_words = 'english', max_df = max_df_ratio,
                            min_df = 0)
my_matrix = vectorizer.fit_transform([x for x in np_descriptions]).transpose()

In [62]:
print(my_matrix.shape)

(21308, 7346)


In [63]:
u, s, v_trans = svds(my_matrix, k=100)
words_compressed, _, docs_compressed = svds(my_matrix, k=40)
docs_compressed = docs_compressed.transpose()

word_to_index = vectorizer.vocabulary_
index_to_word = {i:t for t,i in word_to_index.items()}

words_compressed = normalize(words_compressed, axis = 1)

In [64]:
def closest_words(word_in, k = 10):
    if word_in not in word_to_index: return "Not in vocab."
    sims = words_compressed.dot(words_compressed[word_to_index[word_in],:])
    asort = np.argsort(-sims)[:k+1]
    return [(index_to_word[i],sims[i]/sims[asort[0]]) for i in asort[1:]]

In [65]:
closest_words("linguistics")

[('ascriptions', 0.780659839194535),
 ('pragmatics', 0.7774734837194036),
 ('plagiarism', 0.7693961522904598),
 ('syntax', 0.7683434542332689),
 ('sociolinguistics', 0.7543891547426924),
 ('resembling', 0.742089059263291),
 ('phonology', 0.7343799589942953),
 ('sentences', 0.7291930116654517),
 ('semantics', 0.7287488875751992),
 ('archaisms', 0.7219146140072756)]

In [56]:
closest_words("finance")

[('investing', 0.9636650474674504),
 ('investment', 0.9591280347435008),
 ('valuation', 0.9576432627742418),
 ('mergers', 0.9526677362660698),
 ('investments', 0.9525508171390219),
 ('capital', 0.9489427449185511),
 ('stock', 0.948834424609913),
 ('financial', 0.9465555788115145),
 ('debt', 0.9440743906913157),
 ('acquisitions', 0.9415530494795988)]