# Text Mining Project: AUC Courses Recommender
## Desrciption
In this notebook you will find the source code for the Amsterdam University College (AUC) Course Recommender which was made for a Project for the course called Text Mining. 

## Code
Imports:

In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import string
from sklearn.metrics.pairwise import cosine_similarity
from gensim.parsing.preprocessing import remove_stopwords
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [2]:
pd.options.mode.chained_assignment = None

### Loading and Preprocessing the Data

Loading the data in as a DataFrame

In [5]:
data = pd.read_csv("datasets/recommender_dataset.csv")

In [7]:
print(len(data))

3812


Droping the rows which had noting in the row in the course_catalogue_number column.

In [9]:
courses = data.dropna(subset=['course_catalogue_number', 'is_part_of', 'language_of_instruction', 'course_description'])

In [11]:
print(len(courses))

3345


Getting out all of the duplicates form the data, just in case there are any.

In [14]:
indices = pd.Series(courses.index, index=courses['course_catalogue_number']).drop_duplicates()

Removing the stop words from columns: "is_part_of" and "course_description"

In [19]:
courses = courses.reset_index(drop=True)
for i in range(1, int(len(courses))):
    courses.loc[i, 'course_name'] = courses.loc[i, 'course_name'].lower()
    courses.loc[i, 'college_graduate'] = courses.loc[i, 'college_graduate'].lower()
    courses.loc[i, 'language_of_instruction'] = courses.loc[i, 'language_of_instruction'].lower()
    courses.loc[i, 'is_part_of'] = courses['is_part_of'][i].lower()
    courses.loc[i, 'is_part_of'] = remove_stopwords(courses.loc[i, 'is_part_of'])
    courses.loc[i, 'course_description'] = courses.iloc[i, 7].lower()
    courses.loc[i, 'course_description'] = remove_stopwords(courses.loc[i, 'course_description'])

In [21]:
print(courses['course_name'][3000])

the climate emergency. the history of a crisis


Removing punctuation from the text

In [24]:
for i in range(len(courses)):
    courses.loc[i, 'course_name'] = re.sub(r'[^\w\s]+', '', courses['course_name'][i])
    courses.loc[i, 'is_part_of'] = re.sub(r'[^\w\s]+', '', courses['is_part_of'][i])
    courses.loc[i, 'college_graduate'] = re.sub(r'[^\w\s]+', '', courses['college_graduate'][i])
    courses.loc[i, 'course_description'] = re.sub(r'[^\w\s]+', '', courses['course_description'][i])

In [26]:
print(len(courses))

3345


In [28]:
print(courses['course_name'][3000])

the climate emergency the history of a crisis


### Tokenization

Tokenization of columns: 'course_name', 'is_part_of', 'college_graduate', and 'course_description'

In [30]:
for i in range(len(courses)):
    courses.loc[i, 'course_name'] = nltk.tokenize.WordPunctTokenizer().tokenize(courses['course_name'][i])
    courses.loc[i, 'is_part_of'] = nltk.tokenize.WordPunctTokenizer().tokenize(courses['is_part_of'][i])
    courses.loc[i, 'college_graduate'] = nltk.tokenize.WordPunctTokenizer().tokenize(courses['college_graduate'][i])
    courses.loc[i, 'course_description'] = nltk.tokenize.WordPunctTokenizer().tokenize(courses['course_description'][i])

Making a new column with all of the text from the row.

In [32]:
courses['combined_text'] = courses['course_name'] + courses['is_part_of'] + courses['college_graduate'] + courses['course_description']

### Vectorization

Tagging the column courses['combined_text'] in order to be able to vectorize it with doc2vec.

In [36]:
tagged_data = [TaggedDocument(words=word_tokenize(str(_d)), tags=[str(i)]) for i, _d in enumerate(courses['combined_text'])]

Below I am vectorizing the column courses['combined_text'].

In [38]:
params = {
    'vector_size': 100, # dimension of embeddings
    'window': 5, # window -/+ before and after focus word
    'epochs': 5, # number of iterations over the corpus
    'min_count': 5, # filter on words whose frequency is below this count
    'workers': 4, # how many cores to use
    'alpha': 0.05 # initial learning rate for SGD. This is lambda in the class notes
}

model = Doc2Vec(**params)
  
model.build_vocab(tagged_data)

max_epochs = 100

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=1)
    model.alpha = 0.2
    model.min_alpha = model.alpha

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration

### Recommender

In [87]:
def recommend_courses(li_course_ids, model, courses_df, top_n=5):
    #selecting the courses from the data
    selected_texts = []
    for cid in li_course_ids:
        row = courses_df[courses_df['course_catalogue_number'] == cid]
        if not (len(row) == 0):
            selected_texts.append(word_tokenize(str(row.iloc[0]['combined_text'])))

    #getting the vectors in form the doc2vec model and making one vector out of all the courses in the li_course_ids
    vectors = [model.infer_vector(tokens) for tokens in selected_texts]
    av_vector = np.mean(vectors, axis=0).reshape(1, -1)
    all_vectors = np.array([model.dv[i] for i in range(len(model.dv))])
    
    #calculating the cosine similarities and flattening them to one dimension
    cosine_sim = cosine_similarity(av_vector, all_vectors).flatten()
    
    #making the cosine similarities a list and sorting it
    sim_scores = list(enumerate(cosine_sim))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    #removing the courses from the sim_scores which were in the li_course_ids
    idex_to_id = courses_df['course_catalogue_number'].tolist()
    selected_indicies = [idex_to_id.index(cid) for cid in li_course_ids if cid in idex_to_id]
    sim_scores = [s for s in sim_scores if s[0] not in selected_indicies]
    
    #getting the top 5 courses from the similaritie scores
    course_indices = [i[0] for i in sim_scores[:top_n]]
    
    #returning the 
    return courses_df.iloc[course_indices][['course_name', 'course_catalogue_number']]

### Testing

Opening of the test dataset.

In [79]:
majors = []
tracks = []
li_coourse_ids = []
with open("datasets/test_set.txt", 'r') as f:
    for l in f:
        if l[0] == '[':
            li_coourse_ids.append(eval(l))
        elif l.isupper():
            majors.append(l)
        elif l.islower():
            tracks.append(l)

In [131]:
print(majors[0])
print(tracks[0])
recommend_courses(li_coourse_ids[0], model, courses)

SCIENCE

math



Unnamed: 0,course_name,course_catalogue_number
1741,"[machine, learning]",6012B0809Y
1746,"[machine, learning, 2]",52042MAL6Y
554,"[community, project]",900201CICY
1572,"[introduction, to, machine, learning, 2]",50824ITM6Y
1206,"[french, 2, advanced]",11112L366Y


In [123]:
print(tracks[1])
recommend_courses(li_coourse_ids[1], model, courses)

biomed



Unnamed: 0,course_name,course_catalogue_number
394,"[biomedical, systems, biology]",5234BISB6Y
2181,"[nutrition, and, health]",900271SCIY
2592,"[research, master, thesis]",7055R301KY
3330,"[workshop, selfcare, in, academia, and, beyond]",7525W008AY
1298,"[global, mental, health]",739400140Y


In [143]:
print(tracks[2])
recommend_courses(li_coourse_ids[2], model, courses)

physics



Unnamed: 0,course_name,course_catalogue_number
2209,"[optimisation, of, business, processes]",53348OPB6Y
2274,"[philosophy, of, science]",5354PHSC6Y
2142,"[natural, resource, economics]",6414M0505Y
2604,"[research, masters, internship, linguistics, and]",176527036Y
1703,"[literary, ecologies]",900202HUMY


In [167]:
print(tracks[3])
recommend_courses(li_coourse_ids[3], model, courses)

bio/environment



Unnamed: 0,course_name,course_catalogue_number
2276,"[philosophy, of, science]",7311E0030Y
399,[biotechnology],5234BIOT6Y
3317,"[words, sounds, images, the, anthropology, of,...",7312E0056Y
2524,"[quantum, information, theory]",5334QUIT8Y
445,"[care, and, capitalism, the, welfare, state, a...",111221316Y


In [169]:
print(tracks[4])
recommend_courses(li_coourse_ids[4], model, courses)

information/neuro



Unnamed: 0,course_name,course_catalogue_number
2194,"[objects, in, context, an, interdisciplinary]",138120016Y
1024,"[esg, advanced, environment]",6614ZE015Y
1947,"[masterclasses, green, life, sciences]",5224MGLS3Y
2969,"[team, development, through, action, research]",7204MA33XY
110,"[advanced, private, law, in, context]",3254AVLCVY


In [153]:
print(majors[1])
print(tracks[5])
recommend_courses(li_coourse_ids[5], model, courses)

SOCIAL SCIENCE

economics



Unnamed: 0,course_name,course_catalogue_number
1923,"[masters, thesis, philosophy, of, the, humanit...",189429002Y
2392,"[presentation, anthropology, of, health, care,...",7313TP090Y
2082,"[modern, hebrew, language, acquisition, 1]",126111046Y
754,"[curating, art, and, cultures, internship]",151618055Y
3210,"[tutoring, and, study, guidance, bachelors, in...",11802A025Y


In [155]:
print(tracks[6])
recommend_courses(li_coourse_ids[6], model, courses)

psychology/economics



Unnamed: 0,course_name,course_catalogue_number
1191,"[foundation, appreciating, the, complexity, of]",738100001Y
2569,"[religion, violence, conflict, resolution]",153416027Y
1355,"[history, and, philosophy, of, the, humanities]",187421516Y
2080,"[modern, greek, language, and, literature, 1]",130221106Y
2860,"[spectroscopic, analysis]",5254SPAN6Y


In [161]:
print(tracks[7])
recommend_courses(li_coourse_ids[7], model, courses)

political science/law



Unnamed: 0,course_name,course_catalogue_number
758,"[curatorial, practices, in, the, contemporary,...",15161E036Y
3054,"[the, politics, of, choreography]",145423106Y
2856,"[specialisation, urban, geography]",73435E521Y
1447,"[internal, conflict, in, clan, societies, peace]",73230233LY
2161,[neurosystems],5052NEU12Y


In [145]:
print(tracks[8])
recommend_courses(li_coourse_ids[8], model, courses)

history/philosophy



Unnamed: 0,course_name,course_catalogue_number
1533,"[interventions, in, sport, and, performance, p...",7204MA36XY
2443,"[professional, skills, science, connect]",5224PSSC2Y
2502,"[research, skills, ethnography, and, beyond]",75250075FY
714,"[criminalistics, applied, to, forensic, chemis...",5274CATF6Y
2423,"[probability, and, statistics]",900229SCIY


In [121]:
print(majors[2])
print(tracks[8])
recommend_courses(li_coourse_ids[8], model, courses)

HUMANITIES

history/philosophy



Unnamed: 0,course_name,course_catalogue_number
10,"[1, current, issues, in, medical, informatics,...",4604MM101Y
167,"[algorithms, beyond, the, worst, case]",5334ABTW6Y
1820,"[master, thesis, ai]",5204MTA48Y
2765,"[seminar, heritage, and, public, history]",13832E016Y
2173,"[nonlinear, partial, differential, equations]",5334NPDE8Y


In [123]:
print(tracks[9])
recommend_courses(li_coourse_ids[9], model, courses)

cultural analysis



Unnamed: 0,course_name,course_catalogue_number
167,"[algorithms, beyond, the, worst, case]",5334ABTW6Y
10,"[1, current, issues, in, medical, informatics,...",4604MM101Y
2765,"[seminar, heritage, and, public, history]",13832E016Y
2173,"[nonlinear, partial, differential, equations]",5334NPDE8Y
1615,"[inverse, problems, in, imaging]",5334IPII6Y


In [203]:
print(tracks[10])
recommend_courses(li_coourse_ids[10], model, courses)

media/film



Unnamed: 0,course_name,course_catalogue_number
936,"[economic, methodology]",6012B0454Y
2946,"[sustainable, and, inclusive, economics]",74040007FY
2807,"[social, movements, change, from, below]",7303S4002Y
786,"[data, analytics]",6314M0413Y
1663,"[language, proficiency, sign, language, of, the]",135221106Y


In [205]:
print(tracks[11])
recommend_courses(li_coourse_ids[11], model, courses)

art history/history



Unnamed: 0,course_name,course_catalogue_number
1357,"[history, lab]",9002601HUY
2537,"[race, class, and, gender, intersectionality]",900374SSCY
2044,"[migration, citizenship]",7332B005AY
2372,"[postcolonial, encounters, in, arts, and, cult...",111221596Y
1635,[journalism],900258HUMY
