## This notebook:
- Tries a deep learning method (sentence embeddings) for content-based filtering


In [1]:
import pandas as pd
import numpy as np
import pickle
import sklearn
import faiss
import spacy
from sentence_transformers import SentenceTransformer
import scipy.spatial

In [2]:
# Raw inputs: the Michigan Online course export plus the two merged term
# tables (f_21 / w_22) that are later combined by concat_prepare.
online, f_21, w_22 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'assets/original/2021-10-19-MichiganOnline-courses.csv',
        'assets/f_21_merge.csv',
        'assets/w_22_merge.csv',
    )
)

In [3]:
def concat_prepare(f_df, w_df):
    """Combine the fall and winter course tables into one de-duplicated frame.

    Each input is tagged with a ``semester`` label ('fall' / 'winter'), the
    two are concatenated, missing values are replaced by empty strings and
    only the first row per ``course`` is kept.  A ``text`` column is then
    built by joining ``Subject``, ``Course Title``, ``sub_title`` and
    ``description`` with single spaces.

    Parameters
    ----------
    f_df, w_df : pandas.DataFrame
        Fall and winter tables.  Both must provide the columns ``course``,
        ``Subject``, ``Course Title``, ``sub_title`` and ``description``.
        Neither frame is modified by this function.

    Returns
    -------
    pandas.DataFrame
        The combined table with a fresh 0..n-1 index and the extra
        ``semester`` and ``text`` columns.
    """
    # ``assign`` returns labelled copies, so the caller's frames are left
    # untouched and re-running the cell cannot accumulate hidden state.
    df = pd.concat([
        f_df.assign(semester='fall'),
        w_df.assign(semester='winter'),
    ])

    # Clean: blank out missing values, keep the first row per course
    df = df.fillna('').drop_duplicates(subset=['course']).reset_index(drop=True)

    # Remove descriptions that carry no information.  Assign the result back
    # instead of calling ``replace(..., inplace=True)`` on the column: the
    # chained in-place form does not update ``df`` under pandas Copy-on-Write.
    df['description'] = df['description'].replace('(Hybrid, Synchronous)', '')

    # Merge all the text data into one searchable string per course
    df['text'] = (
        df['Subject'] + ' '
        + df['Course Title'] + ' '
        + df['sub_title'] + ' '
        + df['description']
    )

    return df

# Combined fall + winter course table used by the rest of the notebook
fw = concat_prepare(f_df=f_21, w_df=w_22)


In [4]:
def store_model(df, path='corpus_embeddings.pkl', model_name='bert-base-nli-mean-tokens'):
    """Embed every entry of ``df['text']`` and cache the result as a pickle.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``text`` column; each value is encoded as one sentence.
    path : str, optional
        Destination of the pickle file.  Defaults to the file name the
        notebook reads back later ('corpus_embeddings.pkl').
    model_name : str, optional
        Name of the SentenceTransformer model used for encoding.

    Returns
    -------
    None
        The function only writes ``{'corpus': ..., 'embeddings': ...}`` to
        ``path``.
    """
    corpus = df['text'].tolist()
    embedder = SentenceTransformer(model_name)
    corpus_embeddings = embedder.encode(corpus)

    # Store the texts next to their vectors so a later cell can verify that
    # the cache still lines up with the DataFrame it is used with.
    with open(path, "wb") as f_out:
        pickle.dump(
            {'corpus': corpus, 'embeddings': corpus_embeddings},
            f_out,
            protocol=pickle.HIGHEST_PROTOCOL,
        )
    

## BERT Sentence Transformer

In [6]:
%%time

# Read the cached sentences and their embeddings back from disk
with open('corpus_embeddings.pkl', "rb") as fIn:
    stored_data = pickle.load(fIn)

# Unpack once the file handle is closed; nothing below needs the open file
stored_corpus, stored_embeddings = stored_data['corpus'], stored_data['embeddings']

CPU times: user 5.45 ms, sys: 38.7 ms, total: 44.1 ms
Wall time: 54.6 ms


In [7]:
# Sanity check: the cached corpus should hold one entry per row of fw
len(stored_corpus), len(fw['text'])

(8334, 8334)

## Deep learning content-based filtering

In [None]:
def dl_free_text(df, embeddings, corpus, query, k = 10, embedder = None):
    """Recommend the ``k`` rows of ``df`` closest to a free-text query.

    The stored embeddings are loaded into a flat FAISS L2 index, the query is
    encoded with a SentenceTransformer model and the rows of ``df`` at the
    positions returned by the index search are handed back.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset with a ``text`` column.  Row positions are assumed to line up
        with ``embeddings`` (only the corpus length is actually checked).
    embeddings : array-like
        Stored sentence embeddings, one vector per row of ``df``.
    corpus : list of str
        Texts the embeddings were computed from (stored corpus or
        ``df['text'].tolist()`` -- should be the same).
    query : list of str or str
        A single query, e.g. ``['Korean history']``.
    k : int, optional
        Number of recommendations.  Capped at the number of indexed vectors.
    embedder : optional
        Already-loaded model exposing ``encode``.  When omitted, the
        'bert-base-nli-mean-tokens' SentenceTransformer is loaded, which is
        what happened on every call before this parameter existed.

    Returns
    -------
    pandas.DataFrame or None
        The matching rows in the order returned by the index search, or
        ``None`` (after printing a message) when ``corpus`` and
        ``df['text']`` differ in length.

    Raises
    ------
    ValueError
        If ``query`` encodes to anything other than exactly one vector.
    """
    # If the len of corpus doesn't match the len of input df text, the
    # positions returned by the index cannot be mapped back onto df.
    if len(corpus) != len(df['text']):
        print('Stored corpus and the text of the input dataset are different.')
        return None

    # Nothing to recommend from an empty table or for a non-positive k
    if len(df) == 0 or k <= 0:
        return df.iloc[:0]

    # FAISS expects a contiguous float32 matrix; take the dimension from the
    # data instead of hard-coding the 768 of one particular model.
    matrix = np.ascontiguousarray(np.stack(embeddings, axis=0), dtype=np.float32)
    dim = matrix.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(matrix)

    # Asking for more neighbours than indexed vectors yields -1 placeholders,
    # which iloc would silently turn into "last row".
    k = min(k, index.ntotal)

    if embedder is None:
        embedder = SentenceTransformer('bert-base-nli-mean-tokens')

    # reshape(-1, dim) accepts both a one-element list and a bare string
    query_vectors = np.asarray(embedder.encode(query), dtype=np.float32).reshape(-1, dim)
    if query_vectors.shape[0] != 1:
        raise ValueError('dl_free_text expects exactly one query string.')

    # Single search (the index was previously searched twice per call)
    _, indices = index.search(query_vectors, k)

    print("Query:", query)

    return df.iloc[indices[0], :]

In [10]:
# Example: the five courses closest to a free-text query.
# The recommender defined in this notebook is dl_free_text.
query1 = ['data analytics python']
dl_free_text(fw, stored_embeddings, stored_corpus, query1, k=5)

Query: ['data analytics python']


Unnamed: 0,Class Nbr,course,Term,Session,Acad Group,Subject,Course Title,description,Component,Time,...,Units,sub_title,credits,requirements_distribution,consent,advisory_prerequisites,other_course_info,repeatability,semester,text
4265,27500,STATS 507,Fall 2021,Regular Academic Session,"Literature, Sci, and the Arts",Statistics (STATS) Open Sections,Data Sci in Python,,LEC,830-10AM,...,3.0,,,,,,,,fall,Statistics (STATS) Open Sections Data Sci in P...
1057,30324,DATASCI 507,Fall 2021,Regular Academic Session,"Literature, Sci, and the Arts",Data Science (DATASCI) Open Sections,Data Sci in Python,,LEC,830-10AM,...,3.0,,,,,,,,fall,Data Science (DATASCI) Open Sections Data Sci ...
2364,10485,IOE 373,Fall 2021,Regular Academic Session,Engineering,Industrial And Operations Engineering (IOE) Op...,Data Analyt Tls&Tecn,Introduction to the computing tools necessary ...,LAB,230-330PM,...,4.0,,4.0,BS,,ENGR 101.,,May not be repeated for credit.,fall,Industrial And Operations Engineering (IOE) Op...
4056,29444,SI 564,Fall 2021,Regular Academic Session,Information,Information (SI) Open Sections,SQL & Databases,,LEC,4-530PM,...,1.5,,,,,,,,fall,Information (SI) Open Sections SQL & Databases
4093,34852,SI 721,Fall 2021,Regular Academic Session,Information,Information (SI) Open Sections,Data Mining,,DIS,10-1130AM,...,3.0,,,,,,,,fall,Information (SI) Open Sections Data Mining
