## Import Libraries

In [7]:
!pip install openai --quiet
!pip install tenacity --quiet

In [38]:
import json

import pandas as pd
import numpy as np
import re

import openai
from tenacity import retry, wait_random_exponential, stop_after_attempt


### Load Data

In [9]:
## Load the raw course table from CSV (the path is relative to the notebook's working directory)
path = 'data/Coursera.csv'
data = pd.read_csv(path)

In [10]:
## preview the first rows of the raw data
data.head()

Unnamed: 0,course_name,university,difficulty_level,course_rating,course_url,description,skills
0,Write A Feature Length Screenplay For Film Or ...,Michigan State University,Beginner,4.8,https://www.coursera.org/learn/write-a-feature...,Write a Full Length Feature Film Script In th...,Drama Comedy peering screenwriting film D...
1,Business Strategy: Business Model Canvas Analy...,Coursera Project Network,Beginner,4.8,https://www.coursera.org/learn/canvas-analysis...,"By the end of this guided project, you will be...",Finance business plan persona (user experien...
2,Silicon Thin Film Solar Cells,�cole Polytechnique,Advanced,4.1,https://www.coursera.org/learn/silicon-thin-fi...,This course consists of a general presentation...,chemistry physics Solar Energy film lambda...
3,Finance for Managers,IESE Business School,Intermediate,4.8,https://www.coursera.org/learn/operational-fin...,"When it comes to numbers, there is always more...",accounts receivable dupont analysis analysis...
4,Retrieve Data using Single-Table SQL Queries,Coursera Project Network,Beginner,4.6,https://www.coursera.org/learn/single-table-sq...,In this course you�ll learn how to effectively...,Data Analysis select (sql) database manageme...


In [11]:
## list the available columns
data.columns

Index(['course_name', 'university', 'difficulty_level', 'course_rating',
       'course_url', 'description', 'skills'],
      dtype='object')

In [12]:
## count the missing values in each column
print(data.isna().sum())

course_name         0
university          0
difficulty_level    0
course_rating       0
course_url          0
description         0
skills              0
dtype: int64


### Create embeddings for the course names

In [3]:
def get_embeddings(text):
    """Return the OpenAI embedding vector for ``text``.

    Sends ``text`` to the ``text-embedding-ada-002`` model through the
    module-level ``openai`` client and returns the ``embedding`` entry of the
    first item in the response's ``data`` list.

    The request is retried with randomised exponential back-off (at most six
    attempts) so that a transient failure such as a rate limit does not abort
    a long row-by-row embedding run. All exception types are retried, so a
    permanent error (e.g. a bad API key) is only reported after the final
    attempt; the last exception from the API call is then re-raised unchanged.

    Parameters
    ----------
    text : str
        Text to embed.

    Returns
    -------
    The value stored under ``["data"][0]["embedding"]`` in the API response
    (a list of floats for this model).
    """
    # Imported locally so the function does not depend on which cells ran before it.
    from tenacity import retry, stop_after_attempt, wait_random_exponential

    @retry(
        wait=wait_random_exponential(min=1, max=20),
        stop=stop_after_attempt(6),
        reraise=True,  # surface the original API error rather than tenacity.RetryError
    )
    def _request_embedding():
        response = openai.Embedding.create(
            input = text,
            model = "text-embedding-ada-002",
        )
        return response["data"][0]["embedding"]

    return _request_embedding()

In [14]:
## read api key (the context manager closes the file as soon as it has been parsed)
with open('api.json') as key_file:
    key = json.load(key_file)

# Set up your OpenAI API credentials
openai.api_key = key['open_api']

In [6]:
## embed every course name (get_embeddings is called once per row, i.e. one API request each)
data['embedding'] = data['course_name'].map(get_embeddings)

In [37]:
## preview the data again: it now carries the 'embedding' column
data.head()

Unnamed: 0,course_name,university,difficulty_level,course_rating,course_url,description,skills,embedding
0,Write A Feature Length Screenplay For Film Or ...,Michigan State University,Beginner,4.8,https://www.coursera.org/learn/write-a-feature...,Write a Full Length Feature Film Script In th...,Drama Comedy peering screenwriting film D...,"[0.004957424942404032, -0.013018687255680561, ..."
1,Business Strategy: Business Model Canvas Analy...,Coursera Project Network,Beginner,4.8,https://www.coursera.org/learn/canvas-analysis...,"By the end of this guided project, you will be...",Finance business plan persona (user experien...,"[-0.011336499825119972, -0.022729190066456795,..."
2,Silicon Thin Film Solar Cells,�cole Polytechnique,Advanced,4.1,https://www.coursera.org/learn/silicon-thin-fi...,This course consists of a general presentation...,chemistry physics Solar Energy film lambda...,"[0.002505358075723052, -0.006338656414300203, ..."
3,Finance for Managers,IESE Business School,Intermediate,4.8,https://www.coursera.org/learn/operational-fin...,"When it comes to numbers, there is always more...",accounts receivable dupont analysis analysis...,"[-0.0029222306329756975, -0.03425585851073265,..."
4,Retrieve Data using Single-Table SQL Queries,Coursera Project Network,Beginner,4.6,https://www.coursera.org/learn/single-table-sq...,In this course you�ll learn how to effectively...,Data Analysis select (sql) database manageme...,"[-0.016929104924201965, 0.013173501938581467, ..."


In [38]:
## save the csv file with embeddings
## NOTE: CSV stores each embedding list as text, so the column has to be parsed
## back into lists (e.g. with ast.literal_eval) after the file is reloaded
data.to_csv('data/Coursera_embeddings.csv', index=False)

## Testing

In [19]:
## reload the file written by the save cell above ('data/Coursera_embeddings.csv'),
## so the testing section runs from a fresh kernel without a hand-made copy
data = pd.read_csv('data/Coursera_embeddings.csv')

In [20]:
# preview the reloaded data
data.head()

Unnamed: 0,course_name,university,difficulty_level,course_rating,course_url,description,skills,embedding
0,Write A Feature Length Screenplay For Film Or ...,Michigan State University,Beginner,4.8,https://www.coursera.org/learn/write-a-feature...,Write a Full Length Feature Film Script In th...,Drama Comedy peering screenwriting film D...,"[0.004957424942404032, -0.013018687255680561, ..."
1,Business Strategy: Business Model Canvas Analy...,Coursera Project Network,Beginner,4.8,https://www.coursera.org/learn/canvas-analysis...,"By the end of this guided project, you will be...",Finance business plan persona (user experien...,"[-0.011336499825119972, -0.022729190066456795,..."
2,Silicon Thin Film Solar Cells,�cole Polytechnique,Advanced,4.1,https://www.coursera.org/learn/silicon-thin-fi...,This course consists of a general presentation...,chemistry physics Solar Energy film lambda...,"[0.002505358075723052, -0.006338656414300203, ..."
3,Finance for Managers,IESE Business School,Intermediate,4.8,https://www.coursera.org/learn/operational-fin...,"When it comes to numbers, there is always more...",accounts receivable dupont analysis analysis...,"[-0.0029222306329756975, -0.03425585851073265,..."
4,Retrieve Data using Single-Table SQL Queries,Coursera Project Network,Beginner,4.6,https://www.coursera.org/learn/single-table-sq...,In this course you�ll learn how to effectively...,Data Analysis select (sql) database manageme...,"[-0.016929104924201965, 0.013173501938581467, ..."


In [27]:
## confirm the reloaded data includes the 'embedding' column
data.columns

Index(['course_name', 'university', 'difficulty_level', 'course_rating',
       'course_url', 'description', 'skills', 'embedding'],
      dtype='object')

In [44]:
import ast

## convert the stringified embeddings back to lists
## (values that are already lists are kept as they are, so re-running this cell is safe)
data['embedding'] = data['embedding'].apply(
    lambda value: value if isinstance(value, list) else ast.literal_eval(value)
)

In [45]:
def get_embeddings(text):
    """Return the OpenAI embedding vector for ``text``.

    Sends ``text`` to the ``text-embedding-ada-002`` model through the
    module-level ``openai`` client and returns the ``embedding`` entry of the
    first item in the response's ``data`` list.

    The request is retried with randomised exponential back-off (at most six
    attempts) so that a transient failure such as a rate limit does not make
    the caller fail outright. All exception types are retried, so a permanent
    error (e.g. a bad API key) is only reported after the final attempt; the
    last exception from the API call is then re-raised unchanged.

    Parameters
    ----------
    text : str
        Text to embed.

    Returns
    -------
    The value stored under ``["data"][0]["embedding"]`` in the API response
    (a list of floats for this model).
    """
    # Imported locally so the function does not depend on which cells ran before it.
    from tenacity import retry, stop_after_attempt, wait_random_exponential

    @retry(
        wait=wait_random_exponential(min=1, max=20),
        stop=stop_after_attempt(6),
        reraise=True,  # surface the original API error rather than tenacity.RetryError
    )
    def _request_embedding():
        response = openai.Embedding.create(
            input = text,
            model = "text-embedding-ada-002",
        )
        return response["data"][0]["embedding"]

    return _request_embedding()

In [46]:
## function to get the similarity score between embeddings
def get_similarity_score(embedding1, embedding2):
    """Return the inner product of two embeddings.

    For two 1-D vectors of equal length this is their dot product, which
    equals the cosine similarity when both vectors have unit length.
    NOTE(review): OpenAI describes its embeddings as length-normalised;
    confirm before reading the score as a cosine similarity.

    Parameters
    ----------
    embedding1, embedding2 : array-like
        The two vectors to compare.

    Returns
    -------
    The result of ``np.inner(embedding1, embedding2)``.
    """
    score = np.inner(embedding1, embedding2)
    return score

In [47]:
## sanity check: the first embedding should now be a list rather than a string
type(data['embedding'][0])

list

In [48]:
## get the recommendations for a given course
def get_recommendation(user_prompt, data, similarity_score_threshold=0.5, top_n=5):
    """Return the courses whose embeddings are most similar to ``user_prompt``.

    The prompt is embedded with ``get_embeddings`` and compared with every
    entry of ``data['embedding']`` through ``get_similarity_score``. Rows
    scoring below ``similarity_score_threshold`` are discarded, the rest are
    ordered from most to least similar, and the first ``top_n`` are returned.

    The caller's frame is not modified: scoring happens on a copy, so no
    ``similarity_score`` column is left behind on ``data``.

    Parameters
    ----------
    user_prompt : str
        Free-text description of what the user is looking for.
    data : pandas.DataFrame
        Course table with an ``embedding`` column holding one vector per row.
    similarity_score_threshold : float, default 0.5
        Minimum score (inclusive) a row needs to be kept.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        At most ``top_n`` rows of ``data`` (original index labels kept),
        best match first, without the temporary score column.
    """
    user_embedding = get_embeddings(user_prompt)
    scores = data['embedding'].apply(
        lambda course_embedding: get_similarity_score(user_embedding, course_embedding)
    )

    # assign() works on a copy, so the temporary score column never touches the
    # caller's frame and no in-place drop on a filtered slice is needed.
    ranked = (
        data.assign(similarity_score=scores)
        .loc[lambda frame: frame['similarity_score'] >= similarity_score_threshold]
        .sort_values(by='similarity_score', ascending=False)
        .drop(columns='similarity_score')
    )
    return ranked.head(top_n)

In [53]:
## get the recommendations for a free-text query (default threshold 0.5, top 5 results)
course = 'Natural language processing'
get_recommendation(course, data)

Unnamed: 0,course_name,university,difficulty_level,course_rating,course_url,description,skills,embedding
1897,Natural Language Processing,National Research University Higher School of ...,Intermediate,4.0,https://www.coursera.org/learn/language-proces...,This course covers a wide range of tasks in Na...,language modeling n-gram named-entity recog...,"[-0.014771533198654652, 0.01521559339016676, 0..."
1866,Clinical Natural Language Processing,University of Colorado System,Advanced,2.4,https://www.coursera.org/learn/clinical-natura...,This course teaches you the fundamentals of cl...,linguistics corpora Natural Language Process...,"[-0.010762782767415047, 0.033246081322431564, ..."
2115,Natural Language Processing in TensorFlow,DeepLearning.AI,Beginner,4.5,https://www.coursera.org/learn/natural-languag...,If you are a software developer who wants to b...,Natural Language Processing natural language ...,"[-0.019726021215319633, 0.005436666309833527, ..."
1457,Natural Language Processing with Attention Models,DeepLearning.AI,Beginner,3.4,https://www.coursera.org/learn/attention-model...,In Course 4 of the Natural Language Processing...,speech synthesis language natural language ...,"[-0.029537010937929153, 0.031132875010371208, ..."
354,Natural Language Processing with Probabilistic...,DeepLearning.AI,Beginner,4.7,https://www.coursera.org/learn/probabilistic-m...,In Course 2 of the Natural Language Processing...,natural language Part-Of-Speech Tagging lang...,"[-0.007217774633318186, 0.0032080893870443106,..."
