In [1]:
import tiktoken
import pandas as pd
import openai
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# To get the tokeniser corresponding to a specific model in the OpenAI API:
enc = tiktoken.encoding_for_model("gpt-4")

#set up OpenAI API credentials
openai.api_key = "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"

In [2]:
user_dataset = pd.read_csv("data/userData.csv", encoding= 'unicode_escape')
course_dataset = pd.read_csv("data/courseData.csv", encoding= 'unicode_escape')
ratings_df = pd.read_csv("data/ratingData.csv", encoding= 'unicode_escape')

In [3]:
#for user
#creating a new dataset dataset using the degree1, degree1specializations and key skills, career objective and the userid
llm_user_dataset = pd.DataFrame()
llm_user_dataset['userid'] = user_dataset['userid']
llm_user_dataset['degree_1'] = user_dataset['degree_1']
llm_user_dataset['degree_1_specializations'] = user_dataset['degree_1_specializations']
llm_user_dataset['career_objective'] = user_dataset['career_objective']
llm_user_dataset['key_skills_str'] = user_dataset['key_skills_str']

#store the career objective and key skills in description
llm_user_dataset['description'] = llm_user_dataset['degree_1'] + llm_user_dataset['degree_1_specializations'] 
#llm_user_dataset.head(5)

In [4]:
#for course
#creating a new dataset dataset using the degree1, degree1specializations and key skills and the course id
llm_course_dataset = pd.DataFrame()
llm_course_dataset['sr_'] = course_dataset['sr_']
llm_course_dataset['campus'] = course_dataset['campus'] #campus is added so we dont have to add it later on
llm_course_dataset['degree_1'] = course_dataset['degree_1']
llm_course_dataset['degree_1_specializations'] = course_dataset['degree_1_specializations']
llm_course_dataset['key_skills_str'] = course_dataset['key_skills_str']

#store the career objective and key skills in description
llm_course_dataset['description'] = llm_course_dataset['degree_1'] + llm_course_dataset['degree_1_specializations'] 

In [5]:
#put them in a combined dataframe
comb = pd.DataFrame()
comb['description'] = llm_user_dataset['description'] + llm_course_dataset['description']

In [6]:
llm_user_dataset['description'][0]

'B.E.Computer Science & Engineering'

In [7]:
user = "1200"
llm_user_dataset["description"].loc[llm_user_dataset["userid"] == user].values[0]

'B.E.Computer Science & Engineering'

In [8]:
#load and preprocess data
data = llm_course_dataset
#data = data.dropna()
corpus = data["description"].tolist()

#extract features using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

#compute pairwise similarities using cosine similarity
similarity_matrix = cosine_similarity(X)

#generate recommendations based on user input
def llm_recommender(user):
    course_recommendations = []
    
    user_input = llm_user_dataset["description"].loc[llm_user_dataset["userid"] == user].values[0] 
    
    user_index = corpus.index(user_input)
    recommendations = similarity_matrix[user_index].argsort()[:-6:-1]
    
    
    #Print top 5 recommendations
    print("Top 5 Recommendations:")
    for i, index in enumerate(recommendations):
        print(f"{i+1}. {data.loc[index]['sr_']}: {data.loc[index]['description']}")
        course_recommendations.append(data.loc[index]['sr_'])
    
    #return the course ids
    return course_recommendations

In [9]:
llm_recs1 = llm_recommender("1001")

Top 5 Recommendations:
1. 7408: B.E.Computer Science & Engineering
2. 2954: B.E.Computer Science & Engineering
3. 8949: B.E.Computer Science & Engineering
4. 8948: B.E.Computer Science & Engineering
5. 8947: B.E.Computer Science & Engineering


In [10]:
llm_recs1

[7408, 2954, 8949, 8948, 8947]

###  For Evaluation

In [11]:
#load and preprocess data
data = llm_course_dataset
data = data.dropna()
corpus = data["description"].tolist()
X_train, X_test = train_test_split(data, test_size=0.2)

#extract features using TF-IDF
vectorizer = TfidfVectorizer()
X_train_features = vectorizer.fit_transform(X_train["description"].tolist())
X_test_features = vectorizer.transform(X_test["description"].tolist())

#compute pairwise similarities using cosine similarity
similarity_matrix = cosine_similarity(X_test_features, X_train_features)

#generate recommendations for each test user
recommendations = []

llm_user_dataset = llm_user_dataset.dropna()

for i in range(len(llm_user_dataset)):
    user_input = llm_user_dataset.iloc[i]["description"]
    user_index = corpus.index(user_input)
    recommended_items = similarity_matrix[i].argsort()[:-6:-1]
    recommendations.append(recommended_items)

#print(recs)

#Compute precision and recall for degree
relevant_items = []
for i in range(len(llm_user_dataset)):
    relevant_items.append(set(X_train.loc[X_train["degree_1"] == X_test.iloc[i]["degree_1"]]["sr_"].tolist()))
recommended_items = [set(X_train.iloc[rec]["sr_"].tolist()) for rec in recommendations]
true_positives = [len(r & a) for r, a in zip(recommended_items, relevant_items)]
true_negatives = [len(a & r) for r, a in zip(recommended_items, relevant_items)]
false_positives = [len(r - a) for r, a in zip(recommended_items, relevant_items)]
false_negatives = [len(a - r) for r, a in zip(recommended_items, relevant_items)]
accuracy = (sum(true_positives) + sum(true_negatives)) / (sum(true_positives) + sum(true_negatives) + sum(false_negatives) + sum(false_positives))
precision = sum(true_positives) / (sum(true_positives) + sum(false_positives))
recall = sum(true_positives) / (sum(true_positives) + sum(false_negatives))
f1_score = 2 * precision * recall / (precision + recall)

#Print evaluation metrics
print("Evaluation for Degree: ")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1_score:.3f}")

#Compute precision and recall for degree spec
relevant_items = []
for i in range(len(llm_user_dataset)):
    relevant_items.append(set(X_train.loc[X_train["degree_1_specializations"] == X_test.iloc[i]["degree_1_specializations"]]["sr_"].tolist()))
recommended_items = [set(X_train.iloc[rec]["sr_"].tolist()) for rec in recommendations]
true_positives = [len(r & a) for r, a in zip(recommended_items, relevant_items)]
true_negatives = [len(a & r) for r, a in zip(recommended_items, relevant_items)]
false_positives = [len(r - a) for r, a in zip(recommended_items, relevant_items)]
false_negatives = [len(a - r) for r, a in zip(recommended_items, relevant_items)]
accuracy = (sum(true_positives) + sum(true_negatives)) / (sum(true_positives) + sum(true_negatives) + sum(false_negatives) + sum(false_positives))
precision = sum(true_positives) / (sum(true_positives) + sum(false_positives))
recall = sum(true_positives) / (sum(true_positives) + sum(false_negatives))
f1_score = 2 * precision * recall / (precision + recall)

#Print evaluation metrics
print()
print("Evaluation for Degree Specializations: ")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1_score:.3f}")

#Compute precision and recall for key skills
relevant_items = []
for i in range(len(llm_user_dataset)):
    relevant_items.append(set(X_train.loc[X_train["key_skills_str"] == X_test.iloc[i]["key_skills_str"]]["sr_"].tolist()))
recommended_items = [set(X_train.iloc[rec]["sr_"].tolist()) for rec in recommendations]
true_positives = [len(r & a) for r, a in zip(recommended_items, relevant_items)]
true_negatives = [len(a & r) for r, a in zip(recommended_items, relevant_items)]
false_positives = [len(r - a) for r, a in zip(recommended_items, relevant_items)]
false_negatives = [len(a - r) for r, a in zip(recommended_items, relevant_items)]
accuracy = (sum(true_positives) + sum(true_negatives)) / (sum(true_positives) + sum(true_negatives) + sum(false_negatives) + sum(false_positives))
precision = sum(true_positives) / (sum(true_positives) + sum(false_positives))
recall = sum(true_positives) / (sum(true_positives) + sum(false_negatives))
f1_score = 2 * precision * recall / (precision + recall)

#Print evaluation metrics
print()
print("Evaluation for Key Skills: ")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1_score:.3f}")

Evaluation for Degree: 
Accuracy: 0.001
Precision: 1.000
Recall: 0.001
F1 Score: 0.001

Evaluation for Degree Specializations: 
Accuracy: 0.004
Precision: 1.000
Recall: 0.002
F1 Score: 0.004

Evaluation for Key Skills: 
Accuracy: 0.005
Precision: 0.325
Recall: 0.003
F1 Score: 0.005


In [12]:
recommendations

[array([ 642, 6069, 6080, 3645, 6078], dtype=int64),
 array([ 642, 6069, 6080, 3645, 6078], dtype=int64),
 array([3999, 5298, 5294, 1051, 5288], dtype=int64),
 array([3999, 5298, 5294, 1051, 5288], dtype=int64),
 array([ 642, 6069, 6080, 3645, 6078], dtype=int64),
 array([ 642, 6069, 6080, 3645, 6078], dtype=int64),
 array([7999, 3866, 3849, 3850, 3853], dtype=int64),
 array([7999, 3866, 3849, 3850, 3853], dtype=int64),
 array([7999, 3866, 3849, 3850, 3853], dtype=int64),
 array([7999, 3866, 3849, 3850, 3853], dtype=int64),
 array([3383, 2668,  628, 2664, 5859], dtype=int64),
 array([ 642, 6069, 6080, 3645, 6078], dtype=int64),
 array([ 642, 6069, 6080, 3645, 6078], dtype=int64),
 array([3383, 2668,  628, 2664, 5859], dtype=int64),
 array([7999, 3866, 3849, 3850, 3853], dtype=int64),
 array([3383, 2668,  628, 2664, 5859], dtype=int64),
 array([3033, 5188, 1710, 6032, 2652], dtype=int64),
 array([3383, 2668,  628, 2664, 5859], dtype=int64),
 array([ 642, 6069, 6080, 3645, 6078], dtype=i