### The recommendation system I'm using focuses on content-based filtering. It looks at the titles of courses and suggests similar ones based on predefined rejection causes

<div style="background-color: transparent; color: black; padding: 10px; border: 2px solid #000; border-radius: 10px;">
    <h2 style="text-align: center;">The Concept: Content-Based Filtering</h2>
    <p>
        This recommendation system relies on analyzing the textual content (titles) of courses and comparing them to predefined rejection causes to suggest similar courses. It doesn't consider user behavior or preferences; instead, it focuses on the cause of rejection to generate recommendations.
    </p>
</div>


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
%matplotlib inline
pd.pandas.set_option('display.max_columns',None)

sns.set_style("whitegrid")
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OrdinalEncoder

from tqdm import tqdm
import sys
import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv('udemy_output_All_Finance__Accounting_p1_p626.csv')

In [3]:
data

Unnamed: 0,id,title,url,is_paid,num_subscribers,avg_rating,avg_rating_recent,rating,num_reviews,is_wishlisted,num_published_lectures,num_published_practice_tests,created,published_time,discount_price__amount,discount_price__currency,discount_price__price_string,price_detail__amount,price_detail__currency,price_detail__price_string
0,762616,The Complete SQL Bootcamp 2020: Go from Zero t...,/course/the-complete-sql-bootcamp/,True,295509,4.66019,4.67874,4.67874,78006,False,84,0,2016-02-14T22:57:48Z,2016-04-06T05:16:11Z,455.0,INR,₹455,8640.0,INR,"₹8,640"
1,937678,Tableau 2020 A-Z: Hands-On Tableau Training fo...,/course/tableau10/,True,209070,4.58956,4.60015,4.60015,54581,False,78,0,2016-08-22T12:10:18Z,2016-08-23T16:59:49Z,455.0,INR,₹455,8640.0,INR,"₹8,640"
2,1361790,PMP Exam Prep Seminar - PMBOK Guide 6,/course/pmp-pmbok6-35-pdus/,True,155282,4.59491,4.59326,4.59326,52653,False,292,2,2017-09-26T16:32:48Z,2017-11-14T23:58:14Z,455.0,INR,₹455,8640.0,INR,"₹8,640"
3,648826,The Complete Financial Analyst Course 2020,/course/the-complete-financial-analyst-course/,True,245860,4.54407,4.53772,4.53772,46447,False,338,0,2015-10-23T13:34:35Z,2016-01-21T01:38:48Z,455.0,INR,₹455,8640.0,INR,"₹8,640"
4,637930,An Entire MBA in 1 Course:Award Winning Busine...,/course/an-entire-mba-in-1-courseaward-winning...,True,374836,4.47080,4.47173,4.47173,41630,False,83,0,2015-10-12T06:39:46Z,2016-01-11T21:39:33Z,455.0,INR,₹455,8640.0,INR,"₹8,640"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13603,3005408,מושגים בסיסיים באופציות חלק ב,/course/1-ndgzhu/,False,47,0.00000,0.00000,0.00000,0,False,6,0,2020-04-15T10:20:17Z,2020-04-23T02:06:55Z,,,,,,
13604,3467284,Programa de Integridade,/course/iaudit_programa_de_integridade/,False,19,0.00000,0.00000,0.00000,0,False,5,0,2020-09-01T17:57:15Z,2020-09-03T15:26:47Z,,,,,,
13605,3368222,Goodwill and Method of Calculating Goodwill (A...,/course/goodwill-and-method-of-calculating-goo...,False,47,0.00000,0.00000,0.00000,0,False,5,0,2020-07-27T02:29:18Z,2020-07-31T12:14:40Z,,,,,,
13606,3211345,Poderoso Investidor,/course/poderoso-investidor/,False,48,0.00000,0.00000,0.00000,0,False,13,0,2020-06-07T01:29:53Z,2020-07-15T10:35:38Z,,,,,,


In [4]:
import pandas as pd

columns_to_delete = ['avg_rating', 'avg_rating_recent', 'is_wishlisted', 'num_published_practice_tests', 
                     'created', 'published_time', 'discount_price__currency', 'discount_price__price_string', 
                     'price_detail__currency', 'price_detail__price_string','id']

data = data.drop(columns=columns_to_delete)


In [5]:
data

Unnamed: 0,title,url,is_paid,num_subscribers,rating,num_reviews,num_published_lectures,discount_price__amount,price_detail__amount
0,The Complete SQL Bootcamp 2020: Go from Zero t...,/course/the-complete-sql-bootcamp/,True,295509,4.67874,78006,84,455.0,8640.0
1,Tableau 2020 A-Z: Hands-On Tableau Training fo...,/course/tableau10/,True,209070,4.60015,54581,78,455.0,8640.0
2,PMP Exam Prep Seminar - PMBOK Guide 6,/course/pmp-pmbok6-35-pdus/,True,155282,4.59326,52653,292,455.0,8640.0
3,The Complete Financial Analyst Course 2020,/course/the-complete-financial-analyst-course/,True,245860,4.53772,46447,338,455.0,8640.0
4,An Entire MBA in 1 Course:Award Winning Busine...,/course/an-entire-mba-in-1-courseaward-winning...,True,374836,4.47173,41630,83,455.0,8640.0
...,...,...,...,...,...,...,...,...,...
13603,מושגים בסיסיים באופציות חלק ב,/course/1-ndgzhu/,False,47,0.00000,0,6,,
13604,Programa de Integridade,/course/iaudit_programa_de_integridade/,False,19,0.00000,0,5,,
13605,Goodwill and Method of Calculating Goodwill (A...,/course/goodwill-and-method-of-calculating-goo...,False,47,0.00000,0,5,,
13606,Poderoso Investidor,/course/poderoso-investidor/,False,48,0.00000,0,13,,


In [6]:
data.isnull().sum()

title                        0
url                          0
is_paid                      0
num_subscribers              0
rating                       0
num_reviews                  0
num_published_lectures       0
discount_price__amount    1403
price_detail__amount       497
dtype: int64

In [7]:
import pandas as pd
from sklearn.impute import KNNImputer
features_for_imputation = ['discount_price__amount', 'price_detail__amount']
imputer = KNNImputer(n_neighbors=5) 
data[features_for_imputation] = imputer.fit_transform(data[features_for_imputation])
missing_values = data.isnull().sum()
print("Remaining missing values:")
print(missing_values[missing_values > 0])


Remaining missing values:
Series([], dtype: int64)


In [8]:
num_duplicates = data.duplicated().sum()
print("Number of duplicates:", num_duplicates)


Number of duplicates: 0


## Encoding

In [9]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
data['is_paid_encoded'] = label_encoder.fit_transform(data['is_paid'])
for label, encoding in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)):
    print(f"{label}: {encoding}")



False: 0
True: 1


In [10]:
data=data.drop(['is_paid'],axis=1)

In [11]:
data

Unnamed: 0,title,url,num_subscribers,rating,num_reviews,num_published_lectures,discount_price__amount,price_detail__amount,is_paid_encoded
0,The Complete SQL Bootcamp 2020: Go from Zero t...,/course/the-complete-sql-bootcamp/,295509,4.67874,78006,84,455.000000,8640.000000,1
1,Tableau 2020 A-Z: Hands-On Tableau Training fo...,/course/tableau10/,209070,4.60015,54581,78,455.000000,8640.000000,1
2,PMP Exam Prep Seminar - PMBOK Guide 6,/course/pmp-pmbok6-35-pdus/,155282,4.59326,52653,292,455.000000,8640.000000,1
3,The Complete Financial Analyst Course 2020,/course/the-complete-financial-analyst-course/,245860,4.53772,46447,338,455.000000,8640.000000,1
4,An Entire MBA in 1 Course:Award Winning Busine...,/course/an-entire-mba-in-1-courseaward-winning...,374836,4.47173,41630,83,455.000000,8640.000000,1
...,...,...,...,...,...,...,...,...,...
13603,מושגים בסיסיים באופציות חלק ב,/course/1-ndgzhu/,47,0.00000,0,6,493.943794,4646.992602,0
13604,Programa de Integridade,/course/iaudit_programa_de_integridade/,19,0.00000,0,5,493.943794,4646.992602,0
13605,Goodwill and Method of Calculating Goodwill (A...,/course/goodwill-and-method-of-calculating-goo...,47,0.00000,0,5,493.943794,4646.992602,0
13606,Poderoso Investidor,/course/poderoso-investidor/,48,0.00000,0,13,493.943794,4646.992602,0


## Tokenization 

In [13]:
import nltk
from nltk.tokenize import word_tokenize
titles = data['title']
tokenized_titles = []
for title in titles:
    tokens = word_tokenize(title)
    tokenized_titles.append(tokens)
data['tokenized_title'] = tokenized_titles
data 

Unnamed: 0,title,url,num_subscribers,rating,num_reviews,num_published_lectures,discount_price__amount,price_detail__amount,is_paid_encoded,tokenized_title
0,The Complete SQL Bootcamp 2020: Go from Zero t...,/course/the-complete-sql-bootcamp/,295509,4.67874,78006,84,455.000000,8640.000000,1,"[The, Complete, SQL, Bootcamp, 2020, :, Go, fr..."
1,Tableau 2020 A-Z: Hands-On Tableau Training fo...,/course/tableau10/,209070,4.60015,54581,78,455.000000,8640.000000,1,"[Tableau, 2020, A-Z, :, Hands-On, Tableau, Tra..."
2,PMP Exam Prep Seminar - PMBOK Guide 6,/course/pmp-pmbok6-35-pdus/,155282,4.59326,52653,292,455.000000,8640.000000,1,"[PMP, Exam, Prep, Seminar, -, PMBOK, Guide, 6]"
3,The Complete Financial Analyst Course 2020,/course/the-complete-financial-analyst-course/,245860,4.53772,46447,338,455.000000,8640.000000,1,"[The, Complete, Financial, Analyst, Course, 2020]"
4,An Entire MBA in 1 Course:Award Winning Busine...,/course/an-entire-mba-in-1-courseaward-winning...,374836,4.47173,41630,83,455.000000,8640.000000,1,"[An, Entire, MBA, in, 1, Course, :, Award, Win..."
...,...,...,...,...,...,...,...,...,...,...
13603,מושגים בסיסיים באופציות חלק ב,/course/1-ndgzhu/,47,0.00000,0,6,493.943794,4646.992602,0,"[מושגים, בסיסיים, באופציות, חלק, ב]"
13604,Programa de Integridade,/course/iaudit_programa_de_integridade/,19,0.00000,0,5,493.943794,4646.992602,0,"[Programa, de, Integridade]"
13605,Goodwill and Method of Calculating Goodwill (A...,/course/goodwill-and-method-of-calculating-goo...,47,0.00000,0,5,493.943794,4646.992602,0,"[Goodwill, and, Method, of, Calculating, Goodw..."
13606,Poderoso Investidor,/course/poderoso-investidor/,48,0.00000,0,13,493.943794,4646.992602,0,"[Poderoso, Investidor]"


In [14]:
data

Unnamed: 0,title,url,num_subscribers,rating,num_reviews,num_published_lectures,discount_price__amount,price_detail__amount,is_paid_encoded,tokenized_title
0,The Complete SQL Bootcamp 2020: Go from Zero t...,/course/the-complete-sql-bootcamp/,295509,4.67874,78006,84,455.000000,8640.000000,1,"[The, Complete, SQL, Bootcamp, 2020, :, Go, fr..."
1,Tableau 2020 A-Z: Hands-On Tableau Training fo...,/course/tableau10/,209070,4.60015,54581,78,455.000000,8640.000000,1,"[Tableau, 2020, A-Z, :, Hands-On, Tableau, Tra..."
2,PMP Exam Prep Seminar - PMBOK Guide 6,/course/pmp-pmbok6-35-pdus/,155282,4.59326,52653,292,455.000000,8640.000000,1,"[PMP, Exam, Prep, Seminar, -, PMBOK, Guide, 6]"
3,The Complete Financial Analyst Course 2020,/course/the-complete-financial-analyst-course/,245860,4.53772,46447,338,455.000000,8640.000000,1,"[The, Complete, Financial, Analyst, Course, 2020]"
4,An Entire MBA in 1 Course:Award Winning Busine...,/course/an-entire-mba-in-1-courseaward-winning...,374836,4.47173,41630,83,455.000000,8640.000000,1,"[An, Entire, MBA, in, 1, Course, :, Award, Win..."
...,...,...,...,...,...,...,...,...,...,...
13603,מושגים בסיסיים באופציות חלק ב,/course/1-ndgzhu/,47,0.00000,0,6,493.943794,4646.992602,0,"[מושגים, בסיסיים, באופציות, חלק, ב]"
13604,Programa de Integridade,/course/iaudit_programa_de_integridade/,19,0.00000,0,5,493.943794,4646.992602,0,"[Programa, de, Integridade]"
13605,Goodwill and Method of Calculating Goodwill (A...,/course/goodwill-and-method-of-calculating-goo...,47,0.00000,0,5,493.943794,4646.992602,0,"[Goodwill, and, Method, of, Calculating, Goodw..."
13606,Poderoso Investidor,/course/poderoso-investidor/,48,0.00000,0,13,493.943794,4646.992602,0,"[Poderoso, Investidor]"


# Stemming 

In [15]:
import nltk
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
def apply_stemming(tokens):
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens
data['stemmed_title'] = data['tokenized_title'].apply(apply_stemming)
print(data)


                                                   title  \
0      The Complete SQL Bootcamp 2020: Go from Zero t...   
1      Tableau 2020 A-Z: Hands-On Tableau Training fo...   
2                 PMP Exam Prep Seminar -  PMBOK Guide 6   
3             The Complete Financial Analyst Course 2020   
4      An Entire MBA in 1 Course:Award Winning Busine...   
...                                                  ...   
13603                      מושגים בסיסיים באופציות חלק ב   
13604                            Programa de Integridade   
13605  Goodwill and Method of Calculating Goodwill (A...   
13606                                Poderoso Investidor   
13607         Acabou a Previdência e agora? -  Volume 03   

                                                     url  num_subscribers  \
0                     /course/the-complete-sql-bootcamp/           295509   
1                                     /course/tableau10/           209070   
2                            /course/pmp-pmbok6-

## Stop words 

In [16]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
def remove_stopwords(tokens):
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    return filtered_tokens
data['stopwords_removed_title'] = data['stemmed_title'].apply(remove_stopwords)
data


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,title,url,num_subscribers,rating,num_reviews,num_published_lectures,discount_price__amount,price_detail__amount,is_paid_encoded,tokenized_title,stemmed_title,stopwords_removed_title
0,The Complete SQL Bootcamp 2020: Go from Zero t...,/course/the-complete-sql-bootcamp/,295509,4.67874,78006,84,455.000000,8640.000000,1,"[The, Complete, SQL, Bootcamp, 2020, :, Go, fr...","[the, complet, sql, bootcamp, 2020, :, go, fro...","[complet, sql, bootcamp, 2020, :, go, zero, hero]"
1,Tableau 2020 A-Z: Hands-On Tableau Training fo...,/course/tableau10/,209070,4.60015,54581,78,455.000000,8640.000000,1,"[Tableau, 2020, A-Z, :, Hands-On, Tableau, Tra...","[tableau, 2020, a-z, :, hands-on, tableau, tra...","[tableau, 2020, a-z, :, hands-on, tableau, tra..."
2,PMP Exam Prep Seminar - PMBOK Guide 6,/course/pmp-pmbok6-35-pdus/,155282,4.59326,52653,292,455.000000,8640.000000,1,"[PMP, Exam, Prep, Seminar, -, PMBOK, Guide, 6]","[pmp, exam, prep, seminar, -, pmbok, guid, 6]","[pmp, exam, prep, seminar, -, pmbok, guid, 6]"
3,The Complete Financial Analyst Course 2020,/course/the-complete-financial-analyst-course/,245860,4.53772,46447,338,455.000000,8640.000000,1,"[The, Complete, Financial, Analyst, Course, 2020]","[the, complet, financi, analyst, cours, 2020]","[complet, financi, analyst, cours, 2020]"
4,An Entire MBA in 1 Course:Award Winning Busine...,/course/an-entire-mba-in-1-courseaward-winning...,374836,4.47173,41630,83,455.000000,8640.000000,1,"[An, Entire, MBA, in, 1, Course, :, Award, Win...","[an, entir, mba, in, 1, cours, :, award, win, ...","[entir, mba, 1, cours, :, award, win, busi, sc..."
...,...,...,...,...,...,...,...,...,...,...,...,...
13603,מושגים בסיסיים באופציות חלק ב,/course/1-ndgzhu/,47,0.00000,0,6,493.943794,4646.992602,0,"[מושגים, בסיסיים, באופציות, חלק, ב]","[מושגים, בסיסיים, באופציות, חלק, ב]","[מושגים, בסיסיים, באופציות, חלק, ב]"
13604,Programa de Integridade,/course/iaudit_programa_de_integridade/,19,0.00000,0,5,493.943794,4646.992602,0,"[Programa, de, Integridade]","[programa, de, integridad]","[programa, de, integridad]"
13605,Goodwill and Method of Calculating Goodwill (A...,/course/goodwill-and-method-of-calculating-goo...,47,0.00000,0,5,493.943794,4646.992602,0,"[Goodwill, and, Method, of, Calculating, Goodw...","[goodwil, and, method, of, calcul, goodwil, (,...","[goodwil, method, calcul, goodwil, (, account, )]"
13606,Poderoso Investidor,/course/poderoso-investidor/,48,0.00000,0,13,493.943794,4646.992602,0,"[Poderoso, Investidor]","[poderoso, investidor]","[poderoso, investidor]"


## POS tag 

In [17]:
import nltk
from nltk import pos_tag
pos_tagged_titles = []
def pos_tag_tokens(tokens):
    return pos_tag(tokens)  
for token_list in tokenized_titles:
    pos_tags = pos_tag_tokens(token_list)
    pos_tagged_titles.append(pos_tags)
data['stopwords_removed_title'] = pos_tagged_titles



In [18]:
data

Unnamed: 0,title,url,num_subscribers,rating,num_reviews,num_published_lectures,discount_price__amount,price_detail__amount,is_paid_encoded,tokenized_title,stemmed_title,stopwords_removed_title
0,The Complete SQL Bootcamp 2020: Go from Zero t...,/course/the-complete-sql-bootcamp/,295509,4.67874,78006,84,455.000000,8640.000000,1,"[The, Complete, SQL, Bootcamp, 2020, :, Go, fr...","[the, complet, sql, bootcamp, 2020, :, go, fro...","[(The, DT), (Complete, NNP), (SQL, NNP), (Boot..."
1,Tableau 2020 A-Z: Hands-On Tableau Training fo...,/course/tableau10/,209070,4.60015,54581,78,455.000000,8640.000000,1,"[Tableau, 2020, A-Z, :, Hands-On, Tableau, Tra...","[tableau, 2020, a-z, :, hands-on, tableau, tra...","[(Tableau, NN), (2020, CD), (A-Z, JJ), (:, :),..."
2,PMP Exam Prep Seminar - PMBOK Guide 6,/course/pmp-pmbok6-35-pdus/,155282,4.59326,52653,292,455.000000,8640.000000,1,"[PMP, Exam, Prep, Seminar, -, PMBOK, Guide, 6]","[pmp, exam, prep, seminar, -, pmbok, guid, 6]","[(PMP, NNP), (Exam, NNP), (Prep, NNP), (Semina..."
3,The Complete Financial Analyst Course 2020,/course/the-complete-financial-analyst-course/,245860,4.53772,46447,338,455.000000,8640.000000,1,"[The, Complete, Financial, Analyst, Course, 2020]","[the, complet, financi, analyst, cours, 2020]","[(The, DT), (Complete, NNP), (Financial, NNP),..."
4,An Entire MBA in 1 Course:Award Winning Busine...,/course/an-entire-mba-in-1-courseaward-winning...,374836,4.47173,41630,83,455.000000,8640.000000,1,"[An, Entire, MBA, in, 1, Course, :, Award, Win...","[an, entir, mba, in, 1, cours, :, award, win, ...","[(An, DT), (Entire, NNP), (MBA, NNP), (in, IN)..."
...,...,...,...,...,...,...,...,...,...,...,...,...
13603,מושגים בסיסיים באופציות חלק ב,/course/1-ndgzhu/,47,0.00000,0,6,493.943794,4646.992602,0,"[מושגים, בסיסיים, באופציות, חלק, ב]","[מושגים, בסיסיים, באופציות, חלק, ב]","[(מושגים, JJ), (בסיסיים, NNP), (באופציות, NNP)..."
13604,Programa de Integridade,/course/iaudit_programa_de_integridade/,19,0.00000,0,5,493.943794,4646.992602,0,"[Programa, de, Integridade]","[programa, de, integridad]","[(Programa, NNP), (de, FW), (Integridade, NNP)]"
13605,Goodwill and Method of Calculating Goodwill (A...,/course/goodwill-and-method-of-calculating-goo...,47,0.00000,0,5,493.943794,4646.992602,0,"[Goodwill, and, Method, of, Calculating, Goodw...","[goodwil, and, method, of, calcul, goodwil, (,...","[(Goodwill, NNP), (and, CC), (Method, NNP), (o..."
13606,Poderoso Investidor,/course/poderoso-investidor/,48,0.00000,0,13,493.943794,4646.992602,0,"[Poderoso, Investidor]","[poderoso, investidor]","[(Poderoso, NNP), (Investidor, NNP)]"


 ## SBERT bi-encoder

In [19]:
pip install sentence-transformers


Note: you may need to restart the kernel to use updated packages.


In [20]:
pip install --upgrade transformers


Note: you may need to restart the kernel to use updated packages.


In [21]:
pip show transformers


Name: transformers
Version: 4.40.2
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: C:\Users\user\anaconda3\Lib\site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: sentence-transformers
Note: you may need to restart the kernel to use updated packages.


In [22]:
pip install gensim


Note: you may need to restart the kernel to use updated packages.


In [23]:
title_df = data[['title']]
title_df

Unnamed: 0,title
0,The Complete SQL Bootcamp 2020: Go from Zero t...
1,Tableau 2020 A-Z: Hands-On Tableau Training fo...
2,PMP Exam Prep Seminar - PMBOK Guide 6
3,The Complete Financial Analyst Course 2020
4,An Entire MBA in 1 Course:Award Winning Busine...
...,...
13603,מושגים בסיסיים באופציות חלק ב
13604,Programa de Integridade
13605,Goodwill and Method of Calculating Goodwill (A...
13606,Poderoso Investidor


<div style="background-color: transparent; color: black; padding: 10px; border: 2px solid #000; border-radius: 10px;">
    <h2 style="text-align: center;">Recommendation Process Overview</h2>
    <ul>
        <li><strong>Text Vectorization:</strong> It uses TF-IDF (Term Frequency-Inverse Document Frequency) to convert the text data into numerical vectors, representing each course title.</li>
        <li><strong>Semantic Encoding:</strong> It utilizes SBERT (Sentence-BERT) to encode the course titles into dense numerical embeddings, capturing their semantic meanings.</li>
        <li><strong>Rejection Causes:</strong> It defines a list of rejection causes, which serve as queries for finding similar courses.</li>
        <li><strong>Recommendation Loop:</strong> It iterates over each rejection cause. For each cause:
            <ul>
                <li>It calculates the cosine similarity between the TF-IDF vectors of the cause and all course titles, identifying the most similar courses based on TF-IDF.</li>
                <li>It calculates the cosine similarity between the SBERT embeddings of the cause and all course titles, identifying the most similar courses based on SBERT embeddings.</li>
                <li>It prints the top 5 recommended courses for each cause based on both TF-IDF and SBERT methods.</li>
            </ul>
        </li>
    </ul>
</div>


<img src="pre-trained-SBERT-Bi-Encoder-model.png" >


In [25]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer

rejection_causes = [
    "Ratio de bénéfice exploitation",
    "credit_decision",
    "Trésorerie nette par action",
    "Stockholders_equity_to_fixed_assets_ratio",
    "Current_debt_ratio",
    "Return_On_Equity_ROE",
    "Number of Employees",
    "Chiffre d'affaires 2019",
    "Evolution 2018-2019",
    "Fonds propres 2019",
    "Revenue(US$ billions)",
    "ISIC_code",
    "Pays_Encodé_WoE",
    "Revenue(US$)"
]

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(title_df['title'])

# SBERT Bi-Encoder Model
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')  # Load the SBERT model
sentence_embeddings = sbert_model.encode(title_df['title'])

for cause in rejection_causes:
    print(f"\nTop 5 courses recommended for the cause of rejection: {cause}\n")
    
    # Calculate cosine similarity between the query vector and all course vectors
    query_title = cause
    query_vector = tfidf_vectorizer.transform([query_title])
    cosine_similarities = np.dot(tfidf_matrix, query_vector.T).toarray().ravel()

    top_indices_tfidf = cosine_similarities.argsort()[-5:][::-1]
    print("Top courses recommended by TF-IDF:")
    for idx in top_indices_tfidf:
        print(title_df.iloc[idx]['title'])

    # Calculate cosine similarity between the query embedding and all course embeddings
    query_embedding = sbert_model.encode([query_title])
    cosine_similarities_sbert = np.dot(sentence_embeddings, query_embedding.T).ravel()

    top_indices_sbert = cosine_similarities_sbert.argsort()[-5:][::-1]
    print("\nTop courses recommended by SBERT Bi-Encoder:")
    for idx in top_indices_sbert:
        print(title_df.iloc[idx]['title'])



Top 5 courses recommended for the cause of rejection: Ratio de bénéfice exploitation

Top courses recommended by TF-IDF:
Introduction to Financial Ratio Analysis
Ratio Analysis for Financial Statements
Investing : Valuation Ratio analysis for profit
The Complete Masterclass: Financial Statement Ratio Analysis
Financial Statement Analysis – Ratio Analysis

Top courses recommended by SBERT Bi-Encoder:
Dividendenstrategie: von steigenden Dividenden profitieren
Financial analysis using ratios
Ratio Analysis for Finance & Operational Business decisions
Analysis of Accounting Ratios
Price Action para Opções Binárias e Gatilho de entrada

Top 5 courses recommended for the cause of rejection: credit_decision

Top courses recommended by TF-IDF:
Acabou a Previdência e agora? -  Volume 03
Reducing Environmental Impacts with W.A.S.T.E. Walks
Forex Trading: A Simple Unique Approach For Massive Gains
Freelance Mindset - Become an Unstoppable Freelance Force!
Debit Spread For Half The Cost - Options

In [26]:
title_df

Unnamed: 0,title
0,The Complete SQL Bootcamp 2020: Go from Zero t...
1,Tableau 2020 A-Z: Hands-On Tableau Training fo...
2,PMP Exam Prep Seminar - PMBOK Guide 6
3,The Complete Financial Analyst Course 2020
4,An Entire MBA in 1 Course:Award Winning Busine...
...,...
13603,מושגים בסיסיים באופציות חלק ב
13604,Programa de Integridade
13605,Goodwill and Method of Calculating Goodwill (A...
13606,Poderoso Investidor


## Testing

In [38]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer

rejection_causes = [
    "finance",
]


sbert_model = SentenceTransformer('all-MiniLM-L6-v2') 
sentence_embeddings = sbert_model.encode(title_df['title'])

for cause in rejection_causes:
    print(f"\nTop 5 courses recommended for the cause of rejection: {cause}\n")
    
    query_title = cause
    cosine_similarities = np.dot(tfidf_matrix, query_vector.T).toarray().ravel()

    query_embedding = sbert_model.encode([query_title])
    cosine_similarities_sbert = np.dot(sentence_embeddings, query_embedding.T).ravel()

    top_indices_sbert = cosine_similarities_sbert.argsort()[-5:][::-1]
    print("\nTop courses recommended by SBERT Bi-Encoder:")
    for idx in top_indices_sbert:
        print(title_df.iloc[idx]['title'])



Top 5 courses recommended for the cause of rejection: finance


Top courses recommended by SBERT Bi-Encoder:
Corporate Finance
Personal Finance
Practical Finance
Fundamentals of Finance
Personal Finance 101


## Deploiement

In [39]:
import joblib

file_path = 'C:/deploiement/credit-scoring-website/deploiementbfifinal/credit-scoring-website/Backend/SentenceTransformer.pkl'
joblib.dump(SentenceTransformer, file_path)



['C:/deploiement/credit-scoring-website/deploiementbfifinal/credit-scoring-website/Backend/SentenceTransformer.pkl']

In [31]:
import joblib

file_path = 'C:/deploiement/credit-scoring-website/deploiementbfifinal/credit-scoring-website/Backend/top_indices_sbert.pkl'
joblib.dump(top_indices_sbert, file_path)

['C:/deploiement/credit-scoring-website/deploiementbfifinal/credit-scoring-website/Backend/top_indices_sbert.pkl']

In [36]:
import joblib
file_path2= 'C:/deploiement/credit-scoring-website/deploiementbfifinal/credit-scoring-website/Backend/cosine_similarities_sbert.pkl'
joblib.dump(cosine_similarities_sbert, file_path2)

['C:/deploiement/credit-scoring-website/deploiementbfifinal/credit-scoring-website/Backend/cosine_similarities_sbert.pkl']