In [6]:
def preprocess(text):
    """
    Normalise free text for matching.

    The steps run in this order: lowercase, delete every character in
    ``string.punctuation``, delete runs of digits, then collapse each run of
    whitespace into a single space and trim both ends.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The normalised text; may be empty when the input held only
        punctuation, digits and/or whitespace.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)

    lowered = text.lower()
    without_punctuation = lowered.translate(punctuation_table)
    without_digits = re.sub(r'\d+', '', without_punctuation)

    # Removing punctuation/digits can leave gaps, so whitespace is squeezed last.
    return re.sub(r'\s+', ' ', without_digits).strip()

In [7]:
def get_relevant_courses(prompt, df):
    """
    Return the rows of ``df`` that mention the user's prompt.

    A row is kept when the lowercased prompt occurs as a literal substring of
    the lowercased 'skill name', 'topic' or 'description' value.

    Parameters
    ----------
    prompt : str
        Text to look for. It is matched literally, so characters such as
        ``+``, ``(`` or ``*`` are not interpreted as regular-expression syntax.
    df : pandas.DataFrame
        Must contain the string columns 'skill name', 'topic' and
        'description'.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df`` (possibly empty), in their original order.

    Notes
    -----
    An empty prompt is a substring of every string, so it matches every row
    that has at least one non-missing searched value.
    """
    needle = prompt.lower()

    def _column_mentions(column):
        # regex=False: treat the prompt as plain text, never as a pattern.
        # na=False: a missing cell counts as "no match" instead of putting NaN
        # into the mask, which would not be a valid boolean mask for selection.
        return df[column].str.lower().str.contains(needle, regex=False, na=False)

    mask = (
        _column_mentions('skill name')
        | _column_mentions('topic')
        | _column_mentions('description')
    )
    return df[mask]

In [8]:
# Pre-trained 'roberta-base' checkpoint: the tokenizer turns text into model
# inputs, the model produces the hidden states used for embeddings.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

# Function to calculate RoBERTa embeddings
def calculate_roberta_embedding(text):
    """
    Embed ``text`` by mean-pooling the model's last hidden state.

    The text is tokenized (with truncation and padding) using the
    module-level ``tokenizer``, passed through the module-level ``model``,
    and the last hidden state is averaged over axis 1.

    Parameters
    ----------
    text : str
        Text to embed. NOTE(review): if a list of strings is passed, only
        the embedding of the first item is returned (``embeddings[0]``).

    Returns
    -------
    numpy.ndarray
        1-D embedding vector; presumably of length 768 for 'roberta-base',
        which is what the caller's ``embedding_*_{0..767}`` columns expect.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)

    # Inference only: skip autograd bookkeeping instead of building a graph
    # and detaching from it afterwards. The computed values are unchanged.
    with torch.no_grad():
        outputs = model(**inputs)

    embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
    return embeddings[0]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [13]:
# Every raw prompt submitted so far, in submission order
input_history = []


def collect_messages(_):
    """
    Handle one click of the chat button and return the updated conversation.

    Reads the current text from the ``inp`` widget, records it in
    ``input_history``, clears the widget, builds a reply, appends the
    user/assistant rows to the module-level ``panels`` list and returns all
    rows wrapped in a ``pn.Column``.

    Reply selection, in order:
      1. the preprocessed input contains a thank-you phrase -> "you're welcome";
      2. the raw input is blank -> welcome message (no "User:" row is added);
      3. the input is non-blank but nothing is left after preprocessing
         (only punctuation/digits) -> "couldn't understand" message;
      4. the preprocessed input occurs in the course data -> direct matches;
      5. otherwise -> the 10 rows closest to the input's RoBERTa embedding.

    Parameters
    ----------
    _ : Any
        Value supplied by ``pn.bind`` for the bound button; unused.

    Returns
    -------
    pn.Column
        Column holding every conversation row collected so far.
    """
    prompt = inp.value

    # Record the raw input, then clear the text box for the next message.
    # (append mutates the list in place, so no ``global`` statement is needed)
    input_history.append(prompt)
    inp.value = ''

    # Phrases that are considered as thanking
    thank_you_phrases = ("thank you", "thanks", "appreciate it", "grateful", "much obliged",
                         "cheers", "much appreciated", "thanks for the assistance")
    not_understood = "Sorry, I couldn't understand your input. Please try again."

    # preprocess() already lowercases, so the phrases can be compared directly
    user_input_processed = preprocess(prompt)

    if any(phrase in user_input_processed for phrase in thank_you_phrases):
        response = "You're welcome! I hope you find them useful in your learning journey!"

    # Initial check for input prompt
    elif not prompt.strip():
        response = "Welcome to IBM Skills Build!\nWhat would you like to learn more about?"

    # Input such as "???" or "123" preprocesses to ''. An empty string is a
    # substring of every value, so searching with it would return arbitrary
    # courses; ask the user to rephrase instead.
    elif not user_input_processed:
        response = not_understood

    else:
        # Check for direct matches in the dataset
        relevant_courses = get_relevant_courses(user_input_processed, df_final)

        if not relevant_courses.empty:
            response = generate_direct_matches_response(relevant_courses)
        else:
            user_embedding = calculate_roberta_embedding(user_input_processed)

            if np.isnan(user_embedding).any():
                response = not_understood
            else:
                # Sum, per row, the Euclidean distances between the user
                # embedding and each of the four stored embeddings.
                total_distance = sum(
                    cdist(
                        df_final[[f"embedding_{field}_{i}" for i in range(768)]],
                        [user_embedding],
                        metric='euclidean',
                    )
                    for field in ('topic', 'type', 'skill', 'description')
                )

                # Negate so an ascending argsort puts the closest rows last,
                # then take the last 10 and reverse: closest row first.
                top_indices = (-total_distance).argsort(axis=0)[-10:][::-1]

                response = generate_similarity_response(top_indices, df_final)

    # Append the "User" row only when there is a non-empty prompt
    if prompt.strip():
        panels.append(pn.Row('User:', pn.pane.Markdown(prompt, width=800)))

    panels.append(
        pn.Row('Assistant:', pn.pane.HTML(response, width=800, styles={'background-color': '#e8f5f8'})))

    return pn.Column(*panels)


def generate_similarity_response(top_indices, df_final, max_responses=10):
    """
    Build the HTML reply for input that has no direct match in the dataset.

    Parameters
    ----------
    top_indices : numpy.ndarray
        Row positions in ``df_final``, best match first. Any shape is
        accepted; the array is flattened. The caller produces these with
        ``argsort``, so they are positions and are looked up with ``.iloc``.
    df_final : pandas.DataFrame
        Must contain the columns 'topic', 'skill name', 'link', 'type',
        'description' and 'specific link'.
    max_responses : int, default 10
        Upper bound on the number of entries listed.

    Returns
    -------
    str
        HTML fragment: a heading followed by one numbered entry per row,
        each linking the topic to 'link' and the skill name to
        'specific link'.
    """
    ranked_positions = top_indices.flatten()
    # Clamp at zero so a negative limit lists nothing instead of slicing
    # from the wrong end.
    num_responses = max(0, min(max_responses, len(ranked_positions)))

    response = (
        f"Based on your interests, here are the top {num_responses} skills we think you'll find useful:"
        "<br>Click (opens in new tab) on the skill name or the course name "
        "for the relevant IBM Skills Build website!<br><br>"
    )

    for i, position in enumerate(ranked_positions[:num_responses], 1):
        # One positional lookup per entry instead of one per field
        row = df_final.iloc[position]
        topic = row['topic']
        skill_name = row['skill name']
        link = row['link']
        type_ = row['type']
        description = row['description']
        specific_link = row['specific link']

        response += f'{i}. <a href="{link}" target="_blank">{topic}</a> - <a href="{specific_link}" target="_blank">{skill_name}</a> ({type_}):'
        response += f'   {description}<br><br>'
    return response


def generate_direct_matches_response(relevant_courses, max_responses=10):
    """
    Build the HTML reply for courses that directly matched the user's input.

    Parameters
    ----------
    relevant_courses : pandas.DataFrame
        Matched rows; must contain the columns 'topic', 'skill name',
        'link', 'type', 'description' and 'specific link'.
    max_responses : int, default 10
        Upper bound on the number of courses listed; rows are taken in the
        frame's existing order.

    Returns
    -------
    str
        HTML fragment: a heading followed by one numbered entry per listed
        course, each linking the topic to 'link' and the skill name to
        'specific link'.
    """
    response = (
        "Based on your interests, here are some relevant courses we think you'll find useful:"
        "<br> Click (opens in new tab) on the skill name or the course name "
        "for the relevant IBM Skills Build website!<br><br>"
    )

    # Clamp at zero: head() with a negative count would drop rows from the
    # end rather than return nothing.
    num_responses = max(0, min(max_responses, len(relevant_courses)))

    for i, (_, course) in enumerate(relevant_courses.head(num_responses).iterrows(), 1):
        topic = course['topic']
        skill_name = course['skill name']
        link = course['link']
        type_ = course['type']
        description = course['description']
        specific_link = course['specific link']

        response += f"{i}. <a href='{link}' target='_blank'>{topic}</a> - <a href='{specific_link}' target='_blank'>{skill_name}</a> ({type_}):"
        response += f"   {description}<br><br>"
    return response



In [14]:
# Initialise the Panel front-end before any widget is built
pn.extension()

# Course table; collect_messages reads its text columns and its
# embedding_<field>_<i> columns (presumably precomputed RoBERTa vectors)
df_final = pd.read_csv('FINAL (processed) (roberta).csv')

# Conversation rows accumulated by collect_messages
panels = []

# Text box the user types into, and the button that submits it
inp = pn.widgets.TextInput(placeholder='Enter a skill...')
button_conversation = pn.widgets.Button(name="Chat!")

# Bind the handler to the button so it is re-evaluated through the button widget
interactive_conversation = pn.bind(collect_messages, button_conversation)

# Layout, top to bottom: input box, "Chat!" button, conversation area
dashboard = pn.Column(
    inp,
    pn.Row(button_conversation),
    pn.panel(interactive_conversation, loading_indicator=True, height=400),
)

dashboard

In [1]:
!pip install openai
!pip install plotly.express
!pip install plotly
!pip install seaborn
!pip install pandas
!pip install requests
!pip install nltk
!pip install spaCy 
!pip install spacy
!pip install scikit-learn
!python -m spacy download en_core_web_sm
!pip install transformers
!pip install torch
!pip install wordcloud

!pip install beautifulsoup4
!pip install selenium
!pip install bs4

Collecting plotly.express
  Downloading plotly_express-0.4.1-py2.py3-none-any.whl (2.9 kB)
Collecting plotly>=4.1.0
  Downloading plotly-5.15.0-py2.py3-none-any.whl (15.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.5/15.5 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting statsmodels>=0.9.0
  Downloading statsmodels-0.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hCollecting patsy>=0.5
  Downloading patsy-0.5.3-py2.py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.8/233.8 kB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
Collecting tenacity>=6.2.0
  Downloading tenacity-8.2.2-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, patsy, plotly, statsmodels, plotly.express
Successfully installed patsy-0.5.3 plotl

In [3]:
!pip install panel

Collecting panel
  Downloading panel-1.1.0-py2.py3-none-any.whl (20.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting param>=1.12.0
  Downloading param-1.13.0-py2.py3-none-any.whl (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.3/87.3 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyviz-comms>=0.7.4
  Downloading pyviz_comms-2.3.1-py2.py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting markdown
  Downloading Markdown-3.4.3-py3-none-any.whl (93 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.9/93.9 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
Collecting linkify-it-py
  Downloading linkify_it_py-2.0.2-py3-none-any.whl (19 kB)
Collecting markdown-it-py<3
  Downloading markdown_it_py-2.2.0-py3-none-any.whl (84 kB)


In [5]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cdist, euclidean
import torch
from transformers import BertModel, BertTokenizer, TFBertModel, RobertaModel, RobertaTokenizer
import string
import re
import os
import openai
import pandas as pd
import panel as pn  # GUI
pn.extension()

import pandas as pd
# import seaborn as sns
# import matplotlib
# import matplotlib.cm as cm
# import matplotlib.pyplot as plt
# import plotly as py
# import plotly.io as pio
# import plotly.express as px
# import plotly.figure_factory as ff
# from plotly.subplots import make_subplots
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# init_notebook_mode(connected=True)
# import plotly.graph_objects as go
# from matplotlib.patches import Patch
# from matplotlib.ticker import FormatStrFormatter
# matplotlib.style.use('seaborn')
# %matplotlib inline
# import json
# plt.style.use('seaborn')
# from wordcloud import WordCloud, ImageColorGenerator
# from PIL import Image
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# import time
# from selenium.common.exceptions import NoSuchElementException, TimeoutException
# from selenium.webdriver.common.action_chains import ActionChains

from IPython.display import display, clear_output, HTML, Markdown
# import ipywidgets as widgets
# from ipywidgets import HBox, VBox
import textwrap
# import gensim.downloader as api
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
from spacy.lang.en import English

# Fetch the NLTK data packages used by the tokenizer and stop-word imports above
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Small English spaCy pipeline
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
