In [None]:
pip install beautifulsoup4

In [None]:
import random
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Catalog listing URL; the paging query is added per request.
base_url = "https://www.shl.com/solutions/products/product-catalog/"

# Accumulator for the scraped rows (one dict per table row).
scrap_data = []

def extract_assessments_from_table(table_div, results=None):
    """Parse one catalog table wrapper and record one dict per data row.

    Args:
        table_div: Parsed element (e.g. a BeautifulSoup ``Tag``) expected to
            contain a ``<table>``.
        results: Optional list the row dicts are appended to. When omitted,
            the module-level ``scrap_data`` list is used, so existing
            single-argument calls behave as before.

    Returns:
        None. Rows are appended to ``results``; nothing is appended when the
        wrapper contains no ``<table>``.
    """
    if results is None:
        results = scrap_data

    table = table_div.find('table')
    if not table:
        return

    for row in table.find_all('tr')[1:]:  # first row is the header
        cols = row.find_all('td')
        if len(cols) < 4:
            continue  # not a complete data row

        job_solution_link = cols[0].find('a')
        job_solution_text = job_solution_link.text.strip() if job_solution_link else ""
        # .get() tolerates an anchor without an href instead of raising KeyError.
        job_solution_url = (job_solution_link.get('href') or "") if job_solution_link else ""

        remote_testing = "Yes" if cols[1].find('span', class_='catalogue__circle -yes') else "No"
        # NOTE(review): this column is detected by a literal check-mark in the
        # cell text, while the previous column is detected by a marker span —
        # confirm against the page markup that both checks match reality.
        adaptive_irt = "Yes" if cols[2].text.strip() == '✓' else "No"

        test_types = [
            span.text.strip()
            for span in cols[3].find_all('span', class_='product-catalogue__key')
        ]

        results.append({
            "Job Solution": job_solution_text,
            "Link": job_solution_url,
            "Remote Testing": remote_testing,
            "Adaptive/IRT": adaptive_irt,
            "Test Types": ", ".join(test_types)
        })

# Page through the catalog listing using start offsets 0, 12, ..., 132.
PAGE_SIZE = 12
PAGE_COUNT = 12
REQUEST_HEADERS = {"User-Agent": "Mozilla/5.0"}

for i in range(PAGE_COUNT):
    start_value = PAGE_SIZE * i
    try:
        # Let requests build the query string (start=<offset>&type=2).
        response = requests.get(
            base_url,
            params={"start": start_value, "type": 2},
            headers=REQUEST_HEADERS,
            timeout=10,  # never hang the notebook on a stalled connection
        )
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Report and move on so one bad page does not abort the whole scrape.
        print(f"Error fetching catalog page start={start_value}: {e}")
    else:
        soup = BeautifulSoup(response.content, 'html.parser')
        for table_div in soup.find_all('div', class_='custom__table-wrapper'):
            extract_assessments_from_table(table_div)

    time.sleep(1)  # pause between requests

# Print results
for item in scrap_data:
    print(item)


In [None]:
def get_assessment_duration(detail_url, retries=3, delay=2):
    """Fetch an assessment detail page and return its completion time.

    Args:
        detail_url: URL of the assessment detail page.
        retries: Maximum number of request attempts.
        delay: Base wait in seconds; the wait doubles after each failed attempt.

    Returns:
        The integer following "Approximate Completion Time in minutes =" on
        the page, or None when the page returns 404, the text is not found,
        or every attempt fails.
    """
    headers = {"User-Agent": "Mozilla/5.0"}

    for attempt in range(retries):
        try:
            response = requests.get(detail_url, headers=headers, timeout=10)

            if response.status_code == 404:
                # A missing page will not appear on retry; give up immediately.
                print(f"404 Not Found: {detail_url}")
                return None

            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                for row in soup.find_all('div', class_='product-catalogue-training-calendar__row'):
                    p_tag = row.find('p')
                    if p_tag:
                        match = re.search(r'Approximate Completion Time in minutes\s*=\s*(\d+)', p_tag.text)
                        if match:
                            return int(match.group(1))
                return None

            print(f"Status code {response.status_code} for {detail_url}")

        except requests.exceptions.RequestException as e:
            print(f"Error fetching {detail_url}: {e}")

        # Exponential backoff after any failed attempt (bad status or request
        # error); skipped after the last attempt since nothing follows it.
        if attempt < retries - 1:
            time.sleep(delay * (2 ** attempt))

    return None


In [None]:
import re
import urllib.parse


def format_job_solution_to_url(job_solution):
    """Turn an assessment name into a URL path segment ending in '/'.

    The name is lower-cased, a few punctuation marks are mapped to hyphens,
    removed, or spelled out, remaining characters other than a-z, 0-9,
    whitespace, parentheses and hyphens are dropped, whitespace becomes
    hyphens, and the result is percent-encoded.
    """
    text = job_solution.lower()

    # Applied in this exact order.
    for old, new in (('+', '-'), (".", "-"), ("’", ""), ("&", "and")):
        text = text.replace(old, new)

    # Strip disallowed characters, then hyphenate and collapse hyphen runs.
    for pattern, replacement in (
        (r'[^a-z0-9\s()\-]', ''),
        (r'\s+', '-'),
        (r'-+', '-'),
    ):
        text = re.sub(pattern, replacement, text)

    # Percent-encode last so parentheses become %28 / %29.
    encoded = urllib.parse.quote(text)
    return encoded.strip('-') + '/'



In [None]:
base_detail_url = "https://www.shl.com/solutions/products/product-catalog/view/"

for item in scrap_data:
    # Prefer the href scraped from the catalog row; rebuilding a slug from the
    # display name is lossy, so it is only the fallback when no link was captured.
    scraped_link = item.get("Link")
    if scraped_link:
        # urljoin resolves site-relative hrefs and leaves absolute URLs as-is.
        detail_url = urllib.parse.urljoin(base_detail_url, scraped_link)
    else:
        detail_url = base_detail_url + format_job_solution_to_url(item["Job Solution"])
    print(f"Fetching: {detail_url}")

    item['Duration'] = get_assessment_duration(detail_url)

    # Respectful, jittered delay between requests
    time.sleep(random.uniform(2, 4))


In [None]:
# One row per scraped record, one column per dict key.
df = pd.DataFrame(data=scrap_data)

In [None]:
# Number of rows whose 'Duration' value is missing (NaN/None).
df['Duration'].isna().sum()


In [None]:
# Manual override: hard-code a duration of 20 for this one assessment.
# NOTE(review): presumably the scrape found no duration for it — confirm the
# value against the assessment's detail page.
df.loc[df['Job Solution'] == 'Workplace Safety - Team 7.1 (International)', 'Duration'] = 20

In [None]:
# Save the DataFrame as a CSV file.
# NOTE(review): this runs before the cells that make 'Link' absolute and drop
# duplicate rows, so the file holds the pre-cleanup data. The recommender
# cells read "shl_data1.csv", which no visible cell writes — confirm how that
# file is produced.
df.to_csv('shl_data.csv', index=False)

In [None]:
# NOTE(review): the frame built from the scrape has a 'Duration' column, not
# 'Duration (min)', so this call appears to leave the data unchanged —
# confirm the intended column and fill value.
df.fillna({'Duration (min)': None}, inplace=True)

In [None]:
# Site root prepended to relative hrefs. This rebinds `base_url`, which the
# scraping cell uses for the catalog listing URL.
base_url = "https://www.shl.com"
df['Link'] = df['Link'].map(
    lambda href: f"{base_url}{href}" if href.startswith('/') else href
)

In [None]:
# Remove rows that are exact duplicates across every column.
df = df.drop_duplicates()

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr

# Load data
df = pd.read_csv("shl_data1.csv")

# Check required columns
required_columns = ['Job Solution', 'Link', 'Remote Testing', 'Adaptive/IRT', 'Duration', 'Test Types']
for col in required_columns:
    if col not in df.columns:
        raise ValueError(f"Missing required column: {col}")

# Load model and embed job solutions
model = SentenceTransformer('all-MiniLM-L6-v2')
# Encode every name in one batched call rather than one model call per row;
# list() splits the resulting 2-D array into one vector per row.
job_solution_embeddings = model.encode(df['Job Solution'].tolist())
df['Embedding'] = list(job_solution_embeddings)

def recommend(query, num_recommendations=10):
    """Return an HTML table of the assessments most similar to ``query``.

    Args:
        query: Free-text job description to match against the embedded
            'Job Solution' names in the module-level ``df``.
        num_recommendations: How many rows to return; coerced to ``int``.

    Returns:
        An HTML ``<table>`` string with one row per recommendation, ordered
        by descending cosine similarity. Cell values are HTML-escaped and
        missing values are shown as "N/A".
    """
    # Local import keeps this app cell runnable on its own.
    from html import escape

    query_emb = model.encode(query).reshape(1, -1)
    similarities = cosine_similarity(query_emb, df['Embedding'].tolist())[0]

    # A UI slider may hand over a float; DataFrame.head() requires an int.
    top_n = int(num_recommendations)
    top_results = (
        df.assign(Similarity=similarities)
        .sort_values('Similarity', ascending=False)
        .head(top_n)
    )

    cell_style = "border: 1px solid #ccc;"

    def text_cell(value):
        # Escape scraped text so it cannot inject markup into the page.
        text = "N/A" if pd.isna(value) else str(value)
        return f"<td style='{cell_style}'>{escape(text)}</td>"

    def link_cell(value):
        if pd.isna(value):
            return text_cell(value)
        href = escape(str(value), quote=True)
        return f'<td style=\'{cell_style}\'><a href="{href}" target="_blank">Click here</a></td>'

    column_titles = ['Job Solution', 'Link', 'Remote Testing', 'Adaptive/IRT', 'Duration', 'Test Types']

    # Collect fragments and join once instead of repeated string concatenation.
    parts = ["<table style='width:100%; border-collapse: collapse;'>"]
    parts.append(
        "<tr>"
        + "".join(f"<th style='{cell_style}'>{title}</th>" for title in column_titles)
        + "</tr>"
    )

    for _, row in top_results.iterrows():
        parts.append(
            "<tr>"
            + text_cell(row['Job Solution'])
            + link_cell(row['Link'])
            + text_cell(row['Remote Testing'])
            + text_cell(row['Adaptive/IRT'])
            + text_cell(row['Duration'])
            + text_cell(row['Test Types'])
            + "</tr>"
        )

    parts.append("</table>")
    return "".join(parts)

# Web UI: a query box and a result-count slider feed recommend(); the
# returned markup is rendered as HTML.
iface = gr.Interface(
    title="SHL Assessment Recommender",
    description="Enter a job description to get relevant SHL assessment suggestions. Links are fully clickable!",
    fn=recommend,
    inputs=[
        gr.Textbox(
            label="Query",
            placeholder="e.g. 'Software developer with problem-solving skills'",
        ),
        gr.Slider(
            minimum=1,
            maximum=20,
            value=10,
            step=1,
            label="Number of Recommendations",
        ),
    ],
    outputs=gr.HTML(label="Top SHL Assessment Recommendations"),
    examples=[
        ["Entry-level marketing role", 5],
        ["Senior software engineer position", 3],
        ["Customer service representative", 10],
    ],
)

iface.launch()


In [44]:
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load the dataset
df = pd.read_csv("shl_data1.csv")

# Check required columns
required_columns = ['Job Solution', 'Link', 'Remote Testing', 'Adaptive/IRT', 'Duration', 'Test Types']
for col in required_columns:
    if col not in df.columns:
        raise ValueError(f"Missing required column: {col}")

# Load pre-trained model and generate embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
# Encode every name in one batched call rather than one model call per row;
# .tolist() turns the 2-D result into one plain Python list per row.
df['Embedding'] = model.encode(df['Job Solution'].tolist()).tolist()

In [45]:
# Persist the frame, including the 'Embedding' column, to CSV.
# NOTE(review): each embedding is a Python list, so it is presumably written
# as its string form and must be parsed back when the file is loaded.
df.to_csv("preprocess_data.csv", index=False)