<a href="https://colab.research.google.com/github/acrobyte007/Steps-AI-Task/blob/main/Steps_AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import pandas as pd
import concurrent.futures
import threading
import time

class WebCrawler:
    def __init__(self, base_url, max_depth, max_urls=1000, max_workers=10):
        self.base_url = base_url
        self.max_depth = max_depth
        self.max_urls = max_urls  # Adjusted to limit to around 1000 URLs
        self.visited = set()
        self.data = []
        self.lock = threading.Lock()
        self.url_count = 0
        self.max_workers = max_workers

    def crawl(self, url, depth, origin_url=None):
        with self.lock:
            if self.url_count >= self.max_urls or url in self.visited:
                return
            self.visited.add(url)
            self.url_count += 1

        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Failed to retrieve URL: {url} - {e}")
            return

        soup = BeautifulSoup(response.text, 'html.parser')
        self.extract_data(soup, url, origin_url)
        print(f"Finished crawling: {url}")

        if self.url_count < self.max_urls:
            links = self.extract_links(soup, url)
            with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                futures = [executor.submit(self.crawl, link, depth + 1, origin_url=url) for link in links]
                concurrent.futures.wait(futures)

    def extract_data(self, soup, url, origin_url=None):
        headers = soup.find_all(['h1', 'h2', 'h3'])
        for header in headers:
            topic_name = header.get_text(strip=True)
            link = header.find('a')['href'] if header.find('a') else url
            information_text = topic_name

            # Collect text from the following p tags until another header or end of section is found
            for sibling in header.find_next_siblings():
                if sibling.name == 'p':
                    information_text += ' ' + sibling.get_text(strip=True)
                elif sibling.name in ['h1', 'h2', 'h3']:
                    break

            with self.lock:
                self.data.append({
                    'Topic Name': topic_name,
                    'Links': link,
                    'Information and Text': information_text.strip(),
                    'Originating URL': origin_url if origin_url else url  # Add originating URL
                })

    def extract_links(self, soup, base_url):
        links = set()
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            if not urlparse(href).netloc:
                href = urljoin(base_url, href)
            if href.startswith(self.base_url):
                links.add(href)
        return links

    def start(self):
        self.crawl(self.base_url, 0, origin_url=self.base_url)  # Start with base_url as origin_url
        return self.data

if __name__ == "__main__":
    base_url = "https://docs.nvidia.com/cuda/"
    max_depth = 5

    start_time = time.time()
    crawler = WebCrawler(base_url, max_depth, max_urls=1000)  # Adjusted to crawl around 1000 URLs
    scraped_data = crawler.start()
    end_time = time.time()

    # Convert the scraped data to a DataFrame
    df = pd.DataFrame(scraped_data)

    # Save the DataFrame to a CSV file with URL references
    df.to_csv('nvidia_cuda_docs_info_1000_urls.csv', index=False)

    print(f"Scraped {len(scraped_data)} pages in {end_time - start_time} seconds.")
    print(f"Data has been successfully extracted and saved to 'nvidia_cuda_docs_info_1000_urls.csv'.")


Finished crawling: https://docs.nvidia.com/cuda/
Finished crawling: https://docs.nvidia.com/cuda/demo-suite/index.html
Finished crawling: https://docs.nvidia.com/cuda/debugger-api/index.html
Finished crawling: https://docs.nvidia.com/cuda/#installation-guides
Finished crawling: https://docs.nvidia.com/cuda/turing-tuning-guide/index.html
Finished crawling: https://docs.nvidia.com/cuda/ampere-tuning-guide/index.html
Finished crawling: https://docs.nvidia.com/cuda/cuda-runtime-api/index.html
Finished crawling: https://docs.nvidia.com/cuda/demo-suite/index.html#notice
Finished crawling: https://docs.nvidia.com/cuda/ptx-compiler-api/index.html
Finished crawling: https://docs.nvidia.com/cuda/demo-suite/index.html#oceanfftFinished crawling: https://docs.nvidia.com/cuda/demo-suite/index.html#introduction

Finished crawling: https://docs.nvidia.com/cuda/index.htmlFinished crawling: https://docs.nvidia.com/cuda/demo-suite/index.html#devicequeryFinished crawling: https://docs.nvidia.com/cuda/npp/

In [None]:
import pandas as pd


In [None]:
df=pd.read_csv('nvidia_cuda_docs_info_1000_urls.csv')

In [None]:
df

Unnamed: 0,Topic Name,Links,Information and Text
0,CUDA Toolkit Documentation 12.5 Update 1ï,#cuda-toolkit-documentation-v12-4,CUDA Toolkit Documentation 12.5 Update 1ï De...
1,Installation Guidesï,#installation-guides,Installation Guidesï
2,Programming Guidesï,#programming-guides,Programming Guidesï
3,CUDA API Referencesï,#cuda-api-references,CUDA API Referencesï
4,PTX Compiler API Referencesï,#ptx-compiler-api-references,PTX Compiler API Referencesï
...,...,...,...
31928,"21.2.3.Multi-Node, Single-GPUï",#multi-node-single-gpu,"21.2.3.Multi-Node, Single-GPUï Beyond memory..."
31929,22.Noticesï,#notices,22.Noticesï
31930,22.1.Noticeï,#notice,22.1.Noticeï This document is provided for i...
31931,22.2.OpenCLï,#opencl,22.2.OpenCLï OpenCL is a trademark of Apple ...


In [None]:
df=df.drop_duplicates()

In [None]:
df

Unnamed: 0,Topic Name,Links,Information and Text
0,CUDA Toolkit Documentation 12.5 Update 1ï,#cuda-toolkit-documentation-v12-4,CUDA Toolkit Documentation 12.5 Update 1ï De...
1,Installation Guidesï,#installation-guides,Installation Guidesï
2,Programming Guidesï,#programming-guides,Programming Guidesï
3,CUDA API Referencesï,#cuda-api-references,CUDA API Referencesï
4,PTX Compiler API Referencesï,#ptx-compiler-api-references,PTX Compiler API Referencesï
...,...,...,...
31452,"21.2.3.Multi-Node, Single-GPUï",#multi-node-single-gpu,"21.2.3.Multi-Node, Single-GPUï Beyond memory..."
31459,22.Noticesï,#notices,22.Noticesï
31466,22.1.Noticeï,#notice,22.1.Noticeï This document is provided for i...
31497,22.2.OpenCLï,#opencl,22.2.OpenCLï OpenCL is a trademark of Apple ...


In [None]:
df['Topic Name'].value_counts()

Topic Name
Search Results                                 398
Variables                                      252
Public Variables                               115
Enumerations                                    82
Functions                                       74
                                              ... 
10.15.Special Registers: %cluster_nctaidï      1
10.14.Special Registers: %cluster_ctaidï       1
10.13.Special Registers: %nclusteridï          1
10.12.Special Registers: %clusteridï           1
22.3.Trademarksï                               1
Name: count, Length: 1937, dtype: int64

In [None]:
import string

# Printable characters
printable = set(string.printable)


In [None]:
 ## Function to remove non-printable characters
def remove_non_printable(s):
    return ''.join(filter(lambda x: x in printable, s))

# Apply the function to the "Topic Name" column
df["Topic Name"] = df["Topic Name"].apply(remove_non_printable)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Topic Name"] = df["Topic Name"].apply(remove_non_printable)


In [None]:
df['Information and Text']=df['Information and Text'].apply(remove_non_printable)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Information and Text']=df['Information and Text'].apply(remove_non_printable)


In [None]:
df

Unnamed: 0,Topic Name,Links,Information and Text
0,CUDA Toolkit Documentation 12.5 Update 1,#cuda-toolkit-documentation-v12-4,CUDA Toolkit Documentation 12.5 Update 1 Devel...
1,Installation Guides,#installation-guides,Installation Guides
2,Programming Guides,#programming-guides,Programming Guides
3,CUDA API References,#cuda-api-references,CUDA API References
4,PTX Compiler API References,#ptx-compiler-api-references,PTX Compiler API References
...,...,...,...
31452,"21.2.3.Multi-Node, Single-GPU",#multi-node-single-gpu,"21.2.3.Multi-Node, Single-GPU Beyond memory al..."
31459,22.Notices,#notices,22.Notices
31466,22.1.Notice,#notice,22.1.Notice This document is provided for info...
31497,22.2.OpenCL,#opencl,22.2.OpenCL OpenCL is a trademark of Apple Inc...


In [None]:
df['Topic Name']

0        CUDA Toolkit Documentation 12.5 Update 1
1                             Installation Guides
2                              Programming Guides
3                             CUDA API References
4                     PTX Compiler API References
                           ...                   
31452               21.2.3.Multi-Node, Single-GPU
31459                                  22.Notices
31466                                 22.1.Notice
31497                                 22.2.OpenCL
31504                             22.3.Trademarks
Name: Topic Name, Length: 3374, dtype: object

In [None]:
 df.to_csv('clean_data.csv', index=False)

In [None]:
# Drop duplicates where both "Topic Name" and "Information and Text" are the same
df_cleaned = df.drop_duplicates(subset=["Topic Name", "Information and Text"])

# Display the cleaned DataFrame
df_cleaned


Unnamed: 0,Topic Name,Links,Information and Text
0,CUDA Toolkit Documentation 12.5 Update 1,#cuda-toolkit-documentation-v12-4,CUDA Toolkit Documentation 12.5 Update 1 Devel...
1,Installation Guides,#installation-guides,Installation Guides
2,Programming Guides,#programming-guides,Programming Guides
3,CUDA API References,#cuda-api-references,CUDA API References
4,PTX Compiler API References,#ptx-compiler-api-references,PTX Compiler API References
...,...,...,...
31452,"21.2.3.Multi-Node, Single-GPU",#multi-node-single-gpu,"21.2.3.Multi-Node, Single-GPU Beyond memory al..."
31459,22.Notices,#notices,22.Notices
31466,22.1.Notice,#notice,22.1.Notice This document is provided for info...
31497,22.2.OpenCL,#opencl,22.2.OpenCL OpenCL is a trademark of Apple Inc...


In [None]:
df_cleaned.to_csv('clean_data_2.csv', index=False)

In [1]:
import pandas as pd

In [2]:
new_data=pd.read_csv('clean_data_2.csv')

In [3]:
new_data.columns

Index(['Topic Name', 'Links', 'Information and Text'], dtype='object')

In [4]:
!pip install transformers
!pip install torch
!pip install milvus
!pip install pymilvus


Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [5]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/227.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m225.3/227.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-3.0.1


In [6]:
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.cluster import KMeans
import pandas as pd

# Load the data
new_data= pd.read_csv('clean_data_2.csv')

# Load pre-trained SentenceTransformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Generate embeddings for each text
embeddings = model.encode(new_data['Information and Text'].tolist())

# Cluster embeddings
num_clusters = 10
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(embeddings)
cluster_assignment = clustering_model.labels_

# Add cluster assignments to data
new_data['Cluster'] = cluster_assignment




  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]



In [9]:
new_data.columns

Index(['Topic Name', 'Links', 'Information and Text', 'Cluster'], dtype='object')

In [11]:
# Function to lowercase and remove hashtags
def clean_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove hashtags
    text = text.replace("#", "")
    return text

# Apply the function to the 'Topic Name' and 'Information and Text' columns
new_data['Topic Name'] = new_data['Topic Name'].apply(clean_text)
new_data['Information and Text'] = new_data['Information and Text'].apply(clean_text)


In [14]:
new_data = new_data.drop_duplicates(subset=["Topic Name", "Information and Text"])

In [15]:
new_data.to_csv('clustered_data_2.csv', index=False)