### First, import the necessary libraries to web-scrape the dataset

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import csv
import json
from time import sleep

In [2]:
# Step 1: Obtain a list of the top-20 "Text Classification" models
classification_url = "https://huggingface.co/models?pipeline_tag=text-classification&sort=downloads"
classification_models = []

# Without a timeout, requests may block indefinitely on a stalled connection;
# raise_for_status() turns an HTTP error response into an immediate failure
# instead of letting an error page be parsed as if it were the model listing.
response = requests.get(classification_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
model_list = soup.find_all("article", class_="overview-card-wrapper")
if not model_list:
    # No cards matched the CSS class: fail here with a clear message rather
    # than later, when an empty classification_models list is indexed.
    raise RuntimeError(f"No model cards found at {classification_url}")
for model in model_list[:20]:
    model_name = model.find("header", class_="flex items-center mb-0.5").text.strip()
    # Tabs, newlines and "• " are all treated as field separators: each is
    # normalised to "*" so that a single split yields every field.
    model_details = model.find("div", class_="mr-1 flex items-center overflow-hidden whitespace-nowrap text-sm leading-tight text-gray-400").text.replace("\t", "*").replace('\n', '*').replace('• ', "*").split("*")
    # Drop the empty strings produced by consecutive separators.
    model_details = list(filter(bool, model_details))
    if len(model_details) < 5:
        # The positional indexing below needs at least five fields.
        raise ValueError(f"Unexpected details layout for {model_name!r}: {model_details!r}")
    # Fields are read by position; element 1 is not used.
    m_type = model_details[0]
    updated = model_details[2]
    downloaded = model_details[3]
    liked = model_details[4]
    dic = {
        "model_name": model_name,
        "model_type": m_type,
        "updated": updated,
        "downloaded": downloaded,
        "liked": liked
    }
    classification_models.append(dic)

In [3]:
# Obtain a list of the top-20 text generation models
generation_url = "https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads"
generation_models = []

# Without a timeout, requests may block indefinitely on a stalled connection;
# raise_for_status() turns an HTTP error response into an immediate failure
# instead of letting an error page be parsed as if it were the model listing.
response = requests.get(generation_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
model_list = soup.find_all("article", class_="overview-card-wrapper")
if not model_list:
    # No cards matched the CSS class: fail here with a clear message rather
    # than later, when an empty generation_models list is indexed.
    raise RuntimeError(f"No model cards found at {generation_url}")
for model in model_list[:20]:
    model_name = model.find("header", class_="flex items-center mb-0.5").text.strip()
    # Tabs, newlines and "• " are all treated as field separators: each is
    # normalised to "*" so that a single split yields every field.
    model_details = model.find("div", class_="mr-1 flex items-center overflow-hidden whitespace-nowrap text-sm leading-tight text-gray-400").text.replace("\t", "*").replace('\n', '*').replace('• ', "*").split("*")
    # Drop the empty strings produced by consecutive separators.
    model_details = list(filter(bool, model_details))
    if len(model_details) < 5:
        # The positional indexing below needs at least five fields.
        raise ValueError(f"Unexpected details layout for {model_name!r}: {model_details!r}")
    # Fields are read by position; element 1 is not used.
    m_type = model_details[0]
    updated = model_details[2]
    downloaded = model_details[3]
    liked = model_details[4]
    dic = {
        "model_name": model_name,
        "model_type": m_type,
        "updated": updated,
        "downloaded": downloaded,
        "liked": liked
    }
    generation_models.append(dic)

In [4]:
# Show both scraped lists as compact JSON, classification first, each followed
# by a blank-looking separator line.
for scraped_models in (classification_models, generation_models):
    serialized = json.dumps(scraped_models)
    print(serialized, end="\n \n")

[{"model_name": "distilbert-base-uncased-finetuned-sst-2-english", "model_type": "Text Classification", "updated": "Oct 26", "downloaded": "37.4M", "liked": "351"}, {"model_name": "cardiffnlp/twitter-roberta-base-irony", "model_type": "Text Classification", "updated": "Aug 2", "downloaded": "6.78M", "liked": "13"}, {"model_name": "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis", "model_type": "Text Classification", "updated": "Mar 16", "downloaded": "6.52M", "liked": "122"}, {"model_name": "lxyuan/distilbert-base-multilingual-cased-sentiments-student", "model_type": "Text Classification", "updated": "Jun 24", "downloaded": "6.51M", "liked": "53"}, {"model_name": "SamLowe/roberta-base-go_emotions", "model_type": "Text Classification", "updated": "Oct 4", "downloaded": "6.51M", "liked": "187"}, {"model_name": "Ashishkr/query_wellformedness_score", "model_type": "Text Classification", "updated": "18 days ago", "downloaded": "6.43M", "liked": "14"}, {"model_name": "marie

In [5]:

# Persist the scraped classification models to a CSV file, one row per model.

filename = 'classification_models.csv'  # destination path of the CSV file
fieldnames = [*classification_models[0]]  # column names: the keys of the first record

with open(filename, mode='w', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()  # header row first
    for record in classification_models:
        writer.writerow(record)  # then one data row per scraped model

In [6]:
# Persist the scraped text-generation models to a CSV file, one row per model.
g_filename = 'generation_models.csv'  # destination path of the CSV file
g_fieldnames = [key for key in generation_models[0]]  # column names: the keys of the first record

with open(g_filename, mode='w', newline='') as out_handle:
    writer = csv.DictWriter(out_handle, fieldnames=g_fieldnames)
    writer.writeheader()  # header row first
    for record in generation_models:
        writer.writerow(record)  # then one data row per scraped model

In [7]:
# function for finding the targeted element
def find_elements(html, parent_selector):
    """Return the ``data-props`` attribute of the first element matching a CSS selector.

    Args:
        html: HTML document to parse (handed to BeautifulSoup's ``html.parser``).
        parent_selector: CSS selector identifying the element of interest.

    Returns:
        The matched element's ``data-props`` attribute value, or ``None`` when
        the element exists but has no such attribute. When nothing matches the
        selector, a JSON string encoding an empty ``linkedSpaces`` list.
    """
    matched = BeautifulSoup(html, 'html.parser').select_one(parent_selector)
    if matched is None:
        # Nothing matched: return an empty JSON payload so the caller can
        # decode the result the same way as a real attribute value.
        return json.dumps({"linkedSpaces": []})
    return matched.get('data-props')
  

In [8]:
# Step 2: Obtain and compare the number of ML apps ("spaces") for each "Text Classification" model

# CSS selector of the element whose data-props attribute holds the linked spaces
# (loop-invariant, so it is defined once up front).
p_selector = '[data-target="LinkedSpacesList"]'

classification_app_count = []
for model_info in classification_models:
    # Each scraped entry is a dict; its "model_name" forms the model page URL.
    model_url = f"https://huggingface.co/{model_info['model_name']}"

    # Without a timeout the request may hang indefinitely. Without the status
    # check, an error page has no linked-spaces element and would be recorded
    # as a model with zero apps, silently skewing the comparison.
    response = requests.get(model_url, timeout=30)
    response.raise_for_status()

    list_of_apps = find_elements(response.content, p_selector)

    # find_elements returns None when the element lacks a data-props attribute;
    # otherwise it returns a JSON string with a "linkedSpaces" list.
    data = [item['id'] for item in json.loads(list_of_apps)['linkedSpaces']] if list_of_apps is not None else []
    dic = {
        "model_name": model_info['model_name'],
        "model_url": model_url,
        "space_list": data,
        "total_app": len(data)
    }
    classification_app_count.append(dic)
print(json.dumps(classification_app_count, indent=2))

[
  {
    "model_name": "distilbert-base-uncased-finetuned-sst-2-english",
    "model_url": "https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english",
    "space_list": [
      "microsoft/HuggingGPT",
      "society-ethics/model-card-regulatory-check",
      "optimum/optimum-benchmark-ui",
      "nazneen/interactive-model-cards",
      "ccolas/TastyPiano",
      "ANDRYHA/FakeNewsClassifier",
      "nazneen/seal",
      "taesiri/HuggingGPT-Lite",
      "CK42/sentiment-model-comparison",
      "EnzoBustos/IC-2022-Classificacao-de-Dados-Financeiros",
      "nazneen/error-analysis",
      "fxmarty/bettertransformer-demo",
      "echarlaix/openvino-export",
      "autoevaluate/error-analysis",
      "tinkoff-ai/caif",
      "chinhon/Speech_Sentiment_Analysis",
      "butterswords/nlc-explorer",
      "kyleledbetter/responsibleGPT",
      "Olivernyu/sentiment_analysis_app",
      "Uvini/Hotel-Reviews",
      "Endre/SemanticSearch-HU",
      "abidlabs/call-sentiment-blocks-2",


In [9]:
# Sanity check: number of models for which app data was collected.
n_models_processed = len(classification_app_count)
print(n_models_processed)

20
