In [18]:
TOKEN = "Huggingface Token"

In [12]:
!pip install datasets



In [13]:
!pip install kagglehub huggingface_hub tqdm



In [19]:
from huggingface_hub import login, whoami
login(TOKEN, True)
whoami()



{'type': 'user',
 'id': '675047b890ba48ec35e04e36',
 'name': 'Alaamer',
 'fullname': 'The First',
 'email': 'ahmedmuhmmed239@gmail.com',
 'emailVerified': True,
 'canPay': False,
 'periodEnd': None,
 'isPro': False,
 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/no-auth/gelK-ZhS7T9nWeSNQvgI5.png',
 'orgs': [],
 'auth': {'type': 'access_token',
  'accessToken': {'displayName': 'Mediumdataset',
   'role': 'write',
   'createdAt': '2025-03-02T13:15:32.245Z'}}}

In [22]:
KAGGLE_DATASETS = {
            "aiswaryaramachandran/medium-articles-with-content": "/content/1/medium-articles-with-content/2/Medium_AggregatedData.csv",
            "hsankesara/medium-articles": "/content/2/medium-articles/1/articles.csv",
            "meruvulikith/1300-towards-datascience-medium-articles-dataset": "/content/3/1300-towards-datascience-medium-articles-dataset/1/medium.csv"
}

HUGGINGFACE_DATASET = [
    "fabiochiu/medium-articles",
    # Requires Hugging face auth
    "Falah/medium_articles_posts"]

MY_DATASET_NAME = "Alaamer/medium-articles-posts-with-content"

# Save locally as Parquet
PARQUET_PATH = "large_dataset.parquet"

In [7]:
import kagglehub
import shutil
import os

def move_ds(dataset_path, des):
    """Moves the downloaded dataset to the specified destination folder.

    Args:
        dataset_path (str): The path to the downloaded dataset.
        des (str): The destination folder.
    """
    # Create the destination directory if it doesn't exist
    os.makedirs(des, exist_ok=True)
    # Move the dataset to the destination
    shutil.move(dataset_path, des)

def _delete_on_exist(dataset_path):
    """Deletes the downloaded dataset if it already exists in the destination folder.

    Args:
        dataset_path (str): The path to the downloaded dataset.
    """
    # Check if the dataset already exists in the destination folder
    if os.path.exists(dataset_path):
        # Delete the dataset
        # Use shutil.rmtree to delete a directory and its contents
        if os.path.isdir(dataset_path):
            shutil.rmtree(dataset_path)
        else:  # If it's a file, use os.remove
            os.remove(dataset_path)

def download_kaggle_ds(dataset, des, delete_on_exist=False):
    """Downloads a Kaggle dataset and moves it to the specified destination.

    Args:
        dataset (str): The name of the Kaggle dataset.
        des (str): The destination folder.
    """
    # Download the latest version of the dataset
    # Pass the specific file path to _delete_on_exist,
    # not the parent directory
    if delete_on_exist:
        _delete_on_exist(os.path.join(des, dataset.split("/")[-1]))
    dataset_path = kagglehub.dataset_download(dataset)
    print("Path to dataset files:", dataset_path)
    # Extract the dataset name from the path
    dataset_name = dataset.split("/")[-1]
    # Move the downloaded dataset to the destination folder
    move_ds(dataset_path, os.path.join(des, dataset_name))

def get_file_size_mb(file_path):
  """Gets the size of a file in megabytes (MB).

  Args:
    file_path: The path to the file.

  Returns:
    The size of the file in megabytes, or -1 if the file does not exist.
  """
  try:
    size_bytes = os.path.getsize(file_path)
    size_mb = size_bytes / (1024 * 1024)  # Convert bytes to MB
    return size_mb
  except FileNotFoundError:
    return -1

file_path = '/content/large_dataset.parquet'
file_size_mb = get_file_size_mb(file_path)

if file_size_mb != -1:
  print(f"The size of {file_path} is {file_size_mb:.2f} MB.")  # Format to 2 decimal places
else:
  print(f"File not found: {file_path}")

In [8]:
from datasets import load_dataset
import pandas as pd

def normalized_df(df):
   # Print the shape before deleting rows
    print("Shape before:", df.shape)

    # Handle both 'text' and 'Text' column names
    text_col = None
    if 'text' in df.columns:
        text_col = 'text'
    elif 'Text' in df.columns:
        text_col = 'Text'

    if text_col:
        # Drop rows with null text values
        df = df[df[text_col].notna()]

        # Drop duplicate rows based on the text column, keeping the first occurrence
        df.drop_duplicates(subset=[text_col], keep='first', inplace=True)

    # Print the shape after deleting rows
    print("Shape after:", df.shape)
    return df

def read_kaggle_and_normalize_df(file_path):
    # Read the CSV file into a pandas DataFrame
    df = pd.read_csv(file_path, on_bad_lines='skip')
    return normalized_df(df)

def load_huggigface_and_normalize_ds(d):
  # Load the dataset
  dataset = load_dataset(d)

  # Convert the dataset to a pandas DataFrame
  df = dataset["train"].to_pandas()
  return normalized_df(df)

In [9]:
# Set the maximum column width to display the full text
pd.set_option('display.max_colwidth', None)

In [10]:
from pandas import DataFrame
from tqdm.auto import tqdm

def rich_kaggle_df():
  combined_kaggle_df = DataFrame()

  for i, (d, out_path) in enumerate(tqdm(KAGGLE_DATASETS.items(), desc="Processing Kaggle Datasets")):
    print(f"Downloading {d}")
    download_kaggle_ds(d,f"/content/{i + 1}", True)
    df = read_kaggle_and_normalize_df(out_path)
    combined_kaggle_df = pd.concat([combined_kaggle_df, df], ignore_index=True) if combined_kaggle_df is not None else df
  print("Combined Huggingface dataset Shape :", combined_kaggle_df.shape)
  return combined_kaggle_df


def rich_huggingface_df():
  combined_huggingface_df = DataFrame()

  for d in tqdm(HUGGINGFACE_DATASET, desc="Processing Huggingface data"):
    print(f"Downloading {d}")
    df = load_huggigface_and_normalize_ds(d)
    combined_huggingface_df = pd.concat([combined_huggingface_df, df], ignore_index=True) if combined_huggingface_df is not None else df
  print("Combined Kaggle dataset Shape :", combined_huggingface_df.shape)
  return combined_huggingface_df

def get_full_dataset():
  # Concatenate the DataFrames vertically
  df1 = rich_kaggle_df()
  df2 = rich_huggingface_df()
  combined_df = pd.concat([df1, df2], ignore_index=True)

  return combined_df

In [13]:
combined_df = get_full_dataset()

Processing Kaggle Datasets:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading aiswaryaramachandran/medium-articles-with-content
Download already complete (229107549 bytes).
Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/aiswaryaramachandran/medium-articles-with-content/versions/2
Shape before: (279577, 50)
Shape after: (72024, 50)
Downloading hsankesara/medium-articles
Downloading from https://www.kaggle.com/api/v1/datasets/download/hsankesara/medium-articles?dataset_version_number=1...



100%|██████████| 1.34M/1.34M [00:00<00:00, 106MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/hsankesara/medium-articles/versions/1
Shape before: (337, 6)
Shape after: (230, 6)
Downloading meruvulikith/1300-towards-datascience-medium-articles-dataset





Downloading from https://www.kaggle.com/api/v1/datasets/download/meruvulikith/1300-towards-datascience-medium-articles-dataset?dataset_version_number=1...



100%|██████████| 2.69M/2.69M [00:00<00:00, 149MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/meruvulikith/1300-towards-datascience-medium-articles-dataset/versions/1





Shape before: (1391, 2)
Shape after: (1391, 2)
Combined Huggingface dataset Shape : (73645, 55)


Processing Huggingface data:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading fabiochiu/medium-articles


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Shape before: (384736, 6)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(subset=[text_col], keep='first', inplace=True)


Shape after: (185474, 6)
Downloading Falah/medium_articles_posts
Shape before: (192368, 6)
Shape after: (185474, 6)
Combined Kaggle dataset Shape : (370948, 6)


In [14]:
print("Combined datasets Shape :", combined_df.shape)

Combined datasets Shape : (444593, 58)


In [None]:
combined_df.to_parquet(PARQUET_PATH)

Generating train split: 0 examples [00:00, ? examples/s]

# Note

**Don't** forget to create new dataset at huggingface hub otherwise this cell will always fail, also you will need a ***`WRITE`*** TOKEN at from your settings

In [23]:
# Upload using the `datasets` library
from datasets import load_dataset

dataset = load_dataset("parquet", data_files=PARQUET_PATH)
dataset.push_to_hub(MY_DATASET_NAME)

Uploading the dataset shards:   0%|          | 0/6 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/75 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/75 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/75 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/75 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/75 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/75 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Alaamer/medium-articles-posts-with-content/commit/5f216219b3416e64564622f3ccda8aa0a967d4ae', commit_message='Upload dataset', commit_description='', oid='5f216219b3416e64564622f3ccda8aa0a967d4ae', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Alaamer/medium-articles-posts-with-content', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Alaamer/medium-articles-posts-with-content'), pr_revision=None, pr_num=None)

In [None]:
from datasets import load_dataset

def load_my_ds():
  # Load the dataset
  dataset = load_dataset(MY_DATASET_NAME)

  # Convert to a Pandas DataFrame (if needed)
  df = dataset['train'].to_pandas()
  return df
