In [1]:
pip install datasets

Defaulting to user installation because normal site-packages is not writeable
Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[K     |████████████████████████████████| 547 kB 5.2 MB/s eta 0:00:01
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.16-py39-none-any.whl (133 kB)
[K     |████████████████████████████████| 133 kB 22.9 MB/s eta 0:00:01
Collecting requests>=2.32.2
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[K     |████████████████████████████████| 64 kB 9.1 MB/s  eta 0:00:01
[?25hCollecting aiohttp
  Downloading aiohttp-3.9.5-cp39-cp39-macosx_11_0_arm64.whl (390 kB)
[K     |████████████████████████████████| 390 kB 32.2 MB/s eta 0:00:01
[?25hCollecting tqdm>=4.66.3
  Downloading tqdm-4.66.4-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 20.4 MB/s eta 0:00:01
[?25hCollecting pyarrow-hotfix
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.9,>=0.3.0
  Downloading dill

In [4]:
import pandas as pd
import numpy as np
from datasets import load_dataset
import matplotlib.pyplot as plt

In [5]:
class TransformDataset:
    """Load a Hugging Face dataset and reshape it into pandas DataFrames.

    The object returned by ``load_dataset`` is stored on ``self.dataset``.
    The helper methods convert one split to a DataFrame, select a subset of
    columns, and attach to every row the previous value of a column within
    its group.
    """

    def __init__(self, dataset_name: str):
        """
        Initialize the TransformDataset class.

        Args:
            dataset_name (str): The name of the dataset to load from Hugging Face.

        Raises:
            ValueError: If the dataset cannot be loaded. The underlying
                exception is attached as ``__cause__``.
        """
        try:
            self.dataset = load_dataset(dataset_name)
        except Exception as e:
            # Chain explicitly so the original traceback stays attached.
            raise ValueError(f"Error loading dataset: {e}") from e

    def hug_to_pandas(self, split: str = "train") -> pd.DataFrame:
        """
        Convert one split of the Hugging Face dataset to a pandas DataFrame.

        Args:
            split (str): Key of the split inside ``self.dataset`` to convert.
                Defaults to ``"train"``.

        Returns:
            pd.DataFrame: A newly built DataFrame for the requested split.

        Raises:
            ValueError: If the split is missing or cannot be converted. The
                underlying exception is attached as ``__cause__``.
        """
        try:
            return pd.DataFrame(self.dataset[split])
        except Exception as e:
            raise ValueError(f"Error converting dataset to DataFrame: {e}") from e

    def filter_df(self, df: pd.DataFrame, cols: list) -> pd.DataFrame:
        """
        Filter the DataFrame to include only specified columns.

        Args:
            df (pd.DataFrame): The DataFrame to filter.
            cols (list): The list of columns to include.

        Returns:
            pd.DataFrame: An independent copy holding only ``cols``; assigning
            to it neither touches ``df`` nor triggers SettingWithCopyWarning.

        Raises:
            KeyError: If any name in ``cols`` is not a column of ``df``.
        """
        return df[cols].copy()

    def add_past_year_item(
        self,
        df: pd.DataFrame,
        main_col: str,
        group_cols: tuple = ("cik", "company"),
        date_col: str = "date",
    ) -> pd.DataFrame:
        """
        Add a ``past_year_data`` column holding the previous row's ``main_col``
        value within each group, ordered by date.

        "Previous" means the immediately preceding row of the same group after
        sorting by ``date_col``; no check is made that it lies exactly one year
        earlier.

        Args:
            df (pd.DataFrame): The DataFrame to process. It is not modified.
            main_col (str): The column name containing the data to shift.
            group_cols (tuple): Columns identifying one entity. Defaults to
                ``("cik", "company")``.
            date_col (str): Column used to order rows inside each group.
                Defaults to ``"date"``.

        Returns:
            pd.DataFrame: A sorted copy of ``df`` with the added
            ``past_year_data`` column. Rows lacking either the current or the
            previous ``main_col`` value are dropped; missing values in other
            columns do not cause a row to be dropped.
        """
        keys = list(group_cols)
        # sort_values returns a new frame, so the caller's DataFrame is untouched.
        ordered = df.sort_values(by=keys + [date_col])
        ordered["past_year_data"] = ordered.groupby(keys)[main_col].shift(1)
        # Restrict the NaN check to the two columns that matter; an unrestricted
        # dropna() would discard rows because of NaNs in unrelated columns.
        return ordered.dropna(subset=[main_col, "past_year_data"])






In [6]:
# Build the wrapper around the "jlohding/sp500-edgar-10k" dataset; the constructor calls
# load_dataset, which downloads the data files on first use.
# NOTE(review): the instance name is capitalized like a class. Later cells reference
# `Dataset`, so any rename has to be applied notebook-wide.
Dataset = TransformDataset("jlohding/sp500-edgar-10k")

Downloading readme: 100%|██████████| 1.26k/1.26k [00:00<00:00, 2.03MB/s]
Downloading data: 100%|██████████| 68.8M/68.8M [00:01<00:00, 37.5MB/s]
Downloading data: 100%|██████████| 69.8M/69.8M [00:01<00:00, 48.1MB/s]
Downloading data: 100%|██████████| 71.0M/71.0M [00:01<00:00, 40.9MB/s]
Downloading data: 100%|██████████| 71.6M/71.6M [00:01<00:00, 36.1MB/s]
Downloading data: 100%|██████████| 72.9M/72.9M [00:02<00:00, 33.0MB/s]
Downloading data: 100%|██████████| 72.2M/72.2M [00:01<00:00, 46.1MB/s]
Downloading data: 100%|██████████| 74.6M/74.6M [00:01<00:00, 41.2MB/s]
Downloading data: 100%|██████████| 74.8M/74.8M [00:01<00:00, 44.1MB/s]
Downloading data: 100%|██████████| 76.7M/76.7M [00:01<00:00, 53.9MB/s]
Downloading data: 100%|██████████| 76.9M/76.9M [00:01<00:00, 39.1MB/s]
Downloading data: 100%|██████████| 75.6M/75.6M [00:01<00:00, 51.8MB/s]
Downloading data: 100%|██████████| 80.2M/80.2M [00:02<00:00, 36.8MB/s]
Downloading data: 100%|██████████| 79.2M/79.2M [00:02<00:00, 34.3MB/s]
Gene

In [7]:
# Convert the "train" split to a DataFrame and show it as this cell's output.
# The result is not assigned to a name, so the frame is rebuilt on every later call.
Dataset.hug_to_pandas()

Unnamed: 0,cik,sic,company,date,item_1,item_1A,item_1B,item_2,item_3,item_4,...,5_day_return,10_day_return,20_day_return,40_day_return,60_day_return,80_day_return,100_day_return,150_day_return,252_day_return,__index_level_0__
0,1043277,4731,C H ROBINSON WORLDWIDE INC,2010-03-01,Item 1. BUSINESS\nOverview\nC.H. Robinson Worl...,ITEM 1A. RISK FACTORS\nYou should consider car...,ITEM 1B. UNRESOLVED STAFF COMMENTS\nNone.,ITEM 2. PROPERTIES\nOur corporate headquarters...,ITEM 3. LEGAL PROCEEDINGS\nGender Discriminati...,ITEM 4. RESERVED\nPART II,...,1.000262,1.010768,1.052226,1.130642,1.076802,1.082564,1.119114,1.316981,1.373999,2
1,11199,2670,BEMIS CO INC,2010-03-01,"ITEM 1 - BUSINESS\nBemis Company, Inc., a Miss...","ITEM 1A - RISK FACTORS\nThe following factors,...",ITEM 1B - UNRESOLVED STAFF COMMENTS\nNone.,ITEM 2 - PROPERTIES\nProperties utilized by th...,ITEM 3 - LEGAL PROCEEDINGS\nThe Company is inv...,"ITEM 4 - [RESERVED]\nPART II - ITEMS 5, 6, 7, ...",...,1.002702,1.002364,0.988180,1.034786,0.963769,0.976024,1.009387,1.096707,1.144034,20
2,822416,1531,PULTE HOMES INC/MI/,2010-02-19,,ITEM 1A. RISK FACTORS\nDiscussion of our busin...,ITEM 1B. UNRESOLVED STAFF COMMENTS\nThis Item ...,ITEM 2. PROPERTIES\nOur homebuilding and corpo...,ITEM 3. LEGAL PROCEEDINGS\nWe are involved in ...,ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SEC...,...,0.947507,0.982502,0.979877,0.972003,1.019248,0.868766,0.747157,0.715661,0.678915,38
3,1021860,3533,NATIONAL OILWELL VARCO INC,2010-02-26,ITEM 1. BUSINESS\nGeneral\nNational Oilwell Va...,ITEM 1A. RISK FACTORS\nYou should carefully co...,ITEM 1B. UNRESOLVED STAFF COMMENTS\nNone.,ITEM 2. PROPERTIES\nThe Company owned or lease...,ITEM 3. LEGAL PROCEEDINGS\nWe have various cla...,ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SEC...,...,1.004371,1.013125,0.920435,1.070998,0.831666,0.829176,0.846518,1.030842,1.853791,39
4,1111711,4931,NISOURCE INC/DE,2010-02-26,ITEM 1.\nBUSINESS\nNiSource Inc.\nNiSource is ...,ITEM 1A.\nRISK FACTORS\nNiSource Inc.\nThere a...,ITEM 1B. UNRESOLVED STAFF COMMENTS\nNone.,ITEM 2.\nPROPERTIES\nNiSource Inc.\nDiscussed ...,ITEM 3.\nLEGAL PROCEEDINGS\nNiSource Inc.\n1. ...,ITEM 4.\nSUBMISSION OF MATTERS TO A VOTE OF SE...,...,1.023968,1.031292,1.050599,1.105193,0.995317,1.018275,1.068919,1.191144,1.330613,42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6277,1868275,4911,Constellation Energy Corp,2022-02-25,"ITEM 1.\nGeneral\nOn February 21, 2021, the bo...",ITEM 1A. RISK FACTORS\nWe operate in a complex...,ITEM 1B. UNRESOLVED STAFF COMMENTS\nNone.,ITEM 2. PROPERTIES\nThe following table presen...,ITEM 3. LEGAL PROCEEDINGS\nWe are parties to v...,ITEM 4. MINE SAFETY DISCLOSURES\nNot Applicabl...,...,1.161467,1.123207,1.225233,1.418128,1.292399,1.351996,1.236685,1.902839,1.902839,6395
6278,78003,2834,PFIZER INC,2022-02-24,ITEM 1. BUSINESS\nABOUT PFIZER\nPfizer Inc. is...,,,ITEM 2. PROPERTIES\nWe own and lease space glo...,ITEM 3. LEGAL PROCEEDINGS\nCertain legal proce...,,...,1.040688,1.070496,1.144256,1.047215,1.150707,1.055089,1.114522,0.976100,0.976100,6401
6279,66740,3841,3M CO,2022-02-09,Item 1. Business.\n3M Company was incorporated...,Item 1A. Risk Factors\nProvided below is a cau...,Item 1B. Unresolved Staff Comments.\nNone.,"Item 2. Properties.\nIn the U.S., 3M’s general...",Item 3. Legal Proceedings.\nDiscussion of lega...,Item 4. Mine Safety Disclosures.\nPursuant to ...,...,0.951051,0.886950,0.888369,0.925526,0.922934,0.911592,0.809528,0.733305,0.733305,6445
6280,1524472,3561,Xylem Inc.,2022-02-25,ITEM 1. BUSINESS\nBusiness Overview\nXylem is ...,ITEM 1A. RISK FACTORS\nIn evaluating our busin...,ITEM 1B. UNRESOLVED STAFF COMMENTS.\nNone.,ITEM 2. PROPERTIES\nWe have approximately 345 ...,"ITEM 3. LEGAL PROCEEDINGS\nFrom time to time, ...",ITEM 4. MINE SAFETY DISCLOSURES\nNot applicabl...,...,0.937534,0.919609,0.953371,0.924337,0.893434,0.833604,0.915728,0.967472,0.967472,6456
