#### Library Imports
This notebook addresses a problem I discovered while attempting to interpolate the sentiment data: some dates appear more than once in the source datasets. -Bobby


In [21]:
import pandas as pd

In [22]:
def check_duplicate_dates(df, date_column):
    """
    Checks for duplicate dates in a given DataFrame.

    The date column is parsed with ``pd.to_datetime`` into a separate Series,
    so the DataFrame passed in is left unmodified.

    Parameters:
        df (pd.DataFrame): The DataFrame to check. It is not modified.
        date_column (str): The name of the column containing date values.

    Returns:
        pd.DataFrame or None: The rows whose date occurs more than once (every
        occurrence is included, with the date column converted to datetime),
        or None if no duplicates are found.
    """
    # Parse into a standalone Series instead of assigning back into `df`, so the
    # caller's frame keeps its original column values and dtype.
    parsed_dates = pd.to_datetime(df[date_column])

    # keep=False flags every member of a duplicated group, not only the repeats
    is_duplicate = parsed_dates.duplicated(keep=False)

    if not is_duplicate.any():
        return None

    # Copy the selected rows so that replacing the date column below cannot
    # write through to `df`.
    duplicates = df.loc[is_duplicate].copy()
    duplicates[date_column] = parsed_dates[is_duplicate]
    return duplicates
    

#### AAPL

In [36]:
# Load the three AAPL datasets (SEC filings, financials, cleaned news) in one pass.
a_sec, a_fin, a_news = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/AAPL_SEC.csv',
        '../../data/AAPL_FULL_FINANCIAL.csv',
        '../../data/cleaned_AAPL_NEWS.csv',
    )
)

In [37]:
# Look for repeated filing dates in the AAPL SEC data.
result = check_duplicate_dates(a_sec, 'filing_date')
if result is None:
    print(f"No duplicates found in DataFrame a_sec.")
else:
    print(f"Duplicates found in DataFrame a_sec:")
    print(result)

Duplicates found in DataFrame a_sec:
    Unnamed: 0 filing_date                                           sec_text
53          25  2024-05-03  Section: Item 7.01: \n\nItem 7.01   Regulation...
54           1  2024-05-03  Section: Item 1: \nLegal Proceedings\nDigital ...


In [25]:
# Look for repeated dates in the AAPL financial data.
result = check_duplicate_dates(a_fin, 'Date')
if result is None:
    print(f"No duplicates found in DataFrame a_fin.")
else:
    print(f"Duplicates found in DataFrame a_fin:")
    print(result)

No duplicates found in DataFrame a_fin.


  df[date_column] = pd.to_datetime(df[date_column])


In [26]:
# Look for repeated dates in the cleaned AAPL news data.
result = check_duplicate_dates(a_news, 'date')
if result is None:
    print(f"No duplicates found in DataFrame a_news.")
else:
    print(f"Duplicates found in DataFrame a_news:")
    print(result)

No duplicates found in DataFrame a_news.


#### JNJ

In [27]:
# Load the three JNJ datasets (SEC filings, financials, cleaned news) in one pass.
j_sec, j_fin, j_news = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/JNJ_SEC.csv',
        '../../data/JNJ_FULL_FINANCIAL.csv',
        '../../data/cleaned_JNJ_NEWS.csv',
    )
)

In [28]:
# Look for repeated filing dates in the JNJ SEC data.
result = check_duplicate_dates(j_sec, 'filing_date')
if result is None:
    print(f"No duplicates found in DataFrame j_sec.")
else:
    print(f"Duplicates found in DataFrame j_sec:")
    print(result)

Duplicates found in DataFrame j_sec:
    Unnamed: 0 filing_date                                           sec_text
6           14  2020-04-29  Section: Item 1A: \nRISK FACTORS\nThe Company ...
7           78  2020-04-29                                                NaN
37          55  2022-04-29  Section: Item 5.02: \nItem 5.02 Departure of D...
38           8  2022-04-29                                                NaN
67          31  2023-12-05  Section: Item 5.02: \nItem 5.02(d). Departure ...
68          32  2023-12-05  Section: Item 8.01: \nItem 8.01    Other Event...
73          27  2024-05-01  Section: Item 2.02: \nItem 2.02     Results of...
74           2  2024-05-01                                                NaN


In [29]:
# Look for repeated dates in the JNJ financial data.
result = check_duplicate_dates(j_fin, 'Date')
if result is None:
    print(f"No duplicates found in DataFrame j_fin.")
else:
    print(f"Duplicates found in DataFrame j_fin:")
    print(result)

No duplicates found in DataFrame j_fin.


  df[date_column] = pd.to_datetime(df[date_column])


In [30]:
# Look for repeated dates in the cleaned JNJ news data.
result = check_duplicate_dates(j_news, 'date')
if result is None:
    print(f"No duplicates found in DataFrame j_news.")
else:
    print(f"Duplicates found in DataFrame j_news:")
    print(result)

No duplicates found in DataFrame j_news.


#### Addressing SEC Text Duplicates

In [31]:
def condense_duplicates(df, date_column, text_column):
    """
    Condenses duplicate date entries by combining their text values into a single entry.

    All work is done on a copy of the two relevant columns, so the DataFrame
    passed in is left unmodified.

    Parameters:
        df (pd.DataFrame): The DataFrame to process. It is not modified.
        date_column (str): The name of the column containing date values.
        text_column (str): The name of the column containing text values.

    Returns:
        pd.DataFrame: A DataFrame with exactly two columns, `date_column`
        (converted to datetime) and `text_column`, holding one row per
        distinct date. Texts sharing a date are joined with a single space in
        their original row order; missing text contributes an empty string.
    """
    # Copy only the columns that are needed; this protects the caller's frame
    # from the conversions below.
    working = df[[date_column, text_column]].copy()

    # Ensure the date column is in datetime format
    working[date_column] = pd.to_datetime(working[date_column])

    # str.join needs strings, so replace missing text with "" and cast the rest
    working[text_column] = working[text_column].fillna("").astype(str)

    # Group by the date column and combine each group's text into one string
    condensed_df = working.groupby(date_column, as_index=False).agg({
        text_column: ' '.join
    })

    return condensed_df

In [39]:
# Merge same-day AAPL filings into one row each, then re-run the duplicate check
# on the merged frame.
a_sec_new = condense_duplicates(a_sec, 'filing_date', 'sec_text')
result = check_duplicate_dates(a_sec_new, 'filing_date')
if result is None:
    print(f"No duplicates found in DataFrame a_sec.")
else:
    print(f"Duplicates found in DataFrame a_sec:")
    print(result)

No duplicates found in DataFrame a_sec.


In [38]:
# Merge same-day JNJ filings into one row each, then re-run the duplicate check
# on the merged frame.
j_sec_new = condense_duplicates(j_sec, 'filing_date', 'sec_text')
result = check_duplicate_dates(j_sec_new, 'filing_date')
if result is None:
    print(f"No duplicates found in DataFrame j_sec.")
else:
    print(f"Duplicates found in DataFrame j_sec:")
    print(result)

No duplicates found in DataFrame j_sec.


In [40]:
# NOTE: these are the same paths the SEC data was read from earlier in this
# notebook, so the original input files are overwritten with the condensed
# versions.
# index=False keeps the auto-generated row index out of the file; without it
# the index is written as an unnamed first column, which comes back as
# "Unnamed: 0" the next time the CSV is read.
j_sec_new.to_csv('../../data/JNJ_SEC.csv', index=False)
a_sec_new.to_csv('../../data/AAPL_SEC.csv', index=False)