In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta

# Read the source CSV into a DataFrame
csv_file_path = "/Users/balmeru/Downloads/ggplot.csv"
df = pd.read_csv(csv_file_path)

# Coerce 'rdq' to datetime and 'fyr' to numeric; values that cannot be
# parsed become NaT / NaN instead of raising (errors='coerce')
df = df.assign(
    rdq=pd.to_datetime(df['rdq'], errors='coerce'),
    fyr=pd.to_numeric(df['fyr'], errors='coerce'),
)

# Order rows by ticker, then fiscal year, then fiscal quarter, and
# renumber the index 0..n-1
df = df.sort_values(by=['tic', 'fyearq', 'fqtr'], ignore_index=True)


In [2]:
# Flag rows whose 'fyr' differs from the previous row of the same ticker.
# This compares each row with the row directly above it, so it relies on df
# being ordered by ticker and fiscal period.
ticker_change = df['tic'] != df['tic'].shift(1)
previous_fyr = df['fyr'].shift(1)

# NaN != NaN evaluates to True, so two consecutive rows with a missing 'fyr'
# would otherwise be reported as a fiscal year change. Treat "both missing"
# as "no change".
both_fyr_missing = df['fyr'].isna() & previous_fyr.isna()
fyr_differs = df['fyr'].ne(previous_fyr) & ~both_fyr_missing

# The first row of every ticker always has ticker_change == True, so it is
# already forced to 0 here; no separate per-ticker reset is required.
df['fyr_change_dummy'] = (fyr_differs & ~ticker_change).astype(int)

In [3]:
# Keep only the first row for each (tic, fyearq, fqtr) combination and
# renumber the index.
# NOTE(review): 'fyr_change_dummy' is computed before this step, so a flag that
# sits on a dropped duplicate row is lost - consider de-duplicating first.
df = df.loc[~df.duplicated(subset=['tic', 'fyearq', 'fqtr'], keep='first')].reset_index(drop=True)


In [4]:
# Number of rows currently in df
df.shape[0]


1075450

In [5]:
def propagate_dummy_values(df):
    """Extend every fiscal-year-change flag to the next three fiscal quarters.

    For each row whose 'fyr_change_dummy' equals 1, the rows of the same
    ticker for the three following (fyearq, fqtr) periods are set to 1 as
    well. A quarter number above 4 rolls over into the next fiscal year.
    Only rows flagged on entry act as sources, so flags written by this
    function do not cascade further.

    The lookup is done with a single keyed merge instead of one full-frame
    boolean scan per ticker and per flagged row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'tic', 'fyearq', 'fqtr' and
        'fyr_change_dummy'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; 'fyr_change_dummy' is updated
        in place.
    """
    key_cols = ['tic', 'fyearq', 'fqtr']

    # Snapshot of the rows that are flagged before anything is modified.
    sources = df.loc[df['fyr_change_dummy'] == 1, key_cols]
    if sources.empty:
        return df

    # Build the (tic, fyearq, fqtr) keys of the 1st, 2nd and 3rd following quarter.
    target_frames = []
    for offset in range(1, 4):
        quarter = sources['fqtr'] + offset
        rolled_over = (quarter > 4).astype(int)
        target_frames.append(pd.DataFrame({
            'tic': sources['tic'],
            'fyearq': sources['fyearq'] + rolled_over,
            'fqtr': quarter - 4 * rolled_over,
        }))

    # Drop keys containing NaN (they can never equal a real row and merge would
    # otherwise pair NaN with NaN) and duplicates (so the left merge below keeps
    # exactly one output row per input row).
    targets = pd.concat(target_frames, ignore_index=True).dropna().drop_duplicates()
    targets['_hit'] = True

    # A left merge preserves the row order of df, so the result can be used as a
    # positional boolean mask.
    is_target = (
        df[key_cols]
        .merge(targets, on=key_cols, how='left')['_hit']
        .notna()
        .to_numpy()
    )
    df.loc[is_target, 'fyr_change_dummy'] = 1

    return df

# Propagate each change flag to the following three quarters of the same ticker,
# rebind df to the returned frame, and print its text representation
df = propagate_dummy_values(df)
print(df)

         GVKEY    datadate  fyearq  fqtr  fyr indfmt consol popsrc datafmt  \
0         2484  1983-07-31    1983     3   10   INDL      C      D     STD   
1         2484  1983-10-31    1983     4   10   INDL      C      D     STD   
2         2484  1984-01-31    1984     1   10   INDL      C      D     STD   
3         2484  1984-04-30    1984     2   10   INDL      C      D     STD   
4         2484  1984-07-31    1984     3   10   INDL      C      D     STD   
...        ...         ...     ...   ...  ...    ...    ...    ...     ...   
1075445  30165  1999-06-30    1999     2   12   INDL      C      D     STD   
1075446  30165  1999-09-30    1999     3   12   INDL      C      D     STD   
1075447  30165  1999-12-31    1999     4   12   INDL      C      D     STD   
1075448  30165  2000-03-31    2000     1   12   INDL      C      D     STD   
1075449  30165  2000-06-30    2000     2   12   INDL      C      D     STD   

           tic  ...      lseq       ltq pstkq teqq txdbq  dvpy 

In [6]:
# Write the frame with the propagated flags to CSV; index=False leaves the row index out of the file
df.to_csv('/Users/balmeru/Downloads/propagated_data.csv', index=False)

In [7]:
def _report_missing(df, columns, stage):
    """Print the count and percentage of missing values in `columns`."""
    print(f"Missing data summary {stage} filling:\n", df[columns].isnull().sum())
    print(f"Missing data percentage {stage} filling:\n", df[columns].isnull().mean() * 100)


def handle_missing_data(df, columns_to_check=('ajexq', 'prccq', 'cshoq'), group_col='tic'):
    """Fill gaps in selected columns by linear interpolation within each group.

    Prints a missing-value summary (count and percentage) before and after
    filling. Requested columns that are absent from `df` are reported with a
    warning and skipped.

    Interpolation uses the pandas defaults, so gaps are filled in the forward
    direction only: missing values before a group's first valid observation
    are left as they are.

    Rows whose `group_col` is missing are not interpolated and keep their
    existing values. (A plain ``groupby(...).transform`` discards NaN-keyed
    groups, which would blank those rows out.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill. The selected columns are modified in place.
    columns_to_check : iterable of str, optional
        Columns to report on and interpolate. Defaults to
        ('ajexq', 'prccq', 'cshoq').
    group_col : str, optional
        Column that defines the groups. Defaults to 'tic'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    requested = list(columns_to_check)

    # Report requested columns that do not exist, then continue with the rest.
    missing_columns = [col for col in requested if col not in df.columns]
    if missing_columns:
        print(f"Warning: The following columns are missing in the DataFrame: {missing_columns}")
    present = [col for col in requested if col in df.columns]

    _report_missing(df, present, "before")

    # Only rows with a known group key take part in the interpolation.
    has_group = df[group_col].notna()
    if has_group.any():
        # Positional key array: avoids index alignment on the grouper.
        group_keys = df.loc[has_group, group_col].to_numpy()
        for column in present:
            interpolated = (
                df.loc[has_group, column]
                .groupby(group_keys)
                .transform(lambda values: values.interpolate(method='linear'))
            )
            # transform keeps the input row order, so assign positionally.
            df.loc[has_group, column] = interpolated.to_numpy()

    _report_missing(df, present, "after")

    return df

# Interpolate the gaps per ticker (prints before/after missing-value summaries)
# and rebind df to the returned frame
df = handle_missing_data(df)

Missing data summary before filling:
 ajexq        3
prccq    26043
cshoq    45672
dtype: int64
Missing data percentage before filling:
 ajexq    0.000279
prccq    2.421591
cshoq    4.246780
dtype: float64
Missing data summary after filling:
 ajexq        0
prccq     1962
cshoq    11840
dtype: int64
Missing data percentage after filling:
 ajexq    0.000000
prccq    0.182435
cshoq    1.100934
dtype: float64


In [8]:
# Write the frame after missing-value handling to CSV; index=False leaves the row index out of the file
df.to_csv('/Users/balmeru/Downloads/handle_missing_values.csv', index=False)