In [1]:
import pandas as pd
import numpy as np
csv_file_path = "/Users/balmeru/Downloads/with_diff.csv"
df = pd.read_csv(csv_file_path)
# Define columns to keep
columns_to_keep = [
    'datadate', 'fyearq', 'fqtr', 'fyr', 'tic', 'ajexq', 'datafqtr', 
    'rdq', 'atq', 'ceqq', 'cshoq', 'dvpq', 'ibq', 'lseq', 'ltq', 
    'pstkq', 'teqq', 'txdbq', 'prccq', 'fyr_change_dummy', 'diff', 
    'quarterly_report', 'annual_report', 'quarter_distance', 
    'annual_distance', 'market_cap', 'lower_bound', 'upper_bound'
]

# Keep only the selected columns
df = df[columns_to_keep]

# Data preparation
df['txdbq'] = df['txdbq'].fillna(0)
df['dvpq'] = df['dvpq'].fillna(0)
df['rdq'] = pd.to_datetime(df['rdq'], errors='coerce')

# Create dummy variables
df['income_good'] = ~df['ibq'].isna()
df['se1_good'] = ~df['teqq'].isna()
df['se2_good'] = ~df['ceqq'].isna() & ~df['pstkq'].isna()
df['se3_good'] = ~df['atq'].isna() & ~df['ltq'].isna()  # Fixed logical error

# Filter DataFrame based on conditions
filtered_df = df[
    df['income_good'] & (df['se1_good'] | df['se2_good'] | df['se3_good'])
]

# Initialize median_distance column
filtered_df['median_distance'] = np.nan

# Function to find median distance for a given row
def find_median_distance(row, report_type):
    distance_col = 'quarter_distance' if report_type == 'quarterly' else 'annual_distance'
    mask = (
        (filtered_df['tic'] != row['tic']) &
        (filtered_df['market_cap'].between(row['lower_bound'], row['upper_bound'])) &
        (filtered_df['fyearq'].between(row['fyearq'] - 1, row['fyearq'] + 1)) &
        (filtered_df['fyr_change_dummy'] == 0) &
        (filtered_df[f'{report_type}_report'] == 1)
    )
    
    distances = filtered_df.loc[mask, distance_col]
    return distances.median() if not distances.empty else np.nan

# Vectorized operation to calculate median_distance
for report_type in ['quarterly', 'annual']:
    mask = (filtered_df['fyr_change_dummy'] == 1) & (filtered_df[f'{report_type}_report'] == 1)
    filtered_df.loc[mask, 'median_distance'] = filtered_df[mask].apply(
        lambda row: find_median_distance(row, report_type), axis=1
    )

# Save the filtered DataFrame to CSV
filtered_df.to_csv('/Users/balmeru/Desktop/mediana.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['median_distance'] = np.nan


In [2]:



# Count the number of fiscal year changes
fyr_change_count = filtered_df['fyr_change_dummy'].sum()

# Count the number of valid median distances (non-NaN)
median_distance_count = filtered_df['median_distance'].count()

# Display the results
print(f"Number of fiscal year changes: {fyr_change_count}")
print(f"Number of valid median distances: {median_distance_count}")

Number of fiscal year changes: 6740
Number of valid median distances: 6723


In [3]:
filtered_df['quarterly_median_distance'] = np.where(
    (filtered_df['fyr_change_dummy'] == 1) & (filtered_df['quarterly_report'] == 1),
    filtered_df['median_distance'],
    np.nan
)

filtered_df['annual_median_distance'] = np.where(
    (filtered_df['fyr_change_dummy'] == 1) & (filtered_df['annual_report'] == 1),
    filtered_df['median_distance'],
    np.nan
)

# Generate a summary
quarterly_summary = filtered_df['quarterly_median_distance'].describe()
annual_summary = filtered_df['annual_median_distance'].describe()

# Count the number of non-NaN values in each new column
quarterly_count = filtered_df['quarterly_median_distance'].count()
annual_count = filtered_df['annual_median_distance'].count()

# Display the results
print("Quarterly Median Distance Summary:")
print(quarterly_summary)
print(f"Number of quarterly median distances: {quarterly_count}")

print("\nAnnual Median Distance Summary:")
print(annual_summary)
print(f"Number of annual median distances: {annual_count}")


Quarterly Median Distance Summary:
count    4995.000000
mean       89.868368
std         1.838368
min        35.000000
25%        89.000000
50%        90.000000
75%        91.000000
max        96.000000
Name: quarterly_median_distance, dtype: float64
Number of quarterly median distances: 4995

Annual Median Distance Summary:
count    1728.000000
mean      112.804398
std         8.665013
min        98.000000
25%       105.000000
50%       112.000000
75%       119.000000
max       156.000000
Name: annual_median_distance, dtype: float64
Number of annual median distances: 1728


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['quarterly_median_distance'] = np.where(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['annual_median_distance'] = np.where(


In [7]:
# Filter the DataFrame where fyr_change is equal to 1
see = filtered_df[filtered_df['fyr_change_dummy'] == 1]

# Print the first few rows of the filtered DataFrame
print(see.head())


      datadate  fyearq  fqtr  fyr    tic     ajexq datafqtr        rdq  \
22  1988-09-30    1989     1    6  0015B  4.049999   1989Q1 1989-03-10   
23  1988-12-31    1989     2    6  0015B  4.049999   1989Q2 1989-06-07   
24  1989-03-31    1989     3    6  0015B  4.049999   1989Q3        NaT   
25  1989-06-30    1989     4    6  0015B  4.049999   1989Q4        NaT   
58  1998-08-31    1998     1    5  0015B  1.000000   1998Q1 1998-09-23   

        atq     ceqq  ...  market_cap  lower_bound  upper_bound  income_good  \
22  344.741  167.541  ...   59.957619    47.966095    71.949143         True   
23  371.280  195.335  ...   49.831616    39.865293    59.797939         True   
24  338.460  196.967  ...   59.710664    47.768531    71.652797         True   
25  329.104  193.809  ...   67.422199    53.937759    80.906638         True   
58  917.784  504.163  ...  991.896313   793.517050  1190.275575         True   

    se1_good  se2_good  se3_good  median_distance  quarterly_median_distan

In [11]:
import pandas as pd
import numpy as np

# Load the filtered DataFrame from the CSV file
filtered_df = pd.read_csv('/Users/balmeru/Desktop/mediana.csv', parse_dates=['rdq'])

# Define a function to calculate the previous quarter given the current quarter and fiscal year
def calculate_previous_quarter(fqtr, fyearq):
    if fqtr > 1:
        return fqtr - 1, fyearq
    else:
        return 4, fyearq - 1

# Calculate expected_date for fyr_change_dummy == 1
mask_fyr_change = (filtered_df['fyr_change_dummy'] == 1) & (filtered_df['median_distance'].notna())
for idx, row in filtered_df.loc[mask_fyr_change].iterrows():
    prev_fqtr, prev_fyearq = calculate_previous_quarter(row['fqtr'], row['fyearq'])
    prev_row = filtered_df[(filtered_df['tic'] == row['tic']) & (filtered_df['fyearq'] == prev_fyearq) & (filtered_df['fqtr'] == prev_fqtr)]
    if not prev_row.empty:
        prev_rdq = prev_row.iloc[0]['rdq']
        if pd.notna(prev_rdq):
            expected_date = prev_rdq + pd.Timedelta(days=row['median_distance'])
            filtered_df.at[idx, 'expected_date'] = expected_date

# Calculate expected_date for fyr_change_dummy == 0
mask_no_fyr_change = (filtered_df['fyr_change_dummy'] == 0)
for idx, row in filtered_df.loc[mask_no_fyr_change].iterrows():
    prev_row = filtered_df[(filtered_df['tic'] == row['tic']) & (filtered_df['fyearq'] == row['fyearq'] - 1) & (filtered_df['fqtr'] == row['fqtr'])]
    if not prev_row.empty:
        prev_rdq = prev_row.iloc[0]['rdq']
        if pd.notna(prev_rdq):
            expected_date = prev_rdq + pd.DateOffset(weeks=52)
            filtered_df.at[idx, 'expected_date'] = expected_date

# Save the updated DataFrame to CSV
filtered_df.to_csv('/Users/balmeru/Desktop/mediana_with_expected_dates.csv', index=False)


In [12]:
just_df = pd.read_csv('/Users/balmeru/Desktop/mediana_with_expected_dates.csv', parse_dates=['rdq'])
non_na_count = just_df['expected_date'].notna().sum()

print(f"Number of non-NA values in 'car': {non_na_count}")


Number of non-NA values in 'car': 785538


In [13]:
len(just_df)

963444

In [14]:
785538/963444

0.8153437044602488