In [71]:
import pandas as pd
import numpy as np
#import openpyx1

In [72]:
# Load the excel file
file_path = '/Users/conniechen/Desktop/cfar_data19702003.csv'
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,PERMNO,date,PERMCO,FACPR,FACSHR,PRC,RET,SHROUT,ewretd
0,10000,1985-12-31,7952,,,,,,0.028021
1,10000,1986-01-31,7952,,,-4.375,C,3680.0,0.044071
2,10000,1986-02-28,7952,,,-3.25,-0.257143,3680.0,0.060381
3,10000,1986-03-31,7952,,,-4.4375,0.365385,3680.0,0.047192
4,10000,1986-04-30,7952,,,-4.0,-0.098592,3793.0,0.01614


In [73]:
# Convert 'date' to datetime for easier manipulation
df['date'] = pd.to_datetime(df['date'])

In [74]:
# Sort values by 'PERMNO' and 'date' to ensure correct chronological order
df.sort_values(by=['PERMNO', 'date'], inplace=True)

In [75]:
# 转换RET列到数值，无法转换的设置为NaN
df['RET'] = pd.to_numeric(df['RET'], errors='coerce')

# 计算市场价值（ME）
df['ME'] = df['PRC'] * df['SHROUT']

In [76]:
# 定义一个函数来计算累计回报率，作为动量
def calculate_momentum(returns_series, lookback_periods=6):
    # 如果过去6个月没有足够的数据则返回NaN
    if len(returns_series) < lookback_periods + 2:
        return np.nan
    
    # 获取过去6个月的数据（不包括最近的2个月）
    past_returns = returns_series[-(lookback_periods + 2):-2]
    
    # 计算累计回报率
    momentum = (1 + past_returns).prod() - 1
    return momentum

In [77]:
# Function to calculate momentum

def calculate_momentum(row):
    # Assuming 'PRC' is the stock price column
    # Calculate the momentum as described before
    momentum = (row['PRC'] - row['PRC'].shift(7)) / row['PRC'].shift(7)
    return momentum
    
# Add the 'Momentum' column to the DataFrame
df['Momentum'] = df.groupby('PERMNO').apply(calculate_momentum).reset_index(drop=True)


In [78]:
# 对每个PERMNO分组，然后应用动量计算函数
grouped_data = df.groupby('PERMNO')['RET']
df['MOM'] = grouped_data.transform(lambda x: calculate_momentum(x))

# 删除含有NaN的行，以确保因变量和自变量行对齐
df.dropna(subset=['RET', 'ME', 'MOM'], inplace=True)


KeyError: 'PRC'

In [None]:
# Replace infinite values with NaN
df.replace([np.inf, -np.inf], np.nan, inplace=True)


In [None]:
# Display the DataFrame with the 'Momentum' column
print(df.head())

In [None]:
df

In [None]:

# Function to calculate 'Size'

def calculate_size(row):
    # Assuming 'PRC' is the stock price column and 'SHROUT' is the number of shares outstanding
    # Check if both 'PRC' and 'SHROUT' are positive and not NaN
    if pd.notna(row['PRC']) and pd.notna(row['SHROUT']) and row['PRC'] > 0 and row['SHROUT'] > 0:
        size = np.log(row['PRC'] * row['SHROUT'])
    else:
        size = np.nan  # Set to NaN if 'PRC' or 'SHROUT' is missing, zero, or negative
    return size

# Add the 'Size' column to the DataFrame
df['Size'] = df.apply(calculate_size, axis=1)  # Calculate 'Size' for each row

# Display the DataFrame with the 'Size' column
print(df.head())


In [None]:
df

In [None]:
# Function to calculate 'SHRCHG'
#def calculate_shrchg(row):
#    # Assuming 'SHROUT' is the number of shares outstanding
#    # Calculate 'SHRCHG' as the difference between shares outstanding in the current row and the previous row
#    shrchg = row['SHROUT'] - row['SHROUT'].shift(1)
#    return shrchg

# Add the 'SHRCHG' column to the DataFrame
#df['SHRCHG'] = df.groupby('PERMNO').apply(calculate_shrchg).reset_index(drop=True)

# Display the DataFrame with the 'SHRCHG' column
#print(df.head())

In [None]:
# Function to calculate 'AdjustedShares' based on 'FactorToAdjustShares'
def calculate_adjusted_shares(row):
    # Assuming 'FactorToAdjustShares' is the factor to adjust shares outstanding
    # Calculate 'AdjustedShares' as the product of 'FactorToAdjustShares' and 'SHROUT'
    adjusted_shares = row['FACSHR'] * row['SHROUT']
    return adjusted_shares

# Add the 'AdjustedShares' column to the DataFrame
df['AdjustedShares'] = df.apply(calculate_adjusted_shares, axis=1)

# Display the DataFrame with the 'AdjustedShares' column
print(df.head())

In [None]:
def calculate_shrchg(row):
    # Assuming 'AdjustedShares' is the number of adjusted shares outstanding
    # Calculate 'SHRCHG' as the difference between the natural logarithm of the adjusted shares at time t
    # and the natural logarithm of the adjusted shares at time t-11
    shrchg = np.log(row['AdjustedShares']) - np.log(row['AdjustedShares'].shift(11))
    return shrchg

# Add the 'SHRCHG' column to the DataFrame
df['SHRCHG'] = df.groupby('PERMNO').apply(calculate_shrchg).reset_index(drop=True)

# Display the DataFrame with the 'SHRCHG' column
print(df.head())

In [None]:
# To recreate Panel A: Simple Statistics, we will calculate the Mean, 25th Percentile, Median, 75th Percentile, and Standard Deviation
# for the 'Size', 'Momentum', and 'SHRCHG' columns.
# We will also handle missing values as they cannot be used in these calculations.

# Dropping the rows where 'Size', 'Momentum', or 'SHRCHG' is NaN since we cannot calculate statistics on NaN values
#df_stats = df.dropna()
#print(df_stats)


In [None]:
# Calculate the required statistics for Panel A


# Dropping the rows where 'Size', 'Momentum', or 'SHRCHG' is NaN since we cannot calculate statistics on NaN values
df_stats = df.dropna(subset=['ME', 'MOM', 'SHRCHG', 'RET'])

# Calculate the required statistics for Panel A
panel_a_stats = {
    'Variable': ['ME', 'MOM', 'SHRCHG', 'RET'],
    'Mean': [df_stats['ME'].mean(), df_stats['MOM'].mean(), df_stats['SHRCHG'].mean(), df_stats['RET'].mean()],
    '25th Percentile': [df_stats['ME'].quantile(0.25), df_stats['MOM'].quantile(0.25), df_stats['SHRCHG'].quantile(0.25), df_stats['RET'].quantile(0.25)],
    'Median': [df_stats['ME'].median(), df_stats['MOM'].median(), df_stats['SHRCHG'].median(), df_stats['RET'].median()],
    '75th Percentile': [df_stats['ME'].quantile(0.75), df_stats['MOM'].quantile(0.75), df_stats['SHRCHG'].quantile(0.75), df_stats['RET'].quantile(0.75)],
    'Standard Deviation': [df_stats['ME'].std(), df_stats['MOM'].std(), df_stats['SHRCHG'].std(), df_stats['RET'].std()]
}


# Convert this dictionary into a DataFrame
panel_a_df = pd.DataFrame(panel_a_stats)

# Display Panel A: Simple Statistics
panel_a_df



In [None]:
# Check the count of non-NaN values in the 'Size', 'Momentum', and 'SHRCHG' columns
non_nan_counts = df[['MOM', 'SHRCHG']].count()

# Additionally, let's check for any non-NaN values to ensure there's data to calculate statistics on
non_nan_counts, df[['MOM', 'SHRCHG']].describe()


In [None]:
# Since 'SHRCHG' has no non-NaN values, we'll exclude it from Panel A statistics
# Let's calculate the Panel A statistics again, this time only for 'Size' and 'Momentum'

# Calculate the required statistics for Panel A without 'SHRCHG'
panel_a_stats = {
    'Variable': ['Size', 'Momentum'],
    'Mean': [df['Size'].mean(), df['Momentum'].mean()],
    '25th Percentile': [df['Size'].quantile(0.25), df['Momentum'].quantile(0.25)],
    'Median': [df['Size'].median(), df['Momentum'].median()],
    '75th Percentile': [df['Size'].quantile(0.75), df['Momentum'].quantile(0.75)],
    'Standard Deviation': [df['Size'].std(), df['Momentum'].std()]
}

# Convert this dictionary into a DataFrame
panel_a_df = pd.DataFrame(panel_a_stats)

# Display Panel A: Simple Statistics without 'SHRCHG'
panel_a_df


In [None]:
print(df)

In [None]:
# Calculate the pairwise correlation of the specified columns, excluding NaN values
correlation_matrix = df[['ME', 'MOM', 'SHRCHG', 'RET']].corr()

# Display the correlation matrix
print(correlation_matrix)

In [None]:

# Create lagged columns for the variables you're interested in.
# Here, we're creating a 1-month lag for illustration. You'll adjust this according to your data.
df['ME_lag1'] = df['ME'].shift(1)  # Lag by 1 period
df['MOM_lag1'] = df['MOM'].shift(1)  # Lag by 1 period
df['SHRCHG_lag1'] = df['SHRCHG'].shift(1)  # Lag by 1 period
df['RET_lag1'] = df['RET'].shift(1)  # Lag by 1 period

# You can create more lagged columns as needed for your analysis.

# Drop the initial rows where any lagged values would be NaN due to the shift.
df = df.dropna(subset=['ME_lag1', 'MOM_lag1', 'SHRCHG_lag1', 'RET_lag1'])

# Calculate the non-contemporaneous correlation matrix between the current and lagged values.
non_contemporaneous_correlation_matrix = df[['ME', 'ME_lag1', 'MOM', 'MOM_lag1', 'SHRCHG', 'SHRCHG_lag1', 'RET', 'RET_lag1']].corr()

# Display the non-contemporaneous correlation matrix
print(non_contemporaneous_correlation_matrix)

In [None]:
#Table 3
import statsmodels.api as sm
from statsmodels.regression.linear_model import OLS

# 进行Fama-MacBeth回归分析
# 首先定义自变量和因变量
y = df['RET']  # 因变量为回报率
X = df[['ME', 'MOM']]  # 自变量为市场价值、动量
X = sm.add_constant(X)  # 添加常数项

# 进行回归分析
model = OLS(y, X).fit()

# 输出回归系数的结果
print(model.summary())

# 展示包含市场价值（ME）和动量（MOM）的数据框的前几行
print(df[['PERMNO', 'PRC', 'SHROUT', 'ME', 'RET', 'MOM']].head())


In [None]:
#Table 4

# Calculate rolling returns for different periods
df['RET_1M'] = df.groupby('PERMNO')['RET'].rolling(window=21).sum().reset_index(level=0, drop=True)
df['RET_1Q'] = df.groupby('PERMNO')['RET'].rolling(window=63).sum().reset_index(level=0, drop=True)
df['RET_6M'] = df.groupby('PERMNO')['RET'].rolling(window=126).sum().reset_index(level=0, drop=True)
df['RET_1Y'] = df.groupby('PERMNO')['RET'].rolling(window=252).sum().reset_index(level=0, drop=True)

# Define a function to calculate cumulative return as momentum
def calculate_momentum(returns_series, lookback_periods=6):
    if len(returns_series) < lookback_periods + 2:
        return np.nan
    past_returns = returns_series[-(lookback_periods + 2):-2]
    momentum = (1 + past_returns).prod() - 1
    return momentum

# Ensure there are no zero or negative numbers in the SHROUT column
df = df[df['SHROUT'] > 0]

# Calculate stock change amount (SHRCHG)
df['Log_SHROUT'] = np.log(df['SHROUT'])
df['SHRCHG'] = df['Log_SHROUT'].diff(-6) - df['Log_SHROUT'].diff(-17)

# Remove rows with NaN
df.dropna(subset=['RET_1M', 'RET_1Q', 'RET_6M', 'RET_1Y', 'ME', 'MOM', 'SHRCHG'], inplace=True)

# Independent variables
X = df[['ME', 'MOM', 'SHRCHG']]
X = sm.add_constant(X)  # Add a constant term

# Regression for 1-month return
y_1M = df['RET_1M']
model_1M = sm.OLS(y_1M, X).fit()
print("Regression results for 1-month return:")
print(model_1M.summary())

# Regression for 1-quarter return
y_1Q = df['RET_1Q']
model_1Q = sm.OLS(y_1Q, X).fit()
print("\nRegression results for 1-quarter return:")
print(model_1Q.summary())

# Regression for 6-month return
y_6M = df['RET_6M']
model_6M = sm.OLS(y_6M, X).fit()
print("\nRegression results for 6-month return:")
print(model_6M.summary())

# Regression for 1-year return
y_1Y = df['RET_1Y']
model_1Y = sm.OLS(y_1Y, X).fit()
print("\nRegression results for 1-year return:")
print(model_1Y.summary())

# Display the first few rows of the dataframe
print("\nFirst few rows of the dataframe:")
print(df[['PERMNO', 'PRC', 'SHROUT', 'ME', 'RET', 'MOM', 'SHRCHG']].head())