In [1]:
import pandas as pd

def fill_date_gaps(df):
    filled_df = pd.DataFrame()

    for ticker in df['TICKER'].unique():
        # Filter the dataframe for the current ticker
        ticker_df = df[df['TICKER'] == ticker]

        # Ensure 'DATE' is in the correct format
        ticker_df['DATE'] = pd.to_datetime(ticker_df['DATE'])

        # Create a date range for the current ticker from start to end of available data
        ticker_dates = pd.date_range(start=ticker_df['DATE'].min(), end=ticker_df['DATE'].max(), freq='D')
        
        # Create a new DataFrame with this date range
        ticker_full_df = pd.DataFrame(ticker_dates, columns=['DATE'])
        ticker_full_df['TICKER'] = ticker  # Add the ticker column
        
        # Merge with the existing data, this will introduce NaNs for days without data
        ticker_full_df = ticker_full_df.merge(ticker_df, on=['DATE', 'TICKER'], how='left')
        
        # Forward fill the NaN values
        ticker_full_df.fillna(method='ffill', inplace=True)

        # Append the processed data for this ticker to the filled_df DataFrame
        filled_df = filled_df.append(ticker_full_df, ignore_index=True)

    return filled_df

# Correct usage
file_path = "C:/Users/zhang/Downloads/CFAR/10.26/data/2018-2022.csv"
df = pd.read_csv(file_path)
filled_df = fill_date_gaps(df)
print(filled_df.head())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ticker_df['DATE'] = pd.to_datetime(ticker_df['DATE'])
  filled_df = filled_df.append(ticker_full_df, ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ticker_df['DATE'] = pd.to_datetime(ticker_df['DATE'])
  filled_df = filled_df.append(ticker_full_df, ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.htm

        DATE TICKER  INDEX    DLYRET          CAP     SLOPE
0 2018-01-02      A    0.0  0.009407  21836016.80  0.002514
1 2018-01-03      A    1.0  0.025444  22391607.76 -0.005664
2 2018-01-04      A    2.0 -0.007501  22223638.40 -0.006376
3 2018-01-05      A    3.0  0.015988  22578958.20  0.000637
4 2018-01-06      A    3.0  0.015988  22578958.20  0.000637


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ticker_df['DATE'] = pd.to_datetime(ticker_df['DATE'])
  filled_df = filled_df.append(ticker_full_df, ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ticker_df['DATE'] = pd.to_datetime(ticker_df['DATE'])
  filled_df = filled_df.append(ticker_full_df, ignore_index=True)


In [1]:
filled_df.to_csv('filled.csv', index = False, encoding='utf-8')

NameError: name 'filled_df' is not defined

In [208]:
import pandas as pd
df = pd.read_csv("C:/Users/zhang/Downloads/CFAR/10.26/filled.csv")

In [194]:
start_date = '2018-01-01'
end_date = '2019-12-30'
df =  df[(df['DATE'] >= start_date) & (df['DATE'] <= end_date)]

In [209]:
df.drop(columns=['INDEX'], inplace=True)

In [210]:
df

Unnamed: 0,DATE,TICKER,DLYRET,CAP,SLOPE
0,2018-01-02,A,0.009407,21836016.80,0.002514
1,2018-01-03,A,0.025444,22391607.76,-0.005664
2,2018-01-04,A,-0.007501,22223638.40,-0.006376
3,2018-01-05,A,0.015988,22578958.20,0.000637
4,2018-01-06,A,0.015988,22578958.20,0.000637
...,...,...,...,...,...
1556472,2022-01-15,CPSH,0.008571,50655.50,0.183980
1556473,2022-01-16,CPSH,0.008571,50655.50,0.183980
1556474,2022-01-17,CPSH,0.008571,50655.50,0.183980
1556475,2022-01-18,CPSH,-0.039660,48646.50,-0.219354


In [211]:
def calculate_weekly_return(df):
    """
    Calculate the weekly return for each ticker and shift the 'DATE' forward by one day.
    """
    # Ensure 'DATE' is in the correct datetime format
    df['DATE'] = pd.to_datetime(df['DATE'])

    # Sort the DataFrame by 'TICKER' and 'DATE'
    df.sort_values(by=['TICKER', 'DATE'], inplace=True)

    # Define a function to calculate the weekly return
    def calc_weekly_ret(sub_df):
        weekly_ret = (sub_df['DLYRET'] + 1).prod() - 1
        return pd.Series([weekly_ret], index=['WEEKLYRET'])

    # Calculate the weekly return for each ticker
    weekly_return_df = df.groupby('TICKER').resample('W', on='DATE').apply(calc_weekly_ret)

    # Reset index to make 'TICKER' and 'DATE' columns again
    weekly_return_df = weekly_return_df.reset_index()

    # Shift 'DATE' forward by one day
    weekly_return_df['DATE'] = weekly_return_df['DATE'] + pd.Timedelta(days=1)

    return weekly_return_df

# Usage
weekly_returns = calculate_weekly_return(df)
print(weekly_returns.head())






In [198]:
def get_previous_week_slope(df):
    """
    Get the slope value from the last day of the previous week for each ticker.
    """
    df['DATE'] = pd.to_datetime(df['DATE'])
    df['WEEK'] = df['DATE'].dt.to_period('W')

    # Shift the slope data by 1 day to get the previous week's last day slope
    df['PREV_WEEK_SLOPE'] = df.groupby('TICKER')['SLOPE'].shift(1)

    # Retain only the first entry for each ticker for each week
    last_day_slope = df.drop_duplicates(subset=['TICKER', 'WEEK'])

    return last_day_slope


df = get_previous_week_slope(df)


In [199]:
df

Unnamed: 0,DATE,TICKER,DLYRET,CAP,SLOPE,WEEK,PREV_WEEK_SLOPE
0,2018-01-02,A,0.009407,21836016.80,0.002514,2018-01-01/2018-01-07,
6,2018-01-08,A,0.002146,22627410.90,-0.003199,2018-01-08/2018-01-14,0.000637
13,2018-01-15,A,0.013136,23170081.14,0.002108,2018-01-15/2018-01-21,0.002108
20,2018-01-22,A,0.005611,23735362.64,-0.002766,2018-01-22/2018-01-28,0.001399
27,2018-01-29,A,-0.003876,24074531.54,-0.004535,2018-01-29/2018-02-04,0.006510
...,...,...,...,...,...,...,...
1555029,2019-12-02,CPS,0.009849,483447.69,0.019703,2019-12-02/2019-12-08,-0.006968
1555036,2019-12-09,CPS,-0.003780,488162.61,-0.008084,2019-12-09/2019-12-15,0.002229
1555043,2019-12-16,CPS,0.015251,493214.31,-0.026620,2019-12-16/2019-12-22,0.013270
1555050,2019-12-23,CPS,-0.000301,559728.36,0.072611,2019-12-23/2019-12-29,0.124623


In [200]:
df = df.dropna(subset=['PREV_WEEK_SLOPE'])
print(df)

              DATE TICKER    DLYRET          CAP     SLOPE  \
6       2018-01-08      A  0.002146  22627410.90 -0.003199   
13      2018-01-15      A  0.013136  23170081.14  0.002108   
20      2018-01-22      A  0.005611  23735362.64 -0.002766   
27      2018-01-29      A -0.003876  24074531.54 -0.004535   
34      2018-02-05      A -0.042526  22036287.96 -0.049154   
...            ...    ...       ...          ...       ...   
1555029 2019-12-02    CPS  0.009849    483447.69  0.019703   
1555036 2019-12-09    CPS -0.003780    488162.61 -0.008084   
1555043 2019-12-16    CPS  0.015251    493214.31 -0.026620   
1555050 2019-12-23    CPS -0.000301    559728.36  0.072611   
1555057 2019-12-30    CPS -0.006477    542384.19 -0.002912   

                          WEEK  PREV_WEEK_SLOPE  
6        2018-01-08/2018-01-14         0.000637  
13       2018-01-15/2018-01-21         0.002108  
20       2018-01-22/2018-01-28         0.001399  
27       2018-01-29/2018-02-04         0.006510  
34   

In [201]:
def rank_tickers_by_slope(df):
    """
    Rank the tickers based on the previous week's last day slope and assign deciles.
    """
    # Rank the slopes and assign deciles
    df['DECILE'] = df.groupby('WEEK')['PREV_WEEK_SLOPE'].transform(
        lambda x: pd.qcut(x.rank(method='first'), 5, labels=False) + 1)

    return df

# Usage
# Assuming df is your DataFrame with 'WEEK' and 'PREV_WEEK_SLOPE' columns
ranked_df = rank_tickers_by_slope(df)
print(ranked_df.head())





         DATE TICKER    DLYRET          CAP     SLOPE                   WEEK  \
6  2018-01-08      A  0.002146  22627410.90 -0.003199  2018-01-08/2018-01-14   
13 2018-01-15      A  0.013136  23170081.14  0.002108  2018-01-15/2018-01-21   
20 2018-01-22      A  0.005611  23735362.64 -0.002766  2018-01-22/2018-01-28   
27 2018-01-29      A -0.003876  24074531.54 -0.004535  2018-01-29/2018-02-04   
34 2018-02-05      A -0.042526  22036287.96 -0.049154  2018-02-05/2018-02-11   

    PREV_WEEK_SLOPE  DECILE  
6          0.000637       3  
13         0.002108       3  
20         0.001399       3  
27         0.006510       3  
34        -0.010457       3  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DECILE'] = df.groupby('WEEK')['PREV_WEEK_SLOPE'].transform(


In [202]:
df = ranked_df

In [188]:
def calculate_portfolio_returns(df, weekly_returns):
    """
    Calculate the portfolio return based on deciles and return a DataFrame with dates as datetime and returns.
    """
    # Merge the slope ranks with the weekly returns
    merged_df = pd.merge(df, weekly_returns, on=['TICKER', 'DATE'])

    # Calculate the average return for the lowest and highest deciles
    decile_returns = merged_df.groupby(['WEEK', 'DECILE'])['WEEKLYRET'].mean().unstack()

    # Portfolio return is the difference between low and high decile returns
    portfolio_return = (decile_returns[1] - decile_returns[5])

    # Create a DataFrame for the result
    result_df = pd.DataFrame({
        'DATE': decile_returns.index.to_series().dt.start_time,
        'PORTFOLIO_RETURN': portfolio_return
    }).reset_index(drop=True)

    return result_df

# Usage
portfolio_returns_df = calculate_portfolio_returns(df, weekly_returns)
print(portfolio_returns_df.head())






        DATE  PORTFOLIO_RETURN
0 2022-01-03         -0.005449
1 2022-01-10         -0.002209
2 2022-01-17          0.000539
3 2022-01-24         -0.018970
4 2022-01-31         -0.036932


In [204]:
df_rf = pd.read_csv("C:/Users/zhang/Downloads/CFAR/10.26/F-F_Research_Data_Factors_weekly.csv",skiprows = 3)

df_rf = df_rf.rename(columns ={ 'Unnamed: 0': "DATE"})

df_rf = df_rf[['DATE','RF']]

# Clean DATE column
df_rf['DATE'] = df_rf['DATE'].str.strip()  # Remove leading and trailing spaces

# Fill NaN values with placeholder
placeholder = "INVALID"
df_rf['DATE'].fillna(placeholder, inplace=True)

# Keep only rows where 'DATE' has numeric values
df_rf = df_rf[df_rf['DATE'].str.isnumeric()]

df_rf['DATE'] = pd.to_datetime(df_rf['DATE'], format='%Y%m%d', errors='coerce')
df_rf['DATE'] = df_rf['DATE'] - pd.Timedelta(days=4)


In [205]:
import pandas as pd
from scipy import stats

# Merge the dataframes on the 'DATE' column
merged_df = pd.merge(portfolio_returns_df, df_rf, on='DATE', how='inner')


In [206]:
# Merge the DataFrames on 'DATE'
merged_df = pd.merge(portfolio_returns_df, df_rf, on='DATE', how='inner')

# Calculate excess returns (portfolio return - risk-free rate)
merged_df['EXCESS_RETURN'] = merged_df['PORTFOLIO_RETURN'] - merged_df['RF']


In [207]:
# Perform t-test
t_stat, p_value = stats.ttest_ind(merged_df['PORTFOLIO_RETURN'], merged_df['RF'], nan_policy='omit')

print(f"T-statistic: {t_stat}, P-value: {p_value}")

# Annualize the average excess return and standard deviation
annual_factor = 52  # Number of weeks in a year

average_excess_return_weekly = merged_df['EXCESS_RETURN'].mean()
std_dev_excess_return_weekly = merged_df['EXCESS_RETURN'].std()

average_excess_return_annualized = average_excess_return_weekly * annual_factor
std_dev_excess_return_annualized = std_dev_excess_return_weekly * (annual_factor ** 0.5)

# Calculate Sharpe Ratio
sharpe_ratio = average_excess_return_annualized / std_dev_excess_return_annualized

mean = portfolio_returns_df.mean()

print(f"weekly mean: {mean}")
print(f"Annualized Sharpe Ratio: {sharpe_ratio}")


T-statistic: -1.6198272437322376, P-value: 0.10841885949176011
weekly mean: PORTFOLIO_RETURN   -0.130161
dtype: float64
Annualized Sharpe Ratio: -1.6192953665378411


  mean = portfolio_returns_df.mean()
