# Sheet

In [None]:
import pandas as pd

In [None]:
merged_data_2007_to_2012 = pd.read_csv("/data/workspace_files/lazy_price_replication/10k_final_with_ticker_name_filtered_w_similarity.csv", index_col=0)
ticker_prices = pd.read_csv("/data/workspace_files/lazy_price_replication/all_ticker_prices.csv", index_col=0)

In [None]:
data_sent  = pd.read_csv('/data/workspace_files/lazy_price_replication/10k_final_with_ticker_name_filtered_w_similarity_and_sentiment.csv')
data_sent.columns

#### Data Description
Each row represents the data for one ticker with the following information:
1. cik: the cik of the stock
2. report_period_end_date{year}: the ending period that the 10k report is for
3. file_date{year}: the date the 10k report was filed and became public. NOTE: for a given report_period_end_date for the same year (e.g. 2007), the file_date could be next year (e.g. 2008)
4. statement{year}: the actual content of the 10k statement
5. company_name: the company's name
6. sic: the SIC (Standard Industrial Classification) code for the company
7. form_type: the given form type
8. cusip_full: the 9-digit CUSIP for the company
9. cusip: the 8-digit CUSIP for the company
10. ticker: the ticker name for the company

## Cosine Similarity

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def consine_similarity(input1: str, input2: str)->float:
    """Return the TF-IDF cosine similarity between two text documents.

    A fresh ``TfidfVectorizer`` is fitted on just the two inputs, so the
    vocabulary and the IDF weights come from this pair only.

    Args:
        input1: First document.
        input2: Second document.

    Returns:
        The single entry of the 1x1 cosine-similarity matrix computed between
        the two documents' TF-IDF rows.
    """
    # Row 0 of the matrix is input1, row 1 is input2
    tfidf = TfidfVectorizer().fit_transform([input1, input2])
    first_doc, second_doc = tfidf[0:1], tfidf[1:2]

    # cosine_similarity returns a 2-D array; pull out its only element
    return cosine_similarity(first_doc, second_doc)[0][0]

In [None]:
# Define the two documents as strings
doc1 = "We expect demand to increase"
doc2 = "We expect worldwide demand to increase"
doc3 = "We expect weakness in sales"

In [None]:
similarity_score_1 = consine_similarity(doc1, doc3)
print(f"Cosine Similarity between the documents: {similarity_score_1}")

## Jaccard Similarity

In [None]:
import re

In [None]:
def preprocess(text: str)->str:
    r"""Lower-case ``text`` and replace each run of non-word characters with one space.

    "Non-word" is whatever the regex class ``\W`` matches, so punctuation and
    whitespace runs both collapse to a single space. Leading/trailing runs
    leave a leading/trailing space; callers that ``split()`` are unaffected.

    Args:
        text: Raw document text.

    Returns:
        The normalised text.
    """
    return re.sub(r'\W+', ' ', text.lower())

def jaccard_similarity(input1: str, input2: str)->float:
    """Return the Jaccard similarity of the two documents' unique-word sets.

    Both inputs are normalised with ``preprocess`` and split on whitespace.
    The score is ``|A & B| / |A | B|``.

    Args:
        input1: First document.
        input2: Second document.

    Returns:
        A value in ``[0, 1]``. When neither document contains any word token
        (both sets empty) the ratio is 0/0; the documents are then treated as
        identical and ``1.0`` is returned instead of raising
        ``ZeroDivisionError``.
    """
    set1 = set(preprocess(input1).split())
    set2 = set(preprocess(input2).split())

    union = set1 | set2
    if not union:
        # Two token-less documents: avoid dividing by zero
        return 1.0

    return len(set1 & set2) / len(union)

In [None]:
jaccard_score_1 = jaccard_similarity(doc1, doc2)
print(f"Jaccard Similarity between the documents: {jaccard_score_1}")

## Min Edit Distance Similarity

In [None]:
import Levenshtein

In [None]:
distance = Levenshtein.distance(doc1, doc2)
print(f"Levenshtein Distance: {distance}")

### Populate similarity for different approaches

In [None]:
from enum import Enum

In [None]:
class SimilarityMethod(Enum):
    """Text-similarity measures that ``compute_similary`` can populate.

    Each member's value is the prefix of the column that gets written:
    ``f"{value}_similarity_{year1}_to_{year2}"``.
    """
    # NOTE: "consine" is a misspelling of "cosine", but the value is part of
    # column names that other cells read (e.g. 'consine_similarity_2007_to_2008'),
    # so it must not be corrected here without renaming those columns too.
    COSINE = "consine"
    JACCARD = "jaccard"
    EDIT_DISTANCE = "min_edit_distance"

In [None]:
def compute_similary(similary_method: SimilarityMethod)->None:
    """Add year-over-year similarity columns to ``merged_data_2007_to_2012``.

    For every consecutive pair of years from 2007 to 2012 the chosen measure
    is applied row by row to ``statement{year1}`` and ``statement{year2}``
    (both passed through ``str``), and the result is stored in the column
    ``f"{similary_method.value}_similarity_{year1}_to_{year2}"`` of the
    module-level frame ``merged_data_2007_to_2012`` (mutated in place).

    Args:
        similary_method: Which measure to compute.

    Raises:
        ValueError: If ``similary_method`` is not one of the supported members.
            (The original ``elif similary_method.EDIT_DISTANCE:`` test was
            always truthy, so this branch could never be reached.)
    """
    # Resolve the scoring function once, up front, with real equality checks
    if similary_method == SimilarityMethod.COSINE:
        score = consine_similarity
    elif similary_method == SimilarityMethod.JACCARD:
        score = jaccard_similarity
    elif similary_method == SimilarityMethod.EDIT_DISTANCE:
        score = Levenshtein.distance
    else:
        raise ValueError(f"unsupported type: {similary_method}")

    years = list(range(2007, 2013))
    for year1, year2 in zip(years, years[1:]):
        col_name = f"{similary_method.value}_similarity_{year1}_to_{year2}"
        print(f"processing {year1} to {year2} for {similary_method} with {col_name=}")
        # str() turns missing statements (NaN) into the text "nan" so the scorers always get strings
        merged_data_2007_to_2012[col_name] = merged_data_2007_to_2012.apply(
            lambda row: score(str(row[f"statement{year1}"]), str(row[f"statement{year2}"])),
            axis=1,
        )

In [None]:
compute_similary(SimilarityMethod.COSINE)

In [None]:
compute_similary(SimilarityMethod.JACCARD)

In [None]:
compute_similary(SimilarityMethod.EDIT_DISTANCE)

In [None]:
data_sim = pd.read_csv('/data/workspace_files/lazy_price_replication/10k_final_unfiltered.csv')

In [None]:
data_sim.columns

# Sheet 2

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
data_sim = pd.read_csv('/data/workspace_files/lazy_price_replication/10k_final_with_ticker_name_filtered_w_similarity.csv')

In [None]:
text = data_sim['statement2007'].to_list()

In [None]:
data_sim = data_sim.dropna(subset=['ticker'])

In [None]:
# Names of the per-year filing-date columns (2007 through 2012)
date_columns = [f'file_date{year}' for year in range(2007, 2013)]

# Parse every column's YYYYMMDD values and pool them into one flat list;
# values that do not parse become NaT because of errors='coerce'
all_dates = [
    parsed_date
    for col in date_columns
    for parsed_date in pd.to_datetime(data_sim[col].to_list(), format='%Y%m%d', errors='coerce')
]

In [None]:
# Names of the per-year reporting-period-end columns (2007 through 2012)
date_columns = [f'report_period_end_date{year}' for year in range(2007, 2013)]

# Parse every column's YYYYMMDD values and pool them into one flat list;
# values that do not parse become NaT because of errors='coerce'
all_dates_rep = [
    parsed_date
    for col in date_columns
    for parsed_date in pd.to_datetime(data_sim[col].to_list(), format='%Y%m%d', errors='coerce')
]

In [None]:
# One-column frame of the pooled filing dates, plus calendar year and quarter period
dates_df = pd.DataFrame(all_dates, columns=['date'])
dates_df['year'] = dates_df['date'].dt.year
dates_df['quarter'] = dates_df['date'].dt.to_period('Q')  # e.g. 2007Q1, 2007Q2, ...

# Filings per quarter, in chronological order
quarter_counts = dates_df['quarter'].value_counts().sort_index()

# Bar chart of the counts
fig, ax = plt.subplots(figsize=(10, 6))
quarter_counts.plot(kind='bar', color='skyblue', edgecolor='black', ax=ax)
ax.set_title('Number of Filings per Quarter (2007-2012)', fontsize=16)
ax.set_xlabel('Quarter', fontsize=12)
ax.set_ylabel('Number of Dates', fontsize=12)
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()

plt.show()

In [None]:
# One-column frame of the pooled reporting-period end dates, plus calendar year and quarter period
dates_df = pd.DataFrame(all_dates_rep, columns=['date'])
dates_df['year'] = dates_df['date'].dt.year
dates_df['quarter'] = dates_df['date'].dt.to_period('Q')  # e.g. 2007Q1, 2007Q2, ...

# Reporting periods per quarter, in chronological order
quarter_counts = dates_df['quarter'].value_counts().sort_index()

# Bar chart of the counts
fig, ax = plt.subplots(figsize=(10, 6))
quarter_counts.plot(kind='bar', color='skyblue', edgecolor='black', ax=ax)
ax.set_title('Reporting period per Quarter (2007-2012)', fontsize=16)
ax.set_xlabel('Quarter', fontsize=12)
ax.set_ylabel('Number of Dates', fontsize=12)
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()

plt.show()

In [None]:
import pandas as pd
import numpy as np

# Filing years to analyse; year Y is ranked on the `*_similarity_{Y-1}_to_{Y}` columns
years = [2008, 2009, 2010, 2011, 2012]


def get_quintiles(sorted_df, col_name):
    """Return ``col_name`` values from the first and the last 20% of ``sorted_df``.

    Args:
        sorted_df: Frame already sorted by the ranking metric.
        col_name: Column whose values are collected (here: 'ticker').

    Returns:
        ``(top_quintile, bottom_quintile)`` as lists. "Top" means the FIRST
        ``ceil(0.2 * n)`` rows and "bottom" the LAST ones, so with an ascending
        sort "top" holds the smallest metric values. For very small frames the
        two lists can overlap.
    """
    quintile_size = int(np.ceil(len(sorted_df) * 0.20))
    top_quintile = sorted_df.head(quintile_size)[col_name].to_list()
    bottom_quintile = sorted_df.tail(quintile_size)[col_name].to_list()
    return top_quintile, bottom_quintile


def _rank_by_similarity(frame, column):
    """Sort ``frame`` ascending by ``column`` after dropping rows where it is NaN.

    ``sort_values`` places NaN last by default, which would otherwise push
    rows without a similarity score into the bottom quintile.
    """
    return frame.dropna(subset=[column]).sort_values(by=column, ascending=True)


# One dict per (year, filing quarter) is collected here and turned into a frame at the end
data_list = []

for year in years:
    # Parse this year's filing dates (YYYYMMDD) so they can be compared with Timestamps
    data_sim[f'file_date{year}'] = pd.to_datetime(data_sim[f'file_date{year}'], format='%Y%m%d')

    # Filing window -> the following calendar quarter, in which the positions are held.
    # NOTE(review): the Q1 window starts on Dec 31 of the previous year, one day
    # earlier than the calendar quarter -- confirm this is intended.
    quarters = [
        {'start': f'{year-1}-12-31', 'end': f'{year}-03-31',
         'trading_start': f'{year}-04-01', 'trading_end': f'{year}-06-30', 'trading_quarter': 'Q2'},  # Q1
        {'start': f'{year}-04-01', 'end': f'{year}-06-30',
         'trading_start': f'{year}-07-01', 'trading_end': f'{year}-09-30', 'trading_quarter': 'Q3'},  # Q2
        {'start': f'{year}-07-01', 'end': f'{year}-09-30',
         'trading_start': f'{year}-10-01', 'trading_end': f'{year}-12-31', 'trading_quarter': 'Q4'},  # Q3
        {'start': f'{year}-10-01', 'end': f'{year}-12-31',
         'trading_start': f'{year+1}-01-01', 'trading_end': f'{year+1}-03-31', 'trading_quarter': 'Q1'},  # Q4
    ]

    for quarter in quarters:
        start_date = pd.to_datetime(quarter['start'])
        end_date = pd.to_datetime(quarter['end'])
        trading_start = pd.to_datetime(quarter['trading_start'])
        trading_end = pd.to_datetime(quarter['trading_end'])
        trading_quarter = quarter['trading_quarter']

        # Companies whose filing for `year` was published inside the window (both ends inclusive)
        filed_in_window = data_sim[f'file_date{year}'].between(start_date, end_date)
        filtered_data = data_sim[filed_in_window]

        # Rank the filers by each similarity measure, lowest value first
        sorted_data_consine = _rank_by_similarity(filtered_data, f'consine_similarity_{year-1}_to_{year}')
        sorted_data_jac = _rank_by_similarity(filtered_data, f'jaccard_similarity_{year-1}_to_{year}')
        sorted_data_min_edit = _rank_by_similarity(filtered_data, f'min_edit_distance_similarity_{year-1}_to_{year}')

        # Tickers in the first / last 20% of each ranking
        top_consine, bottom_consine = get_quintiles(sorted_data_consine, 'ticker')
        top_jac, bottom_jac = get_quintiles(sorted_data_jac, 'ticker')
        top_min_edit, bottom_min_edit = get_quintiles(sorted_data_min_edit, 'ticker')

        # Label of the holding quarter, e.g. '2008-Q2'; matches the 'quarter' key of the returns table
        quarter_str = f"{trading_start.year}-{trading_quarter}"

        data_dict = {
            'year': year,
            'quarter': quarter_str,
            'start_date': start_date,
            'end_date': end_date,
            'trading_start': trading_start,
            'trading_end': trading_end,
            'Top Quintile Consine Similarity': top_consine,
            'Bottom Quintile Consine Similarity': bottom_consine,
            'Top Quintile Jaccard Similarity': top_jac,
            'Bottom Quintile Jaccard Similarity': bottom_jac,
            'Top Quintile Min Edit Distance Similarity': top_min_edit,
            'Bottom Quintile Min Edit Distance Similarity': bottom_min_edit
        }
        data_list.append(data_dict)

# One row per (year, filing quarter)
result_df = pd.DataFrame(data_list)

# Display the resulting DataFrame
result_df

In [None]:
price_data = pd.read_csv('/data/workspace_files/lazy_price_replication/all_ticker_prices.csv', index_col=0)

# Parse the trading date and derive the calendar year / quarter used as group keys
price_data['Date'] = pd.to_datetime(price_data['Date'])
price_data['year'] = price_data['Date'].dt.year
price_data['quarter'] = price_data['Date'].dt.quarter

# Quarterly return = (last close - first close) / first close within each
# (ticker, year, quarter). Rows are sorted by date first so "first" and "last"
# do not depend on the order of the CSV; only the Close column is handed to the
# lambda, so the grouping columns are not passed through apply.
quarterly_returns = (
    price_data.sort_values(['ticker', 'Date'])
    .groupby(['ticker', 'year', 'quarter'])['Close']
    .apply(lambda close: (close.iloc[-1] - close.iloc[0]) / close.iloc[0])
    .reset_index(name='quarterly_return')
)

# Replace the numeric quarter with a label such as '2008-Q2'
quarterly_returns['quarter'] = quarterly_returns['year'].astype(str) + '-Q' + quarterly_returns['quarter'].astype(str)

# Keep only the columns the portfolio cells look up
quarterly_prices = quarterly_returns[['ticker', 'quarter', 'quarterly_return']]

# Alias used by the portfolio cells
price_df = quarterly_prices

In [None]:
import pandas as pd

# Inputs: `result_df` (one row per holding quarter with the quintile ticker lists)
# and `price_df` (ticker / quarter / quarterly_return).

# One dict per quarter for which both legs have at least one return
portfolio_returns = []

for index, row in result_df.iterrows():
    # Long the 'Top' list, short the 'Bottom' list of the cosine ranking
    long_stocks = row['Top Quintile Consine Similarity']
    short_stocks = row['Bottom Quintile Consine Similarity']

    # Returns realised in this row's holding quarter, ignoring missing values
    in_quarter = price_df['quarter'] == row['quarter']
    long_returns = price_df[price_df['ticker'].isin(long_stocks) & in_quarter]['quarterly_return'].dropna()
    short_returns = price_df[price_df['ticker'].isin(short_stocks) & in_quarter]['quarterly_return'].dropna()

    if len(long_returns) > 0 and len(short_returns) > 0:
        # Equal-weight each leg over the tickers that actually have a return.
        # Dividing the sum by the size of the ticker list instead would treat
        # every ticker without price data as a 0% return and shrink the leg.
        portfolio_return = long_returns.mean() - short_returns.mean()
        portfolio_returns.append({
            'quarter': row['quarter'],
            'portfolio_return': portfolio_return
        })

# Create a DataFrame from the portfolio returns
portfolio_df = pd.DataFrame(portfolio_returns)

# Display the resulting portfolio returns DataFrame
print(portfolio_df)

In [None]:

# Get the tickers for long and short positions
long_stocks = result_df['Top Quintile Jaccard Similarity'].iloc[0]
short_stocks = result_df['Bottom Quintile Jaccard Similarity'].iloc[0]

# Get the returns for the long stocks
long_returns = price_df[price_df['ticker'].isin(long_stocks) & (price_df['quarter'] == result_df['quarter'].iloc[0])]['quarterly_return']

# Get the returns for the short stocks
short_returns = price_df[price_df['ticker'].isin(short_stocks) & (price_df['quarter'] == result_df['quarter'].iloc[0])]['quarterly_return']

In [None]:
print(long_returns)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Starting capital for the simulated account
initial_investment = 10000  # or any amount you want

# Compound the per-quarter returns, then express the result as account value
compounded_growth = (1 + portfolio_df['portfolio_return']).cumprod()
portfolio_df['cumulative_return'] = compounded_growth - 1
portfolio_df['account_value'] = initial_investment * (1 + portfolio_df['cumulative_return'])

# Line chart of the account value per quarter
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(portfolio_df['quarter'], portfolio_df['account_value'], marker='o', linestyle='-', color='blue')
ax.set_title('Portfolio Account Value Over Time')
ax.set_xlabel('Quarter')
ax.set_ylabel('Account Value ($)')
ax.tick_params(axis='x', labelrotation=45)
ax.grid()
fig.tight_layout()
plt.show()

In [None]:
(10389.210944304268/10000)-1

In [None]:
portfolio_df['account_value']

# JG space

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
data_sim = pd.read_csv('/data/workspace_files/lazy_price_replication/10k_final_with_ticker_name_filtered_w_similarity.csv')

In [None]:
data_sim

In [None]:
# Names of the per-year filing-date columns (2007 through 2012)
date_columns = [f'file_date{year}' for year in range(2007, 2013)]

# Parse every column's YYYYMMDD values and pool them into one flat list;
# values that do not parse become NaT because of errors='coerce'
all_dates = [
    parsed_date
    for col in date_columns
    for parsed_date in pd.to_datetime(data_sim[col].to_list(), format='%Y%m%d', errors='coerce')
]

In [None]:
# Names of the per-year reporting-period-end columns (2007 through 2012)
date_columns = [f'report_period_end_date{year}' for year in range(2007, 2013)]

# Parse every column's YYYYMMDD values and pool them into one flat list;
# values that do not parse become NaT because of errors='coerce'
all_dates_rep = [
    parsed_date
    for col in date_columns
    for parsed_date in pd.to_datetime(data_sim[col].to_list(), format='%Y%m%d', errors='coerce')
]

In [None]:
temp = dates_df['date'].dt.quarter.value_counts()
labels = [f"Q{quarter}" for quarter in temp.index]

In [None]:
# Number of dates falling in each calendar quarter (1-4), most frequent first.
# NOTE(review): `dates_df` is not built in this section before this cell; on a
# top-to-bottom run it is whatever an earlier section left behind -- confirm
# which set of dates this chart is meant to show.
temp = dates_df['date'].dt.quarter.value_counts()

# Derive the labels from this very Series so every wedge is named after the
# quarter it counts. int() keeps the label as 'Q1' even when the quarter
# numbers are floats, which happens when the date column contains NaT.
labels = [f"Q{int(quarter)}" for quarter in temp.index]
colors = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99']

# Create the pie chart
plt.figure(figsize=(6, 6))  # Size of the chart
plt.pie(temp.values, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)

# Equal aspect ratio ensures that pie is drawn as a circle.
plt.axis('equal')

# Add a title
plt.title('Distribution of reports in every quarter')

# Display the chart
plt.show()

In [None]:
# One-column frame of the pooled filing dates, plus calendar year and quarter period
dates_df = pd.DataFrame(all_dates, columns=['date'])
dates_df['year'] = dates_df['date'].dt.year
dates_df['quarter'] = dates_df['date'].dt.to_period('Q')  # e.g. 2007Q1, 2007Q2, ...

# Filings per quarter, in chronological order
quarter_counts = dates_df['quarter'].value_counts().sort_index()

# Bar chart of the counts
fig, ax = plt.subplots(figsize=(10, 6))
quarter_counts.plot(kind='bar', color='skyblue', edgecolor='black', ax=ax)
ax.set_title('Number of Filings per Quarter (2007-2012)', fontsize=16)
ax.set_xlabel('Quarter', fontsize=12)
ax.set_ylabel('Number of Dates', fontsize=12)
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()

plt.show()

In [None]:
price_data = pd.read_csv("/data/workspace_files/lazy_price_replication/all_ticker_prices.csv", index_col=0)

# Day-over-day return computed per ticker. Without the groupby, the first row
# of each ticker would be compared with the previous ticker's last close.
price_data['returns'] = price_data.groupby('ticker')['Close'].pct_change()

# Drop rows whose absolute return is 90 (= 9000%) or more, but keep each
# ticker's first row, whose return is NaN because there is no previous close.
# .copy() gives later cells an independent frame they can add columns to.
price_data = price_data[price_data['returns'].isna() | (price_data['returns'].abs() < 90)].copy()

In [None]:
import pandas as pd

# `price_data` is expected to provide 'Date', 'Close' and 'ticker' columns.
# NOTE: `data` is an alias, not a copy, so the columns added below also appear on `price_data`.
data = price_data

# Parse the trading date and derive calendar year / month
data['Date'] = pd.to_datetime(data['Date'])
data['year'] = data['Date'].dt.year
data['month'] = data['Date'].dt.month

# February rows only, ordered so that the last row of each (ticker, year)
# group is the latest February date available for that ticker and year
data_feb_end = data[data['month'] == 2].copy()
data_feb_end.sort_values(by=['ticker', 'Date'], inplace=True)
data_feb_end_last_day = data_feb_end.groupby(['ticker', 'year']).tail(1).copy()

# Pair every February close with the ticker's following February close.
# shift(-1) simply takes the next row, so when a ticker has no February data
# for some year the "next" row is two or more years ahead; such pairs are
# blanked out so that `yearly_return` always spans exactly one year.
feb_rows_by_ticker = data_feb_end_last_day.groupby('ticker')
following_close = feb_rows_by_ticker['Close'].shift(-1)
following_year = feb_rows_by_ticker['year'].shift(-1)
data_feb_end_last_day['feb_end_next_year'] = following_close.where(
    following_year == data_feb_end_last_day['year'] + 1
)

# Simple return from this February's last close to the next February's last close
data_feb_end_last_day['yearly_return'] = (data_feb_end_last_day['feb_end_next_year'] - data_feb_end_last_day['Close']) / data_feb_end_last_day['Close']

# Rows without a following-year close (last available year, or a gap) have no return
data_feb_end_last_day = data_feb_end_last_day.dropna(subset=['feb_end_next_year'])

# Keep only the relevant columns; 'year' is the year of the STARTING February
yearly_returns = data_feb_end_last_day[['ticker', 'year', 'Close', 'feb_end_next_year', 'yearly_return']]

# Display the result
print(yearly_returns)

In [None]:

#Best results
'''import pandas as pd
# Convert 'date' to datetime
price_data['Date'] = pd.to_datetime(price_data['Date'])

# Extract year and quarter from 'date'
price_data['year'] = price_data['Date'].dt.year
price_data['quarter'] = price_data['Date'].dt.quarter

# Group by ticker, year, and quarter, then calculate quarterly return
yearly_returns = price_data.groupby(['ticker', 'year']).apply(
    lambda x: (x['Close'].iloc[-1] - x['Close'].iloc[0]) / x['Close'].iloc[0]
).reset_index(name='yearly_return')

# Output the DataFrame with quarterly returns
print(yearly_returns)'''

In [None]:
cols = [col for col in data_sim.columns if 'similarity' in col]
cols.append('ticker')
data = data_sim[cols]

In [None]:
# NOTE: `prefixes` is not referenced anywhere else in this cell.
prefixes = ['consine_similarity', 'jaccard_similarity', 'min_edit_distance_similarity']

# Melt the DataFrame to reshape it from wide to long format:
# one row per (ticker, original similarity column) with the score in 'value'
melted_df = pd.melt(data, id_vars=['ticker'], var_name='similarity', value_name='value')

# Extract year and similarity type from the column name.
# The year regex takes the FIRST four-digit run, i.e. the start year of a
# '<prefix>_<year1>_to_<year2>' column name.
melted_df['year'] = melted_df['similarity'].str.extract(r'(\d{4})').astype(int)
# The type regex takes the leading run of letters and underscores, so the
# underscore in front of the year is included (e.g. 'jaccard_similarity_').
melted_df['similarity_type'] = melted_df['similarity'].str.extract(r'(^[a-zA-Z_]+)')

# Pivot the DataFrame to get separate columns for each similarity measure
final_df = melted_df.pivot_table(index=['ticker', 'year'], columns='similarity_type', values='value').reset_index()

# Rename the columns to a cleaner format.
# NOTE(review): the extracted type names end with '_', so neither key below
# matches an existing column and this rename changes nothing. Other cells index
# the underscore-suffixed names (e.g. 'jaccard_similarity_'), so making the
# rename effective would require updating them as well.
final_df.columns.name = None
final_df.rename(columns={'consine_similarity': 'cosine_similarity',
                         'min_edit_distance_similarity': 'min_edit_similarity'}, inplace=True)

# Display the final DataFrame
print(final_df)

In [None]:
yearly_returns

In [None]:
merged_df = pd.merge(yearly_returns, final_df, on=['ticker', 'year'], how='inner')

In [None]:
merged_df.columns

In [None]:
#merged_df['min_edit_distance_std'] = merged_df['min_edit_distance_std']
merged_df['new_factor_1'] = merged_df['jaccard_similarity_']*np.sign(merged_df['min_edit_distance_similarity_'])

In [None]:
merged_df

In [None]:
# `long_portfolio` is first assigned further down in this notebook, so
# displaying it at this point raises NameError on a fresh kernel
# (Restart Kernel -> Run All). Inspect it after the cell that builds it.

In [None]:
# Step 1: Sort all (ticker, year) rows by Jaccard similarity, lowest first.
# The ranking is pooled over every year in merged_df, not done year by year.
data_sorted = merged_df.sort_values(by='jaccard_similarity_', ascending=True)

# Step 2: Define the fraction of rows that goes into each leg
top_cutoff = 0.20  # share of rows used for the long leg
bottom_cutoff = 0.20  # share of rows used for the short leg

# Step 3: Define number of stocks in long and short positions (rounded down)
n_stocks = len(data_sorted)
n_long = int(top_cutoff * n_stocks)
n_short = int(bottom_cutoff * n_stocks)

# Step 4: Create Long and Short Portfolios.
# Because the sort is ascending, head() is the LOWEST-similarity 20% (long leg)
# and tail() the HIGHEST-similarity 20% (short leg).
# NOTE(review): confirm this long/short direction is the intended one.
long_portfolio = data_sorted.head(n_long)
short_portfolio = data_sorted.tail(n_short)

# Step 5: Calculate Average Returns (equal-weighted mean of 'yearly_return')
long_return = long_portfolio['yearly_return'].mean()
short_return = short_portfolio['yearly_return'].mean()

# Step 6: Long-Short Portfolio Return
long_short_return = long_return - short_return

# Output the portfolio return
print(f"Long portfolio tickers: {long_portfolio['ticker'].tolist()}")
print(f"Short portfolio tickers: {short_portfolio['ticker'].tolist()}")
print(f"Long portfolio average return: {long_return:.2%}")
print(f"Short portfolio average return: {short_return:.2%}")
print(f"Long-Short portfolio return: {long_short_return:.2%}")

In [None]:
# Sum of the long leg's yearly returns per year. `merged_df` is merged on
# ['ticker', 'year'] and its inputs share no other column names, so no '_x'
# suffixes are created: the columns are plain 'year' and 'yearly_return'.
long_portfolio.groupby(['year'])['yearly_return'].sum()

In [None]:
merged_df = merged_df.sort_values(by=['ticker', 'year'])

# Step 2: Calculate momentum (percentage change in 'yearly_return' from previous year)
merged_df['momentum'] = merged_df.groupby('ticker')['yearly_return'].pct_change()

# Display the result
print(merged_df[['ticker', 'year', 'yearly_return', 'momentum']])

In [None]:
merged_df.fillna(1, inplace=True)

## Newer Sent

In [None]:
data_sent = pd.read_csv('/data/workspace_files/lazy_price_replication/10k_final_with_ticker_name_filtered_w_similarity_and_sentiment.csv')

In [None]:
# Define the periods of interest (2007-2008, 2008-2009, 2009-2010, and 2010-2011)
periods = [
    ('2007', '2008'),
    ('2008', '2009'),
    ('2009', '2010'),
    ('2010', '2011')
]

# Define the threshold for a big increase in sentiment
big_increase_threshold = 0.5  # Adjust this value as necessary

# Assuming you have a DataFrame called 'returns_data' that contains returns for multiple years
# Let's say 'returns_data' has columns: ['ticker', 'year', 'yearly_return']

# Initialize new lists to store results for the adjusted strategies
adjusted_portfolio_returns = []
adjusted_long_tickers = {}
adjusted_short_tickers = {}

# Iterate over each period to calculate the portfolio returns of the
# cosine-similarity strategy with a sentiment filter on the short leg.
# Fills adjusted_portfolio_returns, adjusted_long_tickers and adjusted_short_tickers.
for start_year, end_year in periods:
    
    # Step 1: Filter the returns data to get the returns for the end year of the period
    returns_period = merged_df[merged_df['year'] == int(end_year)]
    
    # Step 2: Merge the returns data for the end year with the main dataset based on 'ticker'
    # (left join: tickers without a return for that year keep NaN in 'yearly_return')
    data_sent_period = data_sent.merge(returns_period[['ticker', 'yearly_return']], on='ticker', how='left')

    # Step 3: Sort data by cosine similarity for the specific period (e.g., consine_similarity_2007_to_2008)
    cosine_column = f'consine_similarity_{start_year}_to_{end_year}'
    sentiment_column_start = f'sentiment{start_year}'
    sentiment_column_end = f'sentiment{end_year}'

    # Sort the data for the period, lowest similarity first
    data_sorted_adjusted = data_sent_period.sort_values(by=cosine_column, ascending=True)

    # Step 4: Define the fraction of rows that goes into each leg
    top_cutoff = 0.20  # share of rows used for the long leg
    bottom_cutoff = 0.20  # share of rows used for the short leg

    # Step 5: Define number of stocks in long and short positions (rounded down)
    n_stocks_adjusted = len(data_sorted_adjusted)
    n_long_adjusted = int(top_cutoff * n_stocks_adjusted)
    n_short_adjusted = int(bottom_cutoff * n_stocks_adjusted)

    # Step 6: Create Long Portfolio (first rows of the ascending sort = lowest similarity)
    long_portfolio_adjusted = data_sorted_adjusted.head(n_long_adjusted)

    # Step 7: Build the short leg from the last rows of the sort, then apply the
    # sentiment filter for every period except the one starting in 2010.
    # The filter removes rows whose sentiment rose from start_year to end_year;
    # rows where the difference is NaN are kept, because `NaN > 0` is False.
    # NOTE(review): an earlier comment said the filter applied only to
    # 2007-2008 -- confirm which periods are meant to be filtered.
    short_portfolio_adjusted = data_sorted_adjusted.tail(n_short_adjusted)
    if start_year != '2010':
        short_portfolio_adjusted = short_portfolio_adjusted[
            ~((short_portfolio_adjusted[sentiment_column_end] - short_portfolio_adjusted[sentiment_column_start]) > 0)
        ]

    # Step 8: Calculate Average Returns for Long and Short portfolios using the merged 'yearly_return' column
    long_return_adjusted = long_portfolio_adjusted['yearly_return'].mean()
    short_return_adjusted = short_portfolio_adjusted['yearly_return'].mean()

    # Step 9: Long-Short Portfolio Return
    long_short_return_adjusted = long_return_adjusted - short_return_adjusted

    # Store the result for this period
    adjusted_portfolio_returns.append({
        'period': f'{start_year} to {end_year}',
        'long_return': long_return_adjusted,
        'short_return': short_return_adjusted,
        'long_short_return': long_short_return_adjusted,
        'long_portfolio_tickers': long_portfolio_adjusted['ticker'].tolist(),
        'short_portfolio_tickers': short_portfolio_adjusted['ticker'].tolist()
    })

    # Ticker lists keyed by 'YYYY to YYYY'
    adjusted_long_tickers[f'{start_year} to {end_year}'] = long_portfolio_adjusted['ticker'].tolist()
    adjusted_short_tickers[f'{start_year} to {end_year}'] = short_portfolio_adjusted['ticker'].tolist()

# Create a DataFrame from the adjusted portfolio returns
adjusted_portfolio_df = pd.DataFrame(adjusted_portfolio_returns)

# Sentiment-only strategy using deciles
sentiment_portfolio_returns = []
sentiment_long_tickers = {}
sentiment_short_tickers = {}

# Iterate over each period to calculate the sentiment-only portfolio returns
for start_year, end_year in periods:
    
    # Step 1: Filter the returns data to get the returns for the end year of the period
    returns_period = merged_df[merged_df['year'] == int(end_year)]
    
    # Step 2: Merge the returns data for the end year with the main dataset based on 'ticker'
    data_sent_period = data_sent.merge(returns_period[['ticker', 'yearly_return']], on='ticker', how='left')

    # Step 3: Sort by sentiment for the end year
    sentiment_column_end = f'sentiment{end_year}'
    data_sorted_sentiment = data_sent_period.sort_values(by=sentiment_column_end, ascending=True)

    # Step 4: Define deciles for long and short positions
    n_sentiment_stocks = len(data_sorted_sentiment)
    n_long_sentiment = n_sentiment_stocks // 10  # Top decile for long
    n_short_sentiment = n_sentiment_stocks // 10  # Bottom decile for short

    # Step 5: Create Long and Short Portfolios based on sentiment deciles
    long_portfolio_sentiment = data_sorted_sentiment.tail(n_long_sentiment)
    short_portfolio_sentiment = data_sorted_sentiment.head(n_short_sentiment)

    # Step 6: Calculate Average Returns for Long and Short portfolios using the merged 'yearly_return' column
    long_return_sentiment = long_portfolio_sentiment['yearly_return'].mean()
    short_return_sentiment = short_portfolio_sentiment['yearly_return'].mean()

    # Step 7: Long-Short Portfolio Return
    long_short_return_sentiment = long_return_sentiment - short_return_sentiment

    # Store the result for this period
    sentiment_portfolio_returns.append({
        'period': f'{start_year} to {end_year}',
        'long_return': long_return_sentiment,
        'short_return': short_return_sentiment,
        'long_short_return': long_short_return_sentiment,
        'long_portfolio_tickers': long_portfolio_sentiment['ticker'].tolist(),
        'short_portfolio_tickers': short_portfolio_sentiment['ticker'].tolist()
    })

    sentiment_long_tickers[f'{start_year} to {end_year}'] = long_portfolio_sentiment['ticker'].tolist()
    sentiment_short_tickers[f'{start_year} to {end_year}'] = short_portfolio_sentiment['ticker'].tolist()

# Create a DataFrame from the sentiment portfolio returns
sentiment_portfolio_df = pd.DataFrame(sentiment_portfolio_returns)
# Initialize variables to accumulate total returns across the full period
total_long_return_sentiment = 0
total_short_return_sentiment = 0
total_long_short_return_sentiment = 0

# Initialize counters for averaging
long_count = 0
short_count = 0

# Iterate over each period and accumulate returns for the sentiment-based strategy
for index, row in sentiment_portfolio_df.iterrows():
    # Accumulate returns and count tickers
    if not pd.isna(row['long_return']):
        total_long_return_sentiment += row['long_return']
        long_count += 1
    if not pd.isna(row['short_return']):
        total_short_return_sentiment += row['short_return']
        short_count += 1
    if not pd.isna(row['long_short_return']):
        total_long_short_return_sentiment += row['long_short_return']

# Calculate the average returns across all periods
average_long_return_sentiment = total_long_return_sentiment / long_count if long_count > 0 else 0
average_short_return_sentiment = total_short_return_sentiment / short_count if short_count > 0 else 0
average_long_short_return_sentiment = average_long_return_sentiment - average_short_return_sentiment

# Output the results for the sentiment-based strategy over the full period
print(f"\nSentiment-based strategy over the full period 2007-2011:")
print(f"Average Sentiment Long portfolio return: {average_long_return_sentiment:.2%}")
print(f"Average Sentiment Long-Short portfolio return: {average_long_short_return_sentiment:.2%}")

In [None]:
data_sent = pd.read_csv('/data/workspace_files/lazy_price_replication/10k_final_with_ticker_name_filtered_w_similarity_and_sentiment.csv')

In [None]:
sentiment_long_tickers

In [None]:
price_data['Date'] = pd.to_datetime(price_data['Date'])

In [None]:
import pandas as pd

# Per-period daily return frames for each strategy (one DataFrame per period).
# NOTE: these names are rebound here to lists of DataFrames; the portfolio-construction
# cell uses the same names for lists of per-period summary dicts.
sentiment_portfolio_returns = []
adjusted_portfolio_returns = []

# Inputs expected in the kernel before this cell runs:
#   price_data              - DataFrame with at least ['Date', 'ticker', 'Close'], 'Date' as datetime
#   sentiment_*_tickers     - dicts keyed by 'YYYY to YYYY' holding ticker lists
#   adjusted_*_tickers      - dicts with the same keys for the adjusted strategy

# Iterate over the period ranges like '2007 to 2008'
for period in sentiment_long_tickers.keys():
    start_year, end_year = period.split(" to ")
    start_year = int(start_year)
    end_year = int(end_year)

    # Step 2: Get long and short tickers for the period for both strategies
    long_sentiment_tick = sentiment_long_tickers[period]  # Sentiment-based long tickers
    short_sentiment_tick = sentiment_short_tickers[period]  # Sentiment-based short tickers

    long_adjusted_tick = adjusted_long_tickers[period]  # Adjusted-based long tickers
    short_adjusted_tick = adjusted_short_tickers[period]  # Adjusted-based short tickers

    # Filter price data for the period (using end year for filtering prices).
    # .copy() makes this an independent frame, so adding the 'daily_return' column
    # below neither warns (SettingWithCopyWarning) nor risks touching price_data.
    year_price_data = price_data[price_data['Date'].dt.year == end_year].copy()

    # Calculate daily returns per ticker; the first row of each ticker within the
    # year has no previous close in this slice and is therefore NaN.
    year_price_data['daily_return'] = year_price_data.groupby('ticker')['Close'].pct_change()

    # Step 3: Separate long and short portfolios for sentiment-based strategy
    long_sentiment_returns = year_price_data[year_price_data['ticker'].isin(long_sentiment_tick)]
    short_sentiment_returns = year_price_data[year_price_data['ticker'].isin(short_sentiment_tick)]

    # Step 4: Equal-weighted (simple mean) daily return of each sentiment portfolio
    long_sentiment_portfolio_return = long_sentiment_returns.groupby('Date')['daily_return'].mean().reset_index()
    short_sentiment_portfolio_return = short_sentiment_returns.groupby('Date')['daily_return'].mean().reset_index()

    # Step 5: Tag each leg so both can live in one long-format DataFrame
    long_sentiment_portfolio_return['portfolio'] = 'long'
    short_sentiment_portfolio_return['portfolio'] = 'short'

    # Combine long and short sentiment returns
    combined_sentiment_returns = pd.concat([long_sentiment_portfolio_return, short_sentiment_portfolio_return], ignore_index=True)

    # Add period to combined sentiment returns for clarity
    combined_sentiment_returns['period'] = f'{start_year} to {end_year}'

    # Append the results for sentiment strategy for this period
    sentiment_portfolio_returns.append(combined_sentiment_returns)

    # Step 6: Separate long and short portfolios for adjusted-based strategy
    long_adjusted_returns = year_price_data[year_price_data['ticker'].isin(long_adjusted_tick)]
    short_adjusted_returns = year_price_data[year_price_data['ticker'].isin(short_adjusted_tick)]

    # Step 7: Equal-weighted (simple mean) daily return of each adjusted portfolio
    long_adjusted_portfolio_return = long_adjusted_returns.groupby('Date')['daily_return'].mean().reset_index()
    short_adjusted_portfolio_return = short_adjusted_returns.groupby('Date')['daily_return'].mean().reset_index()

    # Step 8: Tag each leg so both can live in one long-format DataFrame
    long_adjusted_portfolio_return['portfolio'] = 'long'
    short_adjusted_portfolio_return['portfolio'] = 'short'

    # Combine long and short adjusted returns
    combined_adjusted_returns = pd.concat([long_adjusted_portfolio_return, short_adjusted_portfolio_return], ignore_index=True)

    # Add period to combined adjusted returns for clarity
    combined_adjusted_returns['period'] = f'{start_year} to {end_year}'

    # Append the results for adjusted strategy for this period
    adjusted_portfolio_returns.append(combined_adjusted_returns)

# Step 9: Concatenate all periods' results into a single DataFrame for both strategies
final_sentiment_portfolio_returns = pd.concat(sentiment_portfolio_returns, ignore_index=True)
final_adjusted_portfolio_returns = pd.concat(adjusted_portfolio_returns, ignore_index=True)

# Output the results
print("Sentiment-based strategy returns:")
print(final_sentiment_portfolio_returns.head())

print("Adjusted-based strategy returns:")
print(final_adjusted_portfolio_returns.head())

In [None]:
# Define the periods of interest (2007-2008, 2008-2009, 2009-2010, and 2010-2011)
periods = [
    ('2007', '2008'),
    ('2008', '2009'),
    ('2009', '2010'),
    ('2010', '2011')
]

# Define the threshold for a big increase in sentiment
big_increase_threshold = 0.5  # Adjust this value as necessary

# Assuming you have a DataFrame called 'returns_data' that contains returns for multiple years
# Let's say 'returns_data' has columns: ['ticker', 'year', 'yearly_return']

# Initialize dictionaries to store long and short tickers per year for each strategy
adjusted_long_tickers = {}
adjusted_short_tickers = {}
sentiment_long_tickers = {}
sentiment_short_tickers = {}

# Store portfolio returns for the adjusted strategy
adjusted_portfolio_returns = []

# Iterate over each period to calculate the adjusted portfolio returns.
# For every (start_year, end_year) pair this builds a long leg from the lowest
# cosine-similarity rows and a short leg from the highest ones, optionally removes
# short candidates whose sentiment increased, and records tickers and mean returns.
for start_year, end_year in periods:
    
    # Step 1: Filter the returns data to get the returns for the end year of the period
    returns_period = merged_df[merged_df['year'] == int(end_year)]
    
    # Step 2: Merge the returns data for the end year with the main dataset based on 'ticker'
    # (left join: tickers without a return for that year keep a NaN 'yearly_return')
    data_sent_period = data_sent.merge(returns_period[['ticker', 'yearly_return']], on='ticker', how='left')

    # Step 3: Sort data by cosine similarity for the specific period (e.g., consine_similarity_2007_to_2008)
    cosine_column = f'consine_similarity_{start_year}_to_{end_year}'
    sentiment_column_start = f'sentiment{start_year}'
    sentiment_column_end = f'sentiment{end_year}'

    # Sort the data for the period, lowest similarity first.
    # NOTE(review): sort_values puts NaN last by default, so rows with a missing
    # similarity would fall into the tail (short) slice - confirm the column has no NaN.
    data_sorted_adjusted = data_sent_period.sort_values(by=cosine_column, ascending=True)

    # Step 4: Define the percentage cutoff for long and short positions
    top_cutoff = 0.20  # Fraction of rows taken from the head of the sort for long positions
    bottom_cutoff = 0.20  # Fraction of rows taken from the tail of the sort for short positions

    # Step 5: Define number of stocks in long and short positions
    n_stocks_adjusted = len(data_sorted_adjusted)
    n_long_adjusted = int(top_cutoff * n_stocks_adjusted)
    n_short_adjusted = int(bottom_cutoff * n_stocks_adjusted)

    # Step 6: Create Long Portfolio for the adjusted strategy (lowest-similarity rows)
    long_portfolio_adjusted = data_sorted_adjusted.head(n_long_adjusted)

    # Step 7: Build the short portfolio (highest-similarity rows) and apply the sentiment
    # filter for every period except the one starting in 2010: rows whose sentiment
    # rose from start_year to end_year are removed. Rows with a NaN sentiment
    # difference are kept, because the `> 0` comparison is False for NaN.
    short_portfolio_adjusted = data_sorted_adjusted.tail(n_short_adjusted)
    if start_year != '2010':
        short_portfolio_adjusted = short_portfolio_adjusted[
            ~((short_portfolio_adjusted[sentiment_column_end] - short_portfolio_adjusted[sentiment_column_start]) > 0)
        ]

    # Store long and short tickers for the adjusted strategy in dictionaries
    adjusted_long_tickers[f'{start_year} to {end_year}'] = long_portfolio_adjusted['ticker'].tolist()
    adjusted_short_tickers[f'{start_year} to {end_year}'] = short_portfolio_adjusted['ticker'].tolist()

    # Calculate Average Returns for Long and Short portfolios using the merged 'yearly_return' column
    # (Series.mean skips NaN, so tickers without a return do not contribute)
    long_return_adjusted = long_portfolio_adjusted['yearly_return'].mean()
    short_return_adjusted = short_portfolio_adjusted['yearly_return'].mean()

    # Step 8: Long-Short Portfolio Return
    long_short_return_adjusted = long_return_adjusted - short_return_adjusted

    # Store the result for this period
    adjusted_portfolio_returns.append({
        'period': f'{start_year} to {end_year}',
        'long_return': long_return_adjusted,
        'short_return': short_return_adjusted,
        'long_short_return': long_short_return_adjusted,
        'long_portfolio_tickers': long_portfolio_adjusted['ticker'].tolist(),
        'short_portfolio_tickers': short_portfolio_adjusted['ticker'].tolist()
    })

# Store portfolio returns for the sentiment strategy
sentiment_portfolio_returns = []

# Iterate over each period to calculate the sentiment-based portfolio returns
for start_year, end_year in periods:
    
    # Step 1: Filter the returns data to get the returns for the end year of the period
    returns_period = merged_df[merged_df['year'] == int(end_year)]
    
    # Step 2: Merge the returns data for the end year with the main dataset based on 'ticker'
    data_sent_period = data_sent.merge(returns_period[['ticker', 'yearly_return']], on='ticker', how='left')

    # Step 3: Sort by sentiment for the end year
    sentiment_column_end = f'sentiment{end_year}'
    data_sorted_sentiment = data_sent_period.sort_values(by=sentiment_column_end, ascending=True)

    # Step 4: Define deciles for long and short positions
    n_sentiment_stocks = len(data_sorted_sentiment)
    n_long_sentiment = n_sentiment_stocks // 10  # Top decile for long
    n_short_sentiment = n_sentiment_stocks // 10  # Bottom decile for short

    # Step 5: Create Long and Short Portfolios based on sentiment deciles
    long_portfolio_sentiment = data_sorted_sentiment.tail(n_long_sentiment)
    short_portfolio_sentiment = data_sorted_sentiment.head(n_short_sentiment)

    # Store long and short tickers for the sentiment strategy in dictionaries
    sentiment_long_tickers[f'{start_year} to {end_year}'] = long_portfolio_sentiment['ticker'].tolist()
    sentiment_short_tickers[f'{start_year} to {end_year}'] = short_portfolio_sentiment['ticker'].tolist()

    # Step 6: Calculate Average Returns for Long and Short portfolios using the merged 'yearly_return' column
    long_return_sentiment = long_portfolio_sentiment['yearly_return'].mean()
    short_return_sentiment = short_portfolio_sentiment['yearly_return'].mean()

    # Step 7: Long-Short Portfolio Return
    long_short_return_sentiment = long_return_sentiment - short_return_sentiment

    # Store the result for this period
    sentiment_portfolio_returns.append({
        'period': f'{start_year} to {end_year}',
        'long_return': long_return_sentiment,
        'short_return': short_return_sentiment,
        'long_short_return': long_short_return_sentiment,
        'long_portfolio_tickers': long_portfolio_sentiment['ticker'].tolist(),
        'short_portfolio_tickers': short_portfolio_sentiment['ticker'].tolist()
    })

# Create DataFrames from the portfolio returns
adjusted_portfolio_df = pd.DataFrame(adjusted_portfolio_returns)
sentiment_portfolio_df = pd.DataFrame(sentiment_portfolio_returns)

# Output the results for both strategies
print("\nAdjusted Strategy Long and Short Tickers by Year:")
print(adjusted_long_tickers)
print(adjusted_short_tickers)

print("\nSentiment Strategy Long and Short Tickers by Year:")
print(sentiment_long_tickers)
print(sentiment_short_tickers)


In [None]:
price_data['Date']

In [None]:
sentiment_portfolio_df

In [None]:
long_portfolio_returns = final_portfolio_returns[final_portfolio_returns['portfolio']=='long']
short_portfolio_returns = final_portfolio_returns[final_portfolio_returns['portfolio']=='short']

long_portfolio_returns.set_index('Date', inplace=True)
short_portfolio_returns.set_index('Date', inplace=True)

long_short_returns = long_portfolio_returns['daily_return'] - short_portfolio_returns['daily_return']

# Step 3: Create a DataFrame for long-short returns
long_short_returns_df = pd.DataFrame({
    'long_short_return': long_short_returns
})
cumulative_long_returns = (1 + long_portfolio_returns['daily_return']).cumprod() - 1
cumulative_short_returns = (1 + short_portfolio_returns['daily_return']).cumprod() - 1
cumulative_long_short_returns = (1 + long_short_returns).cumprod() - 1

# Step 4: Plotting Cumulative Returns
plt.figure(figsize=(14, 7))

# Plot Cumulative Returns
plt.plot(cumulative_long_returns.index, cumulative_long_returns, label='Cumulative Long Portfolio', color='green')
plt.plot(cumulative_short_returns.index, cumulative_short_returns, label='Cumulative Short Portfolio', color='red')
#plt.plot(long_short_returns_df.index, long_short_returns_df['long_short_return'], label='Cumulative Long-Short Portfolio', color='green')
plt.plot(cumulative_long_short_returns.index, cumulative_long_short_returns, label='Cumulative Long-Short Portfolio', color='orange')

# Step 5: Adding title and labels
plt.title('Returns: Long, Short, and Long-Short Portfolio using Jaccard similarity', fontsize=16)
plt.xlabel('Date', fontsize=14)
plt.ylabel('Cumulative Return', fontsize=14)
plt.axhline(0, color='black', linewidth=0.8, linestyle='--')  # Add a horizontal line at y=0
plt.legend()
plt.grid()

# Step 6: Show the plot
plt.tight_layout()
plt.show()
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Function to calculate and plot cumulative returns for a given portfolio
def plot_cumulative_returns_with_sharpe(final_portfolio_returns, strategy_name):
    """Print annualised Sharpe ratios and plot cumulative daily returns for one strategy.

    Parameters
    ----------
    final_portfolio_returns : pandas.DataFrame
        Long-format frame with a 'Date' column, a 'portfolio' column holding
        'long' or 'short', and a 'daily_return' column.
    strategy_name : str
        Label used in the printed lines and in the figure title.

    Returns
    -------
    None. The function prints two lines and shows a matplotlib figure.
    """
    # Daily return series of each leg, indexed by Date
    is_long = final_portfolio_returns['portfolio'] == 'long'
    is_short = final_portfolio_returns['portfolio'] == 'short'
    long_daily = final_portfolio_returns[is_long].set_index('Date')['daily_return']
    short_daily = final_portfolio_returns[is_short].set_index('Date')['daily_return']

    # Mean over standard deviation of daily returns, scaled by sqrt(252)
    annualisation = np.sqrt(252)
    long_sharpe_ratio = (long_daily.mean() / long_daily.std()) * annualisation
    short_sharpe_ratio = (short_daily.mean() / short_daily.std()) * annualisation

    print(f"{strategy_name} Long Portfolio Sharpe Ratio: {long_sharpe_ratio:.2f}")
    print(f"{strategy_name} Short Portfolio Sharpe Ratio: {short_sharpe_ratio:.2f}")

    plt.figure(figsize=(14, 7))

    # Cumulative *sum* of daily returns for each leg
    long_daily.cumsum().plot(label='Cumulative Long Portfolio', color='green')
    short_daily.cumsum().plot(label='Cumulative Short Portfolio', color='red')
    # Third curve is half the cumulative sum of (short + long), i.e. the average of
    # the two legs - NOTE(review): it is not long minus short despite the legend text.
    averaged_curve = 0.5 * (short_daily + long_daily).cumsum()
    averaged_curve.plot(label='Cumulative Long-Short Portfolio (Average)', color='orange')

    plt.title(f'Cumulative Returns: Long, Short, and Long-Short Portfolio ({strategy_name})', fontsize=16)
    plt.xlabel('Date', fontsize=14)
    plt.ylabel('Cumulative Return', fontsize=14)
    plt.axhline(0, color='black', linewidth=0.8, linestyle='--')  # zero reference line
    plt.legend()
    plt.grid()

    plt.tight_layout()
    plt.show()

# Plot for the sentiment-based strategy
plot_cumulative_returns_with_sharpe(final_sentiment_portfolio_returns, 'Sentiment-Based')

# Plot for the adjusted cosine similarity-based strategy
plot_cumulative_returns_with_sharpe(final_adjusted_portfolio_returns, 'Adjusted Cosine Similarity-Based')

In [None]:
# Sharpe ratio of the long leg: mean / std of daily returns scaled by sqrt(252).
# Bound to a name and printed, because a bare expression that is not the last
# line of a cell is evaluated and silently discarded.
long_sharpe_ratio = long_portfolio_returns['daily_return'].mean()/long_portfolio_returns['daily_return'].std() * np.sqrt(252)
print(f"Long portfolio Sharpe ratio: {long_sharpe_ratio:.2f}")

# Cumulative sum of daily returns for each leg
long_portfolio_returns['daily_return'].cumsum().plot()
short_portfolio_returns['daily_return'].cumsum().plot()
# Outer parentheses are required: a method call binds tighter than `*`, so without
# them 1/2 is multiplied by the Axes object returned by .plot() and raises TypeError.
(0.5 * (short_portfolio_returns['daily_return'] + long_portfolio_returns['daily_return']).cumsum()).plot()

In [None]:
long_portfolio_returns['daily_return'].cumsum()

In [None]:
short_portfolio_returns['daily_return'].mean()/short_portfolio_returns['daily_return'].std() * np.sqrt(252)

In [None]:
long_portfolio_returns['daily_return'].mean()/long_portfolio_returns['daily_return'].std() * np.sqrt(252)

# JG Clean Space

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
data_sim = pd.read_csv('/data/workspace_files/lazy_price_replication/10k_final_with_ticker_name_filtered_w_similarity.csv')
#file_path = '/data/workspace_files/10k_final_with_ticker_name_filtered_w_similarity.csv'
#file_path = '/data/workspace_files/lazy_price_replication/10k_final_with_ticker_name_filtered_w_similarity.csv'

In [None]:
date_columns = [f'file_date{year}' for year in range(2007, 2013)]

# Initialize an empty list to store all the filtered dates
all_dates = []

# Loop through each file_date column
for col in date_columns:
    filings_date = data_sim[col].to_list()
    date_obj = pd.to_datetime(filings_date, format='%Y%m%d', errors='coerce')
    all_dates.extend(date_obj)

In [None]:
date_columns = [f'report_period_end_date{year}' for year in range(2007, 2013)]

# Initialize an empty list to store all the filtered dates
all_dates_rep = []

# Loop through each reporting_date column
for col in date_columns:
    filings_date = data_sim[col].to_list()
    date_obj = pd.to_datetime(filings_date, format='%Y%m%d', errors='coerce')
    all_dates_rep.extend(date_obj)

In [None]:
# Load the price history; the first CSV column is used as the DataFrame index.
price_data = pd.read_csv("/data/workspace_files/lazy_price_replication/all_ticker_prices.csv", index_col=0)
# Row-over-row percentage change of 'Close' across the whole frame.
# NOTE(review): this is not grouped by ticker, so (presumably, if the file stacks
# several tickers) the first row of a ticker is compared with the previous
# ticker's last close - confirm whether groupby('ticker') was intended.
price_data['returns'] = price_data['Close'].pct_change()
# Keep rows whose absolute return is below 90 (a ratio, i.e. 9000%). Rows with a NaN
# return (at least the very first row) fail the comparison and are dropped as well.
price_data = price_data[abs(price_data['returns'])<90]

# Calculating yearly returns

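Calendar-year returns: for each ticker and year, the return from the first to the last available close within that year.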

In [None]:
import pandas as pd
# Convert 'Date' to datetime
price_data['Date'] = pd.to_datetime(price_data['Date'])

# Extract year and quarter from 'Date' (only 'year' is used below)
price_data['year'] = price_data['Date'].dt.year
price_data['quarter'] = price_data['Date'].dt.quarter

# Group by ticker and year, then compute the yearly return as
# (last Close - first Close) / first Close within each group.
# NOTE(review): "first" and "last" follow row order, so this assumes rows are
# already sorted by date within each ticker - confirm.
yearly_returns = price_data.groupby(['ticker', 'year']).apply(
    lambda x: (x['Close'].iloc[-1] - x['Close'].iloc[0]) / x['Close'].iloc[0]
).reset_index(name='yearly_return')

# Output the DataFrame with yearly returns
print(yearly_returns)

In [None]:
cols = [col for col in data_sim.columns if 'similarity' in col]
cols.append('ticker')
data = data_sim[cols]

In [None]:
# Similarity measure prefixes present in the data (this list is not used in this cell)
prefixes = ['consine_similarity', 'jaccard_similarity', 'min_edit_distance_similarity']

# Melt the DataFrame to reshape it from wide to long format:
# one row per (ticker, similarity column) with the score in 'value'
melted_df = pd.melt(data, id_vars=['ticker'], var_name='similarity', value_name='value')

# Extract year and similarity type from the column name.
# The year regex takes the first 4-digit run; for names shaped like
# 'consine_similarity_2007_to_2008' that is the start year of the pair.
melted_df['year'] = melted_df['similarity'].str.extract(r'(\d{4})').astype(int)
# The type regex takes the leading run of letters/underscores, so for names shaped
# like the example above it keeps the trailing underscore ('consine_similarity_').
melted_df['similarity_type'] = melted_df['similarity'].str.extract(r'(^[a-zA-Z_]+)')

# Pivot the DataFrame to get separate columns for each similarity measure
# (pivot_table averages values if a (ticker, year, type) combination repeats)
final_df = melted_df.pivot_table(index=['ticker', 'year'], columns='similarity_type', values='value').reset_index()

# Rename the columns to a cleaner format.
# NOTE(review): the keys below have no trailing underscore, so if the pivoted
# columns are named 'consine_similarity_' etc. this rename matches nothing and is
# a no-op - confirm against final_df.columns before relying on the new names.
final_df.columns.name = None
final_df.rename(columns={'consine_similarity': 'cosine_similarity',
                         'min_edit_distance_similarity': 'min_edit_similarity'}, inplace=True)

In [None]:
# Inner join: keep only (ticker, year) pairs present in both the yearly returns
# and the reshaped similarity table
merged_df = pd.merge(yearly_returns, final_df, on=['ticker', 'year'], how='inner')
merged_df = merged_df.sort_values(by=['ticker', 'year'])
#Adding momentum
# 'momentum' is the percentage change of yearly_return from one row to the next
# within each ticker; the first row of every ticker is NaN.
merged_df['momentum'] = merged_df.groupby('ticker')['yearly_return'].pct_change()
# Replace every NaN in every column with 1, in place.
# NOTE(review): this is not limited to 'momentum' - missing similarity scores or
# returns would also become 1; confirm that is intended.
merged_df.fillna(1, inplace=True)

In [None]:
merged_df

In [None]:
# Build yearly long/short portfolios from the Jaccard similarity ranking.
# Outputs: portfolio_returns (list of per-year dicts), long_tickers / short_tickers
# (dicts keyed by year) and portfolio_df (DataFrame of the per-year results).
# Initialize an empty list to store results
portfolio_returns = []
long_tickers = {}
short_tickers = {}

# Step 1: Group by year and iterate over each group
for year, group in merged_df.groupby('year'):
    # Step 2: Sort data by similarity measure, lowest similarity first.
    # The column name carries a trailing underscore ('jaccard_similarity_').
    data_sorted = group.sort_values(by='jaccard_similarity_', ascending=True)

    # Step 3: Define the percentage cutoff for long and short positions
    top_cutoff = 0.20  # Fraction of rows taken from the head of the sort for long positions
    bottom_cutoff = 0.20  # Fraction of rows taken from the tail of the sort for short positions

    # Step 4: Define number of stocks in long and short positions
    n_stocks = len(data_sorted)
    n_long = int(top_cutoff * n_stocks)
    n_short = int(bottom_cutoff * n_stocks)

    # Step 5: Create Long and Short Portfolios.
    # head() = lowest-similarity rows go long, tail() = highest-similarity rows go short.
    # NOTE(review): confirm this direction is the intended one for the strategy.
    long_portfolio = data_sorted.head(n_long)
    short_portfolio = data_sorted.tail(n_short)

    # Step 6: Calculate Average Returns for Long and Short portfolios (equal-weighted mean)
    long_return = long_portfolio['yearly_return'].mean()
    short_return = short_portfolio['yearly_return'].mean()

    # Step 7: Long-Short Portfolio Return
    long_short_return = long_return - short_return

    # Store the result for this year
    portfolio_returns.append({
        'year': year,
        'long_return': long_return,
        'short_return': short_return,
        'long_short_return': long_short_return,
        'long_portfolio_tickers': long_portfolio['ticker'].tolist(),
        'short_portfolio_tickers': short_portfolio['ticker'].tolist()
    })

    # Keep the ticker lists per year for the daily-return calculation
    long_tickers[year] = long_portfolio['ticker'].tolist()
    short_tickers[year] = short_portfolio['ticker'].tolist()


# Create a DataFrame from the portfolio returns
portfolio_df = pd.DataFrame(portfolio_returns)

# Output the results
for index, row in portfolio_df.iterrows():
    print(f"\nYear: {row['year']}")
    print(f"Long portfolio tickers: {row['long_portfolio_tickers']}")
    print(f"Short portfolio tickers: {row['short_portfolio_tickers']}")
    print(f"Long portfolio average return: {row['long_return']:.2%}")
    print(f"Short portfolio average return: {row['short_return']:.2%}")
    print(f"Long-Short portfolio return: {row['long_short_return']:.2%}")

In [None]:
portfolio_df['long_short_return'].mean()

# Calculate daily returns for stocks for all portfolios

In [None]:
# Daily equal-weighted returns of the yearly long/short portfolios.
# NOTE: portfolio_returns is rebound here to a list of per-year DataFrames.
portfolio_returns = []

# Step 1: Iterate over the years and calculate daily returns
for year in long_tickers.keys():
    # Step 2: Get long and short tickers for the year
    long_tick = long_tickers[year]
    short_tick = short_tickers[year]

    # Filter price data for the year. .copy() makes this an independent frame so the
    # column assignment below does not write to a slice of price_data
    # (avoids SettingWithCopyWarning and any ambiguity about what is modified).
    year_price_data = price_data[price_data['Date'].dt.year == year].copy()

    # Calculate daily returns per ticker; each ticker's first row in the year is NaN
    year_price_data['daily_return'] = year_price_data.groupby('ticker')['Close'].pct_change()

    # Step 3: Separate long and short portfolios
    long_returns = year_price_data[year_price_data['ticker'].isin(long_tick)]
    short_returns = year_price_data[year_price_data['ticker'].isin(short_tick)]

    # Step 4: Calculate average daily returns for long and short portfolios
    long_portfolio_return = long_returns.groupby('Date')['daily_return'].mean().reset_index()
    short_portfolio_return = short_returns.groupby('Date')['daily_return'].mean().reset_index()

    # Step 5: Tag each leg so both can live in one long-format DataFrame
    long_portfolio_return['portfolio'] = 'long'
    short_portfolio_return['portfolio'] = 'short'

    # Combine long and short returns
    combined_returns = pd.concat([long_portfolio_return, short_portfolio_return], ignore_index=True)

    # Add year to combined returns for clarity
    combined_returns['year'] = year

    # Append the results for this year to the overall list
    portfolio_returns.append(combined_returns)

# Step 6: Concatenate all years' results into a single DataFrame
final_portfolio_returns = pd.concat(portfolio_returns, ignore_index=True)

# Output the results
print(final_portfolio_returns.head())

In [None]:
# Step 1: Split the long-format daily returns into the two legs, indexed by Date.
# set_index returns a new frame, so nothing is modified in place on a filtered slice.
long_portfolio_returns = final_portfolio_returns[final_portfolio_returns['portfolio']=='long'].set_index('Date')
short_portfolio_returns = final_portfolio_returns[final_portfolio_returns['portfolio']=='short'].set_index('Date')

# Step 2: Daily long-short return (aligned on the Date index)
long_short_returns = long_portfolio_returns['daily_return'] - short_portfolio_returns['daily_return']

# Step 3: Create a DataFrame for long-short returns
long_short_returns_df = pd.DataFrame({
    'long_short_return': long_short_returns
})
# Compounded cumulative returns: product of (1 + daily return) minus 1
cumulative_long_returns = (1 + long_portfolio_returns['daily_return']).cumprod() - 1
cumulative_short_returns = (1 + short_portfolio_returns['daily_return']).cumprod() - 1
cumulative_long_short_returns = (1 + long_short_returns).cumprod() - 1

# Step 4: Plotting Cumulative Returns
plt.figure(figsize=(14, 7))

plt.plot(cumulative_long_returns.index, cumulative_long_returns, label='Cumulative Long Portfolio', color='green')
plt.plot(cumulative_short_returns.index, cumulative_short_returns, label='Cumulative Short Portfolio', color='red')
plt.plot(cumulative_long_short_returns.index, cumulative_long_short_returns, label='Cumulative Long-Short Portfolio', color='orange')

# Step 5: Adding title and labels
plt.title('Returns: Long, Short, and Long-Short Portfolio using Jaccard similarity', fontsize=16)
plt.xlabel('Date', fontsize=14)
plt.ylabel('Cumulative Return', fontsize=14)
plt.axhline(0, color='black', linewidth=0.8, linestyle='--')  # Add a horizontal line at y=0
plt.legend()
plt.grid()

# Step 6: Show the plot (a single show() is enough; the second call was a no-op)
plt.tight_layout()
plt.show()

In [None]:
cumulative_long_short_returns.to_csv('Jaccard-Sim-Rets')

In [None]:
long_short_returns.to_csv('/data/workspace_files/lazy_price_replication/Jac-Portfolio-rets.csv')

# BERT

In [None]:
import pandas as pd
import numpy as np

In [None]:
!ls -l /data/workspace_files
!du -sh /data/workspace_files

In [None]:
data = pd.read_csv('/data/workspace_files/lazy_price_replication/10k_final_with_ticker_name_filtered_w_similarity.csv')

In [None]:
text_2007 = data['statement2007']

In [None]:
def extract_item_7(filing):
    """Return the lower-cased text of Item 7 from a 10-K filing, or None.

    The filing is lower-cased, then the text from the first "item 7." up to (not
    including) the next "item 8." is returned with surrounding whitespace stripped.
    If no "item 8." follows, the section runs to the end of the filing.

    Parameters
    ----------
    filing : str
        Full text of the filing. Non-string values (e.g. NaN for a missing
        statement) are treated as "no section found".

    Returns
    -------
    str or None
        The extracted section, or None when the input is not a string or does not
        contain "item 7.".

    NOTE(review): the first "item 7." is used; if a filing has a table of contents
    that lists the items, that entry may be matched before the real section -
    confirm on sample filings.
    """
    # Missing statements arrive as NaN (a float) when read from CSV; .lower() would fail
    if not isinstance(filing, str):
        return None

    filing = filing.lower()
    start_index = filing.find("item 7.")
    if start_index == -1:
        return None  # Item 7 not found

    # The text is lower-cased above, so the marker must be lower-case too;
    # searching for "ITEM 8." could never match and the section ran to end of file.
    end_index = filing.find("item 8.", start_index)
    if end_index == -1:
        end_index = len(filing)  # No Item 8 after Item 7: take the rest of the filing

    return filing[start_index:end_index].strip()

In [None]:
# Maps year -> Series of extracted Item 7 text (one entry per filing row)
extracted_items = {}

# Loop through the years 2007 to 2012 (range end is exclusive)
for year in range(2007, 2013):
    # Create the column name dynamically
    column_name = f'statement{year}'
    
    # Extract the statement item for the current year
    text_year = data[column_name]
    
    # Apply the extraction function to every filing; filings without an Item 7
    # yield None (nothing is dropped here)
    item_extracted = text_year.apply(extract_item_7)
    
    # Store the extracted items in the dictionary
    extracted_items[year] = item_extracted

# Create a DataFrame from the extracted items (one column per year)
extracted_df = pd.DataFrame(extracted_items)

# Display the DataFrame
extracted_df

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# Load the pre-trained FinBERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone')

# Initialize sentiment analysis pipeline
nlp_model = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Test the sentiment analysis pipeline
sample_text = "The company's earnings report was extremely positive."
result = nlp_model(sample_text)
print(result)

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# Load the pre-trained FinBERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone')

# Initialize sentiment analysis pipeline
nlp_model = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

In [None]:
def extract_sentiment(financial_texts, nlp):
    """Classify each text with a sentiment pipeline and collect label and score.

    Parameters
    ----------
    financial_texts : iterable of str (typically a pandas Series)
        Texts to classify. Only the first 500 characters of each text are passed
        to the pipeline.
    nlp : callable
        Called as ``nlp(text)``; must return a sequence whose first element is a
        mapping with 'label' and 'score' keys.

    Returns
    -------
    pandas.DataFrame
        Columns 'text', 'sentiment' and 'confidence', one row per input text.
        Entries that are not strings (None / NaN, e.g. filings where no section
        was extracted) are not sent to the pipeline and get None for both
        'sentiment' and 'confidence'.
    """
    sentiments = []
    confidences = []

    for text in financial_texts:
        # Slicing None or NaN would raise; record a missing result instead
        if not isinstance(text, str):
            sentiments.append(None)
            confidences.append(None)
            continue

        # Character-level truncation (not token-level) keeps the input short
        result = nlp(text[:500])
        sentiments.append(result[0]['label'])
        confidences.append(result[0]['score'])

    sentiment_df = pd.DataFrame({
        'text': financial_texts,
        'sentiment': sentiments,
        'confidence': confidences
    })

    return sentiment_df

In [None]:
!pip install --upgrade transformers
from transformers import BertTokenizer, BertModel

In [None]:
#getting keywords for sentiment
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-cased')
model_base = BertModel.from_pretrained('bert-base-cased', output_attentions=True)

In [None]:
!pip install nltk

In [None]:
import nltk
nltk.download('stopwords')

In [None]:
import pandas as pd
import torch
import string
from nltk.corpus import stopwords

# Ensure you have the nltk stopwords downloaded
# import nltk
# nltk.download('stopwords')

def extract_keywords_series(financial_texts, model, tokenizer, top_n=150):
    """
    Extract attention-ranked keywords from every text in a pandas Series.

    For each text the tokens are ranked by the last-layer attention weights in the
    row of the first token (summed over heads); the top_n ranked tokens are taken,
    special tokens are removed, and the remainder is filtered for stopwords,
    punctuation and case-insensitive duplicates.

    Parameters:
    financial_texts: pandas Series where each row contains a text (10-K filing)
    model: model that returns `.attentions` when called with output_attentions=True
    tokenizer: tokenizer matching the model
    top_n: number of top-ranked tokens considered per document (the returned list
           can be shorter after filtering)

    Returns:
    A pandas Series where each row contains the list of keywords for that filing.
    """
    english_stopwords = set(stopwords.words('english'))
    punctuation_marks = set(string.punctuation)
    special_tokens = ['[CLS]', '[SEP]', '[PAD]']

    def _filter_unique(candidates):
        """Drop stopwords, punctuation and case-insensitive repeats, keeping order."""
        kept = []
        seen_lower = set()
        for candidate in candidates:
            lowered = candidate.lower()
            if lowered in english_stopwords or candidate in punctuation_marks or lowered in seen_lower:
                continue
            kept.append(candidate)
            seen_lower.add(lowered)
        return kept

    def _keywords_for_text(text, model, tokenizer, top_n=150):
        """Rank the tokens of one text by first-token attention and clean the top ones."""
        # Inputs are truncated to at most 512 tokens
        encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)

        # Inference only: no gradients needed to read attention weights
        with torch.no_grad():
            attentions = model(**encoded, output_attentions=True).attentions

        # Last layer, batch dimension removed, summed over heads
        # (assumes a single-sequence batch, as produced above)
        head_summed = attentions[-1].squeeze(0).sum(0)

        # Row 0: attention weights of the first token ([CLS]) over all tokens
        first_token_scores = head_summed[0].tolist()

        tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'].squeeze().tolist())

        # Highest score first; ties keep their original token order (stable sort)
        ranked = sorted(zip(tokens, first_token_scores), key=lambda pair: pair[1], reverse=True)

        # Take the top_n ranked tokens first, then discard special tokens among them
        top_tokens = [token for token, _ in ranked[:top_n] if token not in special_tokens]

        return _filter_unique(top_tokens)

    return financial_texts.apply(lambda text: _keywords_for_text(text, model, tokenizer, top_n))

In [None]:
# Try the keyword extractor on the first 10 entries of item_7_extracted.
# `model` and `tokenizer` are not created in this cell and must already exist.
keywords_series = extract_keywords_series(item_7_extracted[:10], model, tokenizer)

#extract_sentiment(item_7_extracted,nlp_model)

In [None]:
# Display the per-document keyword lists
keywords_series

In [None]:
# Display keywords_series (this cell has the same content as the cell before it)
keywords_series

# Extracting MD&A (Item 7) and keywords

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the 10-K dataset (with precomputed similarity columns, judging by the file name)
data = pd.read_csv('/data/workspace_files/lazy_price_replication/10k_final_with_ticker_name_filtered_w_similarity.csv')

In [None]:
# Display the loaded frame
data

In [None]:
def extract_item_7(filing):
    """
    Return the lower-cased text of Item 7 from a 10-K filing.

    The filing is lower-cased, and the section runs from the first "item 7."
    up to (not including) the next "item 8." that follows it. If no "item 8."
    follows, the section runs to the end of the filing.

    Parameters:
    filing: full text of the filing. Non-string values (e.g. None or NaN for a
        missing statement) are treated as "Item 7 not found".

    Returns:
    The stripped, lower-cased section text, or None when "item 7." is absent
    or the input is not a string.
    """
    # Missing statements arrive as None/NaN rather than text
    if not isinstance(filing, str):
        return None

    text = filing.lower()

    start_index = text.find("item 7.")
    if start_index == -1:
        return None  # ITEM 7 not found

    # The text was lower-cased above, so the end marker has to be lower-case too;
    # searching for "ITEM 8." could never match and the section ran to the end.
    end_index = text.find("item 8.", start_index)
    if end_index == -1:
        end_index = len(text)  # No following ITEM 8: take everything to the end

    return text[start_index:end_index].strip()

In [None]:
# Run the Item 7 extractor over each yearly statement column (2007 through 2012).
# The dictionary is keyed by the integer year, so the frame built from it has
# one integer-labelled column per year.
extracted_items = {
    year: data[f'statement{year}'].apply(extract_item_7)
    for year in range(2007, 2013)
}

# One column per year; no rows are dropped at this point
extracted_df = pd.DataFrame(extracted_items)

In [None]:
# Keep only the rows that have a value in every column (dropna removes any row with a missing entry)
extracted_df = extracted_df.dropna()
extracted_df

In [None]:
!pip install --upgrade transformers
from transformers import BertTokenizer, BertModel

In [None]:
from transformers import BertTokenizer, BertModel
# Load the pre-trained 'bert-base-uncased' tokenizer and model.
# output_attentions=True is passed when loading the model.
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')
model_bert = BertModel.from_pretrained('bert-base-uncased', output_attentions=True)

In [None]:
import pandas as pd
import torch
import string
from nltk.corpus import stopwords

# Ensure you have the nltk stopwords downloaded
# import nltk
# nltk.download('stopwords')

def extract_keywords_series(financial_texts, model, tokenizer, top_n=150):
    """
    Extract attention-ranked keywords from each text in a Series.

    Each text is tokenized (truncated to 512 tokens) and passed through the
    model. Tokens are ranked by the last-layer attention weights in the row
    of the first token ([CLS] for a BERT tokenizer), summed over heads. The
    top_n ranked tokens are kept and then special tokens, English stopwords,
    punctuation and case-insensitive duplicates are removed, so a result
    usually holds fewer than top_n entries.

    Parameters:
    financial_texts: pandas Series where each row contains a text (10-K filing)
    model: Pretrained BERT model; it is called with output_attentions=True
    tokenizer: Pretrained BERT tokenizer
    top_n: Number of top-ranked tokens considered for each document

    Returns:
    A pandas Series where each row contains the list of extracted keywords for
    that filing. Rows whose text is not a string (None/NaN) yield an empty list.
    """
    # Initialize stopwords and punctuation filters
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)
    special_tokens = ('[CLS]', '[SEP]', '[PAD]')

    def clean_keywords(keywords):
        """
        Remove stopwords, punctuation and case-insensitive duplicates, keeping first-seen order.
        """
        cleaned_keywords = []
        seen_keywords = set()  # Lower-cased keywords already emitted

        for keyword in keywords:
            lowered = keyword.lower()
            if lowered in stop_words or keyword in punctuation or lowered in seen_keywords:
                continue
            cleaned_keywords.append(keyword)
            seen_keywords.add(lowered)

        return cleaned_keywords

    def extract_keywords(text, model, tokenizer, top_n=150):
        """
        Return the cleaned, attention-ranked tokens for a single document.
        """
        # A missing filing (None/NaN) cannot be tokenized; report no keywords
        # for that row instead of aborting the whole Series.
        if not isinstance(text, str):
            return []

        # Tokenize the input text (anything beyond 512 tokens is truncated)
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        input_ids = inputs['input_ids']

        # Forward pass without gradient tracking; only the attention maps are needed
        with torch.no_grad():
            outputs = model(**inputs, output_attentions=True)
        attentions = outputs.attentions  # One tensor per layer

        # Last layer, batch dimension removed: [num_heads, seq_len, seq_len]
        last_layer_attention = attentions[-1].squeeze(0)

        # Sum over heads: [seq_len, seq_len]
        attention_scores = last_layer_attention.sum(0)

        # Row 0 belongs to the first token ([CLS]): one score per token in the sequence
        cls_attention = attention_scores[0]

        # Index the single batch entry explicitly so the result is always a list of ids
        tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())

        # Rank tokens from highest to lowest score
        ranked_pairs = sorted(
            zip(tokens, cls_attention.tolist()),
            key=lambda pair: pair[1],
            reverse=True,
        )

        # Take the top N ranked tokens, then discard special tokens
        keywords = [token for token, _ in ranked_pairs[:top_n] if token not in special_tokens]

        return clean_keywords(keywords)

    # Apply the extract_keywords function to each row in the financial_texts Series
    keywords_series = financial_texts.apply(lambda text: extract_keywords(text, model, tokenizer, top_n))

    # Return the Series containing keywords for each document
    return keywords_series

In [None]:
# extracted_df is built from a dict keyed by the integer years 2007-2012, so its
# columns are integers: select the 2007 column with 2007, not the string '2007'.
# NOTE(review): the cells in this section create `model_bert`/`tokenizer_bert`;
# `model`/`tokenizer` must come from an earlier cell. Confirm which pair is intended.
extract_keywords_series(extracted_df[:5][2007], model, tokenizer, top_n=150)

In [None]:

# Getting keywords for sentiment: load the 'bert-base-uncased' tokenizer and model.
# The tokenizer is bound to tokenizer_bert and the model to model_base.
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')
model_base = BertModel.from_pretrained('bert-base-uncased', output_attentions=True)

# Sheet 2 Duplicate

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the 10-K dataset (with precomputed similarity columns, judging by the file name) as data_sim
data_sim = pd.read_csv('/data/workspace_files/lazy_price_replication/10k_final_with_ticker_name_filtered_w_similarity.csv')

In [None]:
# Collect the 2007 statement column into a plain Python list
text = data_sim['statement2007'].to_list()

In [None]:
# Drop rows that have no ticker. data_sim is rebound, so every later cell sees only rows with a ticker.
data_sim = data_sim.dropna(subset=['ticker'])

In [None]:
# Names of the filing-date columns for 2007 through 2012
date_columns = [f'file_date{year}' for year in range(2007, 2013)]

# Parse each column's YYYYMMDD values and pool them into one flat list.
# errors='coerce' turns values that cannot be parsed into NaT.
all_dates = [
    parsed_date
    for col in date_columns
    for parsed_date in pd.to_datetime(data_sim[col].to_list(), format='%Y%m%d', errors='coerce')
]

In [None]:
# range(2007, 2008) yields only 2007, so a single reporting-period column is used here
date_columns = [f'report_period_end_date{year}' for year in range(2007, 2008)]

# Parse the YYYYMMDD values into one flat list.
# errors='coerce' turns values that cannot be parsed into NaT.
all_dates_rep = [
    parsed_date
    for col in date_columns
    for parsed_date in pd.to_datetime(data_sim[col].to_list(), format='%Y%m%d', errors='coerce')
]

In [None]:
# Put the pooled filing dates into a one-column DataFrame
dates_df = pd.DataFrame(all_dates, columns=['date'])

# Derive the calendar year and the weekly period of each date
# (to_period('W') produces weekly periods, not quarters)
dates_df = dates_df.assign(
    year=dates_df['date'].dt.year,
    week=dates_df['date'].dt.to_period('W'),
)

# Number of dates falling in each week, in chronological order.
# The variable says "quarter" but the counts are per week.
quarter_counts = dates_df['week'].value_counts().sort_index()

# Bar chart of the weekly counts
fig, ax = plt.subplots(figsize=(10, 6))
quarter_counts.plot(kind='bar', color='skyblue', edgecolor='black', ax=ax)
ax.set_title('Number of Filings per week (2007-2012)', fontsize=16)
ax.set_xlabel('week', fontsize=12)
ax.set_ylabel('Number of Dates', fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Put the pooled reporting-period end dates into a one-column DataFrame
dates_df = pd.DataFrame(all_dates_rep, columns=['date'])

# Derive the calendar year and the weekly period of each date
# (to_period('W') produces weekly periods, not quarters)
dates_df['year'] = dates_df['date'].dt.year
dates_df['week'] = dates_df['date'].dt.to_period('W')

# Number of dates falling in each week, in chronological order.
# The variable says "quarter" but the counts are per week.
quarter_counts = dates_df['week'].value_counts().sort_index()

# Plot the histogram
plt.figure(figsize=(10, 6))
quarter_counts.plot(kind='bar', color='skyblue', edgecolor='black')
# all_dates_rep is filled from the report_period_end_date2007 column only, so the
# title names that single year rather than the 2007-2012 span it used to claim.
plt.title('Reporting period per week (2007)', fontsize=16)
plt.xlabel('week', fontsize=12)
plt.ylabel('Number of Dates', fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
import pandas as pd
import numpy as np

# Build one row per (year, filing quarter): the tickers in the lowest and highest
# similarity quintiles among the filings made in that quarter, together with the
# following quarter in which they would be traded.

# Define the years you want to analyze
years = [2008, 2009, 2010, 2011, 2012]

# Result-column label -> prefix of the similarity column in data_sim
similarity_measures = {
    'Consine Similarity': 'consine_similarity',
    'Jaccard Similarity': 'jaccard_similarity',
    'Min Edit Distance Similarity': 'min_edit_distance_similarity',
}


def get_quintiles(sorted_df, col_name):
    """
    Return (top, bottom) lists of `col_name` values from the first and last 20% of rows.

    The frame is used in the order given. The callers below pass it sorted
    ascending by a similarity score, so "top" holds the lowest scores and
    "bottom" the highest. The quintile size is rounded up, so any non-empty
    frame contributes at least one row to each list.
    """
    quintile_size = int(np.ceil(len(sorted_df) * 0.20))
    top_quintile = sorted_df.head(quintile_size)[col_name].to_list()
    bottom_quintile = sorted_df.tail(quintile_size)[col_name].to_list()
    return top_quintile, bottom_quintile


# Initialize an empty list to store the data dictionaries
data_list = []

for year in years:
    # Convert the 'file_date' column for the year to datetime format (stored back into data_sim)
    file_date_col = f'file_date{year}'
    data_sim[file_date_col] = pd.to_datetime(data_sim[file_date_col], format='%Y%m%d')

    # (filing window start, filing window end, trading start, trading end, trading quarter).
    # Each filing window is traded in the quarter that follows it; the first window
    # starts on Dec 31 of the previous year.
    quarters = [
        (f'{year-1}-12-31', f'{year}-03-31', f'{year}-04-01', f'{year}-06-30', 'Q2'),
        (f'{year}-04-01', f'{year}-06-30', f'{year}-07-01', f'{year}-09-30', 'Q3'),
        (f'{year}-07-01', f'{year}-09-30', f'{year}-10-01', f'{year}-12-31', 'Q4'),
        (f'{year}-10-01', f'{year}-12-31', f'{year+1}-01-01', f'{year+1}-03-31', 'Q1'),
    ]

    for window_start, window_end, trade_start, trade_end, trading_quarter in quarters:
        start_date = pd.to_datetime(window_start)
        end_date = pd.to_datetime(window_end)
        trading_start = pd.to_datetime(trade_start)
        trading_end = pd.to_datetime(trade_end)

        # Filings made inside the window (both ends inclusive)
        filtered_data = data_sim[data_sim[file_date_col].between(start_date, end_date)]

        data_dict = {
            'year': year,
            'quarter': f"{trading_start.year}-{trading_quarter}",
            'start_date': start_date,
            'end_date': end_date,
            'trading_start': trading_start,
            'trading_end': trading_end,
        }

        for label, prefix in similarity_measures.items():
            sim_col = f'{prefix}_{year-1}_to_{year}'
            # sort_values places NaN last, which would put filings without a
            # similarity score into the highest-similarity quintile; rank only
            # the rows that actually have a score for this measure.
            ranked = filtered_data.dropna(subset=[sim_col]).sort_values(by=sim_col, ascending=True)
            top_tickers, bottom_tickers = get_quintiles(ranked, 'ticker')
            data_dict[f'Top Quintile {label}'] = top_tickers
            data_dict[f'Bottom Quintile {label}'] = bottom_tickers

        # Append the dictionary to the list
        data_list.append(data_dict)

# Create a DataFrame from the list of dictionaries
result_df = pd.DataFrame(data_list)

# Display the resulting DataFrame
result_df

In [None]:
price_data = pd.read_csv('/data/workspace_files/lazy_price_replication/all_ticker_prices.csv', index_col=0)

# Convert 'Date' to datetime
price_data['Date'] = pd.to_datetime(price_data['Date'])

# The return below uses the first and last row of each group, which are only the
# start and end of the quarter if rows are in date order; do not rely on file order.
price_data = price_data.sort_values(['ticker', 'Date'])

# Extract year and quarter from 'Date'
price_data['year'] = price_data['Date'].dt.year
price_data['quarter'] = price_data['Date'].dt.quarter

# Per ticker and calendar quarter: (last close - first close) / first close.
# Selecting 'Close' before apply keeps the grouping columns out of the applied data.
quarterly_returns = (
    price_data.groupby(['ticker', 'year', 'quarter'])['Close']
    .apply(lambda close: (close.iloc[-1] - close.iloc[0]) / close.iloc[0])
    .reset_index(name='quarterly_return')
)

# Replace the numeric quarter with a label such as '2008-Q2'
quarterly_returns['quarter'] = quarterly_returns['year'].astype(str) + '-Q' + quarterly_returns['quarter'].astype(str)

# Keep only the columns needed downstream
quarterly_prices = quarterly_returns[['ticker', 'quarter', 'quarterly_return']]

# price_df is the name the portfolio cells use
price_df = quarterly_prices

In [None]:
import pandas as pd

# Long-short portfolio on the Jaccard quintiles: for each row of result_df, go long
# the 'Bottom Quintile' tickers and short the 'Top Quintile' tickers, using the
# returns in price_df for that row's quarter label.

# Initialize an empty list to store portfolio returns
portfolio_returns = []

# Loop through each row of result_df to create the portfolio
for _, row in result_df.iterrows():
    # Get the tickers for long and short positions
    long_stocks = row['Bottom Quintile Jaccard Similarity']
    short_stocks = row['Top Quintile Jaccard Similarity']

    in_quarter = price_df['quarter'] == row['quarter']

    # Returns actually available for each leg in this quarter
    long_returns = price_df.loc[in_quarter & price_df['ticker'].isin(long_stocks), 'quarterly_return'].dropna()
    short_returns = price_df.loc[in_quarter & price_df['ticker'].isin(short_stocks), 'quarterly_return'].dropna()

    # Skip quarters where either leg has no usable return
    if len(long_returns) > 0 and len(short_returns) > 0:
        # Equal weights over the stocks that have a return. Dividing by the size
        # of the ticker list instead would count every ticker without price data
        # as a 0% return and shrink the leg toward zero.
        portfolio_return = long_returns.mean() - short_returns.mean()
        portfolio_returns.append({
            'quarter': row['quarter'],
            'portfolio_return': portfolio_return
        })

# Create a DataFrame from the portfolio returns; naming the columns keeps them
# present even when no quarter produced a return
portfolio_df = pd.DataFrame(portfolio_returns, columns=['quarter', 'portfolio_return'])

# Display the resulting portfolio returns DataFrame
print(portfolio_df)

In [None]:

# Inspect the first row of result_df with the same leg assignment as the portfolio
# loop: long the 'Bottom Quintile' tickers, short the 'Top Quintile' tickers.
long_stocks = result_df['Bottom Quintile Jaccard Similarity'].iloc[0]
short_stocks = result_df['Top Quintile Jaccard Similarity'].iloc[0]

# Get the returns for the long stocks
long_returns = price_df[price_df['ticker'].isin(long_stocks) & (price_df['quarter'] == result_df['quarter'].iloc[0])]['quarterly_return']

# Get the returns for the short stocks
short_returns = price_df[price_df['ticker'].isin(short_stocks) & (price_df['quarter'] == result_df['quarter'].iloc[0])]['quarterly_return']

In [None]:
# Print the returns selected for the long leg
print(long_returns)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Starting balance that the compounded returns are applied to
initial_investment = 10000  # or any amount you want

# Compound the per-period portfolio returns into a running cumulative return
portfolio_df['cumulative_return'] = portfolio_df['portfolio_return'].add(1).cumprod().sub(1)

# Account balance after each period
portfolio_df['account_value'] = portfolio_df['cumulative_return'].add(1).mul(initial_investment)

# Line chart of the account balance by quarter label
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    portfolio_df['quarter'],
    portfolio_df['account_value'],
    marker='o',
    linestyle='-',
    color='blue',
)
ax.set_title('Portfolio Account Value Over Time')
ax.set_xlabel('Quarter')
ax.set_ylabel('Account Value ($)')
plt.xticks(rotation=45)
ax.grid()
plt.tight_layout()
plt.show()