In [7]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Function to load the unique acronyms listed in a CSV file
def load_acronyms_from_csv(csv_file):
    """
    Read a CSV file and return the distinct values of its 'Acronym' column.

    Parameters:
    -----------
    csv_file : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns:
    --------
    The unique acronyms, in order of first appearance, as produced by
    ``Series.unique()``.

    Raises:
    -------
    KeyError : if the CSV has no 'Acronym' column.
    """
    acronym_column = pd.read_csv(csv_file)['Acronym']
    return acronym_column.unique()

# Function to compute correlation matrix and create a heatmap, and save it as a PDF
def plot_correlation_heatmap(df, acronyms, output_pdf_path):
    """
    Compute the pairwise correlation of selected columns and save it as a heatmap PDF.

    Parameters:
    -----------
    df : pd.DataFrame
        Source data; must contain every column named in `acronyms`.
    acronyms : list-like of str
        Column names to include in the correlation matrix.
    output_pdf_path : str
        Destination path of the generated PDF.

    Raises:
    -------
    KeyError : if any name in `acronyms` is not a column of `df`.
    """
    # Select only the columns corresponding to the acronyms
    selected_data = df[acronyms]

    # Compute the correlation matrix
    correlation_matrix = selected_data.corr()

    # Use an explicit figure/axes pair (large enough to avoid truncated labels)
    # instead of pyplot's implicit "current figure", so that exactly this
    # figure is drawn on, saved and closed.
    fig, ax = plt.subplots(figsize=(14, 12))
    try:
        sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', linewidths=0.5,
                    cbar_kws={'shrink': 0.8},  # Shrink color bar for better fit
                    ax=ax)

        ax.set_title('Correlation Heatmap of Metrics', fontsize=16)

        # Adjust layout to fit everything
        fig.tight_layout()

        fig.savefig(output_pdf_path, format='pdf')
    finally:
        # Always release the figure, even when plotting or saving fails,
        # so repeated calls do not accumulate open figures.
        plt.close(fig)

# Main function to run the analysis
def analyze_metrics_correlation(df, csv_file, output_pdf_path):
    """
    Run the full analysis: read the acronyms from `csv_file`, then write the
    correlation heatmap of those columns of `df` to `output_pdf_path`.

    Parameters:
    -----------
    df : pd.DataFrame
        Data whose columns include the acronyms listed in the CSV.
    csv_file : str
        Path to the CSV file holding the 'Acronym' column.
    output_pdf_path : str
        Destination path of the heatmap PDF.
    """
    plot_correlation_heatmap(
        df,
        load_acronyms_from_csv(csv_file),
        output_pdf_path,
    )

# Example usage (runs immediately when this cell/script is executed).
# The three paths below are placeholders; point them at your own files.
df = pd.read_csv('/content/hackathon_sample_v2.csv')  # Full dataset, loaded into a DataFrame
csv_file = '/content/metrics_acronyms.csv'  # CSV with an 'Acronym' column naming the columns of df to correlate
output_pdf_path = '/content/heatmap_output.pdf'  # Where the heatmap PDF is written

# Read the acronyms, compute the correlation matrix and save the heatmap to output_pdf_path
analyze_metrics_correlation(df, csv_file, output_pdf_path)


In [14]:
import pandas as pd
import json

class StockDataProcessor:
    """
    StockDataProcessor class provides methods to load acronyms from a CSV file, create a filtered DataFrame,
    filter stock tickers based on continuous months of data, and save the results to a JSON file.

    Attributes:
    -----------
    csv_file : str
        Path to the CSV file containing acronyms.
    df : pd.DataFrame
        DataFrame containing the stock data.
    additional_columns : list
        List of additional columns to keep from the original DataFrame.
    acronyms : array-like
        Unique acronyms loaded from the CSV file (as returned by ``Series.unique()``).
    acronyms_df : pd.DataFrame
        Filtered DataFrame containing only the acronyms and additional columns.

    Methods:
    --------
    load_acronyms_from_csv():
        Loads acronyms from the provided CSV file.

    create_acronyms_dataframe():
        Creates a new DataFrame that includes only the acronyms and additional columns.

    filter_tickers_by_continuous_months(min_months=12):
        Filters the DataFrame for stock tickers that have at least `min_months` continuous months of data.

    save_dict_to_json(data, file_path):
        Saves the filtered ticker data to a JSON file in a human-readable format.
    """

    def __init__(self, csv_file, df, additional_columns):
        """
        Initializes the StockDataProcessor with the CSV file, DataFrame, and additional columns.

        The acronyms are read from `csv_file` and the filtered DataFrame is built immediately,
        so construction fails early if the file or any required column is missing.

        Parameters:
        -----------
        csv_file : str
            Path to the CSV file containing the acronyms.
        df : pd.DataFrame
            The full dataset that includes stock ticker data and associated metrics.
        additional_columns : list
            List of additional columns to keep from the original DataFrame (e.g., 'stock_ticker', 'year', 'month').

        Raises:
        -------
        ValueError : if the csv_file or additional_columns are invalid.
        FileNotFoundError : if the CSV file cannot be found.
        KeyError : if the 'Acronym' column or any requested DataFrame column is missing.
        """
        if not csv_file or not isinstance(csv_file, str):
            raise ValueError("Invalid CSV file path provided.")
        if not isinstance(additional_columns, list):
            raise ValueError("additional_columns should be a list.")

        self.csv_file = csv_file
        self.df = df
        self.additional_columns = additional_columns
        self.acronyms = self.load_acronyms_from_csv()
        self.acronyms_df = self.create_acronyms_dataframe()

    def load_acronyms_from_csv(self):
        """
        Loads unique acronyms from the provided CSV file.

        Returns:
        --------
        array-like:
            The unique values of the 'Acronym' column, in order of first appearance
            (the result of ``Series.unique()``).

        Raises:
        -------
        FileNotFoundError : if the CSV file cannot be found.
        KeyError : if the 'Acronym' column is missing from the CSV file.
        """
        try:
            acronyms_df = pd.read_csv(self.csv_file)
        except FileNotFoundError as exc:
            # Re-raise with a clearer message while keeping the original cause attached.
            raise FileNotFoundError(f"The file {self.csv_file} does not exist.") from exc

        if 'Acronym' not in acronyms_df.columns:
            raise KeyError("The CSV file must contain an 'Acronym' column.")

        return acronyms_df['Acronym'].unique()

    def create_acronyms_dataframe(self):
        """
        Creates a new DataFrame that contains only the selected acronyms and additional columns.

        Returns:
        --------
        pd.DataFrame:
            A filtered DataFrame with acronyms and additional columns such as 'stock_ticker', 'year', 'month'.
            Each column appears once, even if it is named both as an acronym and as an additional column.

        Raises:
        -------
        KeyError : if any of the required columns are missing from the DataFrame.
        """
        # dict.fromkeys removes repeated names while preserving order; selecting the same
        # column twice would create duplicate labels and break later row lookups.
        columns_to_keep = list(dict.fromkeys([*self.acronyms, *self.additional_columns]))
        missing_columns = [col for col in columns_to_keep if col not in self.df.columns]

        if missing_columns:
            raise KeyError(f"The following columns are missing from the DataFrame: {missing_columns}")

        return self.df[columns_to_keep]

    @staticmethod
    def _first_continuous_run_start(group, min_months):
        """
        Returns the positional index at which the first run of `min_months` consecutive
        calendar months starts in `group` (rows sorted by year and month), or None.
        """
        # Count months on a single linear axis so the gap between consecutive rows is exact.
        # (A day-based gap such as `days // 30` yields 0 for February -> March.)
        month_index = pd.to_numeric(group['year']) * 12 + pd.to_numeric(group['month'])

        # A row continues the current run only if it is exactly one month after the previous row.
        # The first row, duplicates, gaps and missing values all compare False and start a new run.
        continues_run = month_index.diff().eq(1).tolist()

        current_streak = 0
        start_index = None
        for i, is_continuation in enumerate(continues_run):
            if is_continuation and current_streak > 0:
                current_streak += 1
            else:
                current_streak = 1
                start_index = i

            if current_streak >= min_months:
                return start_index

        return None

    def filter_tickers_by_continuous_months(self, min_months=12):
        """
        Filters the DataFrame for stock tickers that have at least `min_months` continuous months of data.

        For each qualifying ticker, only the first `min_months` rows of its first qualifying
        run are returned. As a side effect, `self.acronyms_df` is re-sorted by
        stock_ticker, year and month.

        Parameters:
        -----------
        min_months : int, optional
            Minimum number of continuous months required (default is 12).

        Returns:
        --------
        dict:
            A dictionary of stock tickers with continuous month data, where the keys are stock tickers, years,
            and months, and the values are dictionaries containing the rest of the columns.

        Example:
        --------
        {
            'AAPL': {
                2021: {
                    1: {'feature1': 123, 'feature2': 456},
                    2: {'feature1': 789, 'feature2': 101}
                }
            }
        }

        Raises:
        -------
        ValueError : if the DataFrame does not contain required columns ('stock_ticker', 'year', 'month').
        """
        # Ensure required columns are present
        required_columns = ['stock_ticker', 'year', 'month']
        if not all(col in self.acronyms_df.columns for col in required_columns):
            raise ValueError(f"The DataFrame must contain the following columns: {required_columns}")

        # Sort the dataframe by stock_ticker, year, and month to ensure it's ordered correctly.
        self.acronyms_df = self.acronyms_df.sort_values(by=required_columns)

        filtered_tickers = {}

        for ticker, group in self.acronyms_df.groupby('stock_ticker'):
            # Positional indices returned by the helper are used with iloc below.
            group = group.reset_index(drop=True)

            start_index = self._first_continuous_run_start(group, min_months)
            if start_index is None:
                continue

            window = group.iloc[start_index:start_index + min_months]

            # tolist() / to_dict() yield plain Python scalars, which keeps the result JSON-serializable.
            years = window['year'].tolist()
            months = window['month'].tolist()
            records = window.drop(columns=required_columns).to_dict(orient='records')

            for year, month, rest_of_columns in zip(years, months, records):
                filtered_tickers.setdefault(ticker, {}).setdefault(year, {})[month] = rest_of_columns

        return filtered_tickers

    def save_dict_to_json(self, data, file_path):
        """
        Saves the filtered ticker data to a JSON file in a human-readable format.

        Parameters:
        -----------
        data : dict
            The dictionary to save.
        file_path : str
            The file path where the JSON data will be saved.

        Raises:
        -------
        ValueError : if the file path is not a valid string.
        """
        if not isinstance(file_path, str):
            raise ValueError("Invalid file path provided.")

        with open(file_path, 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, indent=4)  # indent=4 makes the file human-readable


# Example usage: runs only when this file is executed directly (or as a notebook cell),
# not when it is imported as a module.
if __name__ == "__main__":
    # Load the full stock dataset (placeholder path; replace with your own file)
    df = pd.read_csv('/content/hackathon_sample_v2.csv')  # Your actual DataFrame source
    csv_file = '/content/metrics_acronyms.csv'  # CSV file containing acronyms
    # Identifier columns kept alongside the acronym columns; the filter step below requires all three
    additional_columns = ['stock_ticker', 'year', 'month']

    # Instantiate the class (reads the acronyms CSV and builds the filtered DataFrame)
    processor = StockDataProcessor(csv_file, df, additional_columns)

    # Keep only tickers with at least 12 continuous months of data
    ticker_dict = processor.filter_tickers_by_continuous_months(min_months=12)

    # Save the result as JSON (relative path, so it is written to the current working directory)
    processor.save_dict_to_json(ticker_dict, 'new_stocks_dict.json')
