In [None]:
import os
# Memory saving: request the "expandable segments" mode of PyTorch's CUDA caching allocator.
# NOTE(review): the variable is presumably only honoured if it is set before torch initialises
# CUDA, and torch is not imported in any of the cells visible here — confirm this is still needed.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'


In [None]:
# Mount Google Drive at /content/drive/ so the project folder used by the next cell is reachable.
# NOTE: `google.colab` is a Colab-specific module, so this cell presumably only runs inside Colab.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
# Switch the working directory to the project folder on the mounted Drive; the later cells read
# and write their CSV files with relative paths, so they depend on this directory change.
%cd /content/drive/MyDrive/Lupus-Subreddit-LLM/
# List the folder contents as a quick check that the expected CSV files are present.
%ls

In [None]:
import re
import pandas as pd
import numpy as np

# Load Subreddit Annotated Dataset

* Structured summaries based on concepts framed in the Biopsychosocial Model

In [None]:
# Show full cell contents when frames are rendered (posts and summaries are long strings).
pd.set_option('display.max_colwidth', None)

# Human-annotated lupus subreddit set: discard the leftover index columns (when present) and
# keep just the prompt text and its JSON-style summary.
final_post_df = (
    pd.read_csv("formatted_500_llm_lupus_final_set.csv")
    .drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')
    .loc[:, ['instruction', 'response']]
)

final_post_df.head(5)

In [None]:
# 25-post fine-tuning sample: rename its columns to the shared instruction/response schema.
annotated_post_df = (
    pd.read_csv("llm_finetuning_25_sample.csv")
    .loc[:, ['selftext', 'labeled_summaries']]
    .rename(columns={'selftext': 'instruction', 'labeled_summaries': 'response'})
)
# Not the last expression of the cell, so this preview is evaluated but not rendered.
annotated_post_df.head(5)

# Second annotated set: the edited response is the one carried forward as `response`.
second_set_df = (
    pd.read_csv("filtered_ratings_revised_dw.csv")
    .loc[:, ['instruction', 'response_edited']]
    .rename(columns={'response_edited': 'response'})
)
second_set_df.head(5)


In [None]:
# Stack the three instruction/response sets row-wise under one fresh 0..n-1 index.
combined_df = pd.concat(
    (final_post_df, annotated_post_df, second_set_df),
    axis=0,
    ignore_index=True,
)
combined_df

In [None]:
import json
import pandas as pd
import re
import ast  # To safely convert stringified lists into real lists

# Function to extract, clean, and expand JSON while handling stringified lists
def extract_json_and_expand(response_text):
    """Extract the first flat JSON object from a response and flatten it to scalar values.

    The text between the first ``{`` and the first following ``}`` is parsed as JSON.
    Values that are lists, or strings that look like Python list literals
    (``"['a', 'b']"``), are turned into comma-separated strings. When a key
    occurs more than once in the object, its values are merged instead of the
    later one silently replacing the earlier one:

    * list + list  -> concatenated, duplicates removed, first-seen order kept
    * str + str    -> joined with ``"; "``
    * anything else -> the later value wins

    Args:
        response_text: Raw model/annotator response. Non-string input
            (e.g. NaN) yields an empty dict.

    Returns:
        dict: ``{key: value}`` for the parsed object, or
        ``{"Unstructured_Response": response_text}`` when no ``{...}`` span is
        found or the span is not valid JSON.
    """
    if not isinstance(response_text, str):
        return {}

    # Non-greedy on purpose: the span stops at the FIRST closing brace, so trailing text after
    # the object is ignored. Objects containing a nested "}" therefore fail to parse and fall
    # through to the Unstructured_Response path.
    json_match = re.search(r'\{.*?\}', response_text, re.DOTALL)
    if json_match is None:
        return {"Unstructured_Response": response_text}

    try:
        # object_pairs_hook=list keeps repeated keys visible as (key, value) pairs;
        # a plain dict would already have dropped all but the last occurrence.
        key_value_pairs = json.loads(json_match.group(), object_pairs_hook=list)
    except json.JSONDecodeError:
        return {"Unstructured_Response": response_text}

    cleaned_data = {}
    for key, value in key_value_pairs:
        # Convert stringified lists into real lists; keep the string if it is not a valid literal.
        if isinstance(value, str) and value.startswith("[") and value.endswith("]"):
            try:
                value = ast.literal_eval(value)
            except (ValueError, SyntaxError):
                pass

        if key not in cleaned_data:
            cleaned_data[key] = value
            continue

        existing = cleaned_data[key]
        if isinstance(existing, list) and isinstance(value, list):
            # Order-preserving de-duplication (also works for unhashable items).
            merged = []
            for item in existing + value:
                if item not in merged:
                    merged.append(item)
            cleaned_data[key] = merged
        elif isinstance(existing, str) and isinstance(value, str):
            cleaned_data[key] = existing + "; " + value
        else:
            cleaned_data[key] = value  # Mixed types: keep the latest value

    # str() each item so lists holding numbers/None do not make join() raise TypeError.
    return {
        key: ', '.join(map(str, value)) if isinstance(value, list) else value
        for key, value in cleaned_data.items()
    }


# Parse every response and spread the resulting dict keys out into one column per key.
expanded_json_df = (
    combined_df['response']
    .apply(extract_json_and_expand)
    .apply(pd.Series)
)

# Rows that could not be parsed carry their raw text in Unstructured_Response;
# count them BEFORE the second-pass repair below.
num_failed_before = expanded_json_df['Unstructured_Response'].count()
print(f"Number of failed JSON extractions before cleaning Unstructured_Response: {num_failed_before}")

# Step 2: Process rows where Unstructured_Response is not NaN (fix manually labeled JSON)
def fix_unstructured_response(text):
    """Parse a raw response that is written as a Python dict literal.

    Used for rows the JSON pass could not handle (e.g. single-quoted,
    manually labelled dicts). List values, including lists stored as strings
    such as ``"['a', 'b']"``, are flattened to comma-separated strings.

    Args:
        text: Raw response text; anything that is not a string starting with
            ``{`` and ending with ``}`` is rejected.

    Returns:
        dict: Cleaned ``{key: value}`` mapping, or ``{}`` when the text is
        rejected, cannot be evaluated as a literal, or evaluates to something
        other than a dict (e.g. a set literal).
    """
    if not (isinstance(text, str) and text.startswith("{") and text.endswith("}")):
        return {}

    try:
        json_data = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        # TypeError covers literals such as a dict with an unhashable key.
        return {}

    if not isinstance(json_data, dict):
        return {}  # "{1, 2}" is a valid literal, but it is a set, not a record

    cleaned_data = {}
    for key, value in json_data.items():
        # Convert stringified lists into real lists; keep the string if conversion fails.
        if isinstance(value, str) and value.startswith("[") and value.endswith("]"):
            try:
                value = ast.literal_eval(value)
            except (ValueError, SyntaxError):
                pass

        # str() each item so lists holding numbers/None do not make join() raise TypeError.
        cleaned_data[key] = ', '.join(map(str, value)) if isinstance(value, list) else value

    return cleaned_data

# Second pass: re-parse only the rows whose first-pass JSON extraction failed.
unparsed_responses = expanded_json_df['Unstructured_Response'].dropna()
recovered_records = unparsed_responses.apply(fix_unstructured_response)
fixed_unstructured_df = recovered_records.apply(pd.Series)

# Put source text, first-pass columns and second-pass columns side by side. Keys recovered in
# the second pass reuse first-pass column names, so this frame contains duplicate column labels.
final_parsed_posts = pd.concat([combined_df, expanded_json_df, fixed_unstructured_df], axis=1)

# A row is still failed only if the second pass recovered nothing for it (empty dict).
# Re-counting the Unstructured_Response column would always reproduce the "before" number,
# because that column is not modified by the repair.
num_failed_after = int((recovered_records.map(len) == 0).sum())
print(f"Number of failed JSON extractions after cleaning Unstructured_Response: {num_failed_after}")

In [None]:
# DataFrame.info() writes its report itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
final_parsed_posts.info()


In [None]:
# Render the combined frame (source text plus parsed columns) for a visual check.
# display.max_colwidth was set to None in the loading cell, so cell contents are not truncated.
final_parsed_posts

In [None]:
import pandas as pd

# Column names that occur more than once (each name listed a single time).
duplicate_columns = (
    final_parsed_posts.columns[final_parsed_posts.columns.duplicated()].unique().tolist()
)

# Coalesce EVERY duplicated name before dropping anything: for each row take the first
# non-null value across the same-named columns. Dropping duplicates inside the loop would
# remove the extra copies of all names after the first iteration, so only one column would
# ever be merged and the recovered values of the others would be lost.
merged_columns = {
    col: final_parsed_posts.loc[:, final_parsed_posts.columns == col].bfill(axis=1).iloc[:, 0]
    for col in duplicate_columns
}

# Keep the first occurrence of each name, then overwrite it with the coalesced values.
final_parsed_posts = final_parsed_posts.loc[:, ~final_parsed_posts.columns.duplicated()].copy()
for col, merged in merged_columns.items():
    final_parsed_posts[col] = merged

print("Duplicate columns merged successfully!")

# Display final column count
print(f"Final column count: {final_parsed_posts.shape[1]}")



In [None]:
# DataFrame.info() writes its report itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
final_parsed_posts.info()


In [None]:
# Print one example of a response that the JSON pass could not parse.
print(
    final_parsed_posts
    .loc[:, ['Unstructured_Response']]
    .dropna()
    .head(1)
)


In [None]:
import json
import pandas as pd
import ast  # Safely convert stringified lists into real lists

# Function to convert Unstructured_Response string into a proper dictionary
def fix_unstructured_response(text):
    """Turn a raw response written as a Python dict literal into a flat dict.

    Note: this redefines (and therefore replaces) the function of the same
    name from the earlier parsing cell.

    Value handling:

    * ``'[]'`` or an empty list        -> ``''``
    * list, or a string that is a valid
      list literal (``"['a', 'b']"``)  -> items joined with ``', '``
    * any other value                  -> kept unchanged

    Args:
        text: Raw response text; anything that is not a string starting with
            ``{`` and ending with ``}`` is rejected.

    Returns:
        dict: Cleaned ``{key: value}`` mapping, or ``{}`` when the text is
        rejected, cannot be evaluated as a literal, or evaluates to something
        other than a dict (e.g. a set literal).
    """
    if not (isinstance(text, str) and text.startswith("{") and text.endswith("}")):
        return {}

    try:
        json_data = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        # TypeError covers literals such as a dict with an unhashable key.
        return {}

    if not isinstance(json_data, dict):
        return {}  # a set literal also starts with "{" and ends with "}"

    cleaned_data = {}
    for key, value in json_data.items():
        if isinstance(value, str) and value.startswith("[") and value.endswith("]"):
            try:
                value = ast.literal_eval(value)  # '[]' becomes [], which joins to ''
            except (ValueError, SyntaxError):
                pass  # Not a valid literal: keep the original string

        if isinstance(value, list):
            # Real lists are flattened too, so every parsed cell is a scalar; str() each
            # item so numbers/None inside a list do not make join() raise TypeError.
            cleaned_data[key] = ', '.join(map(str, value))
        else:
            cleaned_data[key] = value

    return cleaned_data

# Re-parse the rows whose JSON extraction failed.
fixed_unstructured_df = final_parsed_posts['Unstructured_Response'].dropna().apply(fix_unstructured_response).apply(pd.Series)

# Only parsed annotation columns may be written back. The source columns must be excluded:
# reindex(..., fill_value='') creates every missing expected column filled with '', and
# DataFrame.update() copies all non-NA values, so leaving 'instruction'/'response' in the
# list would overwrite the real post text and summary of the recovered rows with ''.
protected_columns = ['instruction', 'response', 'Unstructured_Response']
expected_columns = [col for col in final_parsed_posts.columns if col not in protected_columns]
fixed_unstructured_df = fixed_unstructured_df.reindex(columns=expected_columns, fill_value='')

# Merge the corrected parsed data back into the main dataframe (in place, aligned on index).
final_parsed_posts.update(fixed_unstructured_df)



In [None]:
# DataFrame.info() writes its report itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
final_parsed_posts.info()


final_parsed_posts.tail(5)

In [None]:
# parsed 553 posts

In [None]:
# Persist the expanded table (source text + parsed columns) into the current working directory.
# NOTE(review): `index=False` is not passed, so the row index is presumably written as an extra
# unnamed first column — possibly the origin of the 'Unnamed: 0' columns dropped in the loading
# cell; confirm before changing, since readers of this file may rely on the current layout.
final_parsed_posts.to_csv("final_summary_post_expanded.csv")

In [None]:
import pandas as pd
# Rename 'Sex/Gender' column to 'Sex_Gender' if it exists
final_parsed_posts = final_parsed_posts.rename(columns={'Sex/Gender': 'Sex_Gender'})

# Summarize every column to the right of 'response', except the raw-text fallback column.
# Filtering (instead of list.remove) keeps the cell working when no row failed to parse
# and 'Unstructured_Response' is therefore absent.
columns_to_summarize = [
    col
    for col in final_parsed_posts.loc[:, 'response':].columns
    if col not in ('response', 'Unstructured_Response')
]

# Create a dictionary to store summary DataFrames for each column
summary_dfs = {}

# Generate unique value counts for each column
for col in columns_to_summarize:
    value_counts = (
        final_parsed_posts[col]
        .dropna()
        .astype(str)        # the .str accessor raises on numeric / all-NaN (non-object) columns
        .str.split(', ')    # parsed cells hold comma-separated values
        .explode()
        .value_counts()
        .reset_index()
    )
    value_counts.columns = ['Value', 'Frequency']
    value_counts['Themes'] = ''  # Blank column for themes
    summary_dfs[col] = value_counts

    # Save to CSV; characters that cannot appear in a file name (path separators etc.)
    # are replaced so a column name can never point into another directory.
    safe_name = re.sub(r'[\\/:*?"<>|]+', '_', str(col))
    csv_filename = f"{safe_name}_summary.csv"
    value_counts.to_csv(csv_filename, index=False)
    print(f"Saved: {csv_filename}")

print("All summaries have been saved as CSV files.")
