In [None]:
!pip install together -q

In [None]:
import os
import pandas as pd
from tqdm import tqdm
import json
import re
from together import Together

# Function to generate summary prompt
def generate_case_summary_prompt(case_text):
    """Build the prompt asking the model for a template-based case summary.

    Args:
        case_text: Text of the case; it is interpolated verbatim between
            double quotes inside the prompt.

    Returns:
        The full prompt string, including the fixed sentence template and the
        request for comma-separated output.
    """
    summary_prompt = f"""
    Summarize the case text using this template as accurately as possible while
    maintaining correct English grammar. Do not add extra information:
    "The <active agent> did <action> to <passive agent> which led to
    <consequence>. The <active agent> had <good/bad/neutral> moral intention,
    however, the <action> violated <ethical principle> ethical principle which
    caused <ethical issue>."
    Case text is as follows: "{case_text}"

    give the output in comma-separated format
    """
    return summary_prompt

# Function to generate feature extraction prompt
def inc_prompt(selftext):
    """Build the prompt asking the model to extract ethical-case features.

    Args:
        selftext: Text to analyse; it is interpolated verbatim into the first
            line of the prompt.

    Returns:
        The full prompt string listing the fourteen features to extract and
        requesting a JSON-only reply.
    """
    return f"""
    Analyze the following sentence: {selftext} to extract the following features in stick to  1-2 words from the cases (only JSON format output needed, no unnecessary output needed):

    - Active agent:  The individual or entity that performs an action or initiates a process within a scenario. 
    - Passive agent: The individual or entity that is affected or impacted by the action performed by the active agent.
    - Action done by active agent: The specific act or behavior undertaken by the active agent that influences the passive agent.
    - Domain : The context or area (e.g., healthcare, business, technology) in which the action takes place, influencing the ethical implications.
    - Ethical issue(s): The moral conflicts that arise from the action, questioning what is right or wrong in the scenario.
    - Consequence: The outcome or effect that results from the action of the active agent on the passive agent or the environment.
    - Severity of consequence: The degree of harm or benefit caused by the consequence, ranging from mild to severe. 
    - Utility of consequence: Determine whether it benefits or harms stakeholders.
    - Duration of consequence: The length of time for which the consequence persists, either immediately or over the long term.
    - Moral intention of active agent: The ethical purpose or goal that the active agent aims to achieve through their actions.
    - Ethical principles upheld: The moral values or standards that are supported and respected by the active agent's actions.
    - Ethical principles violated: The moral values or standards that are disregarded or harmed by the active agent's actions.
    - Relationship between active agent and passive agent: The nature of the interaction or connection between the individuals or entities involved, which may affect the ethical dynamics.
    - Moral decision: Was the action of the active agent ethical or not based on the situation? Choose from ["Morally right","Morally wrong","Morally grey"].

    Provide your response in JSON format only.
    """

# Function to clean response
def clean_response(response_text):
    """Strip Markdown code-fence markers and surrounding whitespace.

    Args:
        response_text: Raw reply text, possibly wrapped in a fenced block.

    Returns:
        The text with every fence marker (with or without the ``json`` tag)
        removed and leading/trailing whitespace trimmed.
    """
    # The tagged alternative comes first so an opening fence loses its tag too.
    return re.sub(r"`{3}json|`{3}", "", response_text).strip()

# Function to extract JSON blocks from raw response
def extract_json_from_response(response_text):
    """Parse a model reply as JSON, tolerating missing replies and extra prose.

    Args:
        response_text: Reply text to parse, or ``None`` when no reply is
            available (for example after a failed API call).

    Returns:
        The decoded JSON value, or ``None`` when there is no reply or no valid
        JSON could be recovered from it.
    """
    # A missing reply used to reach json.loads and raise an uncaught TypeError.
    if response_text is None:
        print("No response text to parse as JSON.")
        return None

    try:
        return json.loads(response_text)
    except json.JSONDecodeError as e:
        # Models often wrap the object in prose ("Here is the JSON: {...}");
        # retry on the outermost brace-delimited span before giving up.
        if isinstance(response_text, str):
            start = response_text.find("{")
            end = response_text.rfind("}")
            if 0 <= start < end:
                try:
                    return json.loads(response_text[start:end + 1])
                except json.JSONDecodeError:
                    pass  # fall through and report the original error
        print(f"JSONDecodeError: {e} for response: {response_text[:100]}...")
        return None

# Function to run the agent and get a response
def run_agent(client, prompt, model, content):
    """Send one prompt to the chat-completions endpoint and return the reply.

    Args:
        client: Client object exposing ``chat.completions.create``.
        prompt: The user message to send.
        model: Model identifier passed through to the API.
        content: Persona/instruction text that frames the conversation.

    Returns:
        The reply text after ``clean_response``, or ``None`` when the call or
        the post-processing raised any exception (the error is printed).
    """
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                # The persona is an instruction to the model, so it goes in
                # the "system" role; "assistant" would present it as something
                # the model itself had already said.
                {"role": "system", "content": content},
                {"role": "user", "content": prompt},
            ],
        )
        response_text = clean_response(response.choices[0].message.content.strip())
        return response_text
    except Exception as e:
        # Deliberately best-effort: one failed request must not stop the batch.
        print(f"Error in API call: {e}")
        return None

# Set API key for Together client
# NOTE(review): this assignment overwrites any TOGETHER_API_KEY already present
# in the environment with the placeholder. Replace the placeholder with a real
# key (or load it from a secret store) before running, and never commit a key.
os.environ['TOGETHER_API_KEY'] = "YOUR_API_KEY"
client = Together(api_key=os.environ.get("TOGETHER_API_KEY"))

# Define the LLM model
llm_model = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"

# Load dataset
data = pd.read_json("/kaggle/working/reddit_new.json")

# One flat dict per case: id, original text, summary, then extracted features
results = []

# Process each case text; case ids are 1-based row positions
for case_id, case_text in enumerate(tqdm(data["selftext"]), start=1):
    # Generate summary
    summary_response = run_agent(
        client,
        generate_case_summary_prompt(case_text),
        llm_model,
        "You are a legal domain expert generating case summaries."
    )

    # Generate features
    feature_response = run_agent(
        client,
        inc_prompt(case_text),
        llm_model,
        "You are a text analyst and legal domain expert."
    )

    # run_agent returns None when the API call fails; skip parsing in that
    # case so a single failed request cannot abort the whole run.
    feature_data = None
    if feature_response is not None:
        feature_data = extract_json_from_response(feature_response)

    # Combine results in flat structure
    result = {
        "case_id": case_id,
        "selftext": case_text,
        "summary": summary_response
    }

    # Merge features into the flat structure. Only a non-empty JSON object can
    # be merged; a list or scalar reply is recorded as an error instead of
    # making dict.update raise.
    if isinstance(feature_data, dict) and feature_data:
        result.update(feature_data)
    else:
        result["Error"] = "Invalid JSON response for features"

    results.append(result)

# Save combined results to a JSON file
output_file = "summary_feat_new_reddit.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4)

print(f"Combined results saved as '{output_file}'")


In [None]:
import json
import pandas as pd

# Source JSON produced by the summarisation step and the Excel target
input_file = "/kaggle/working/summary_feat_new_reddit.json"
output_file = "filter-new.xlsx"

# Read every record from the JSON file
with open(input_file, "r") as f:
    data = json.load(f)

# Keep only the records that do not carry an "Error" key
filtered_data = list(filter(lambda entry: "Error" not in entry, data))

# Tabulate the surviving records and write them out without the index column
df = pd.DataFrame(filtered_data)
df.to_excel(output_file, index=False)

print(f"Filtered data saved to {output_file}")


In [None]:
# import pandas as pd

# # Function to remove rows where all columns after the 3rd column are empty
# def filter_empty_rows(input_excel, output_excel):
#     # Load the Excel sheet into a DataFrame
#     df = pd.read_excel(input_excel)
    
#     # Find rows where all columns after the 3rd column are empty
#     filtered_df = df.loc[~df.iloc[:, 3:].isnull().all(axis=1)]
    
#     # Save the filtered DataFrame to a new Excel file
#     filtered_df.to_excel(output_excel, index=False)
#     print(f"Filtered rows saved to {output_excel}")

# # Example usage
# input_excel = "/kaggle/working/filtered_output_UK.xlsx"  # Replace with your input file name
# output_excel = "clean.xlsx"  # Replace with your desired output file name

# filter_empty_rows(input_excel, output_excel)
import pandas as pd
import json
import re

def clean_excel_file(input_excel, output_excel, invalid_output="invalid_rows.xlsx"):
    """Drop rows whose feature columns contain JSON-like text and save the rest.

    A row is treated as invalid when any string cell after the third column
    contains a ``{...}`` or ``[...]`` span on a single line.

    Args:
        input_excel: Path of the Excel sheet to read.
        output_excel: Path the cleaned sheet is written to.
        invalid_output: Path the rejected rows are written to for review; the
            file is only created when at least one row was rejected.
    """
    # Load the Excel sheet
    df = pd.read_excel(input_excel)

    # Compiled once instead of being re-resolved for every cell.
    json_like = re.compile(r"\{.*?\}|\[.*?\]")

    def is_invalid_row(row):
        # Positional slicing skips the first three columns regardless of
        # their labels.
        return any(
            isinstance(value, str) and json_like.search(value)
            for value in row.iloc[3:]
        )

    # Evaluate the row check a single time and reuse the mask for both halves.
    if df.empty:
        # Row-wise apply on an empty frame does not yield a boolean Series.
        invalid_mask = pd.Series(False, index=df.index, dtype=bool)
    else:
        invalid_mask = df.apply(is_invalid_row, axis=1)

    cleaned_df = df[~invalid_mask]
    invalid_rows = df[invalid_mask]

    # Save the cleaned DataFrame to a new Excel file
    cleaned_df.to_excel(output_excel, index=False)
    print(f"Cleaned data saved to '{output_excel}'.")

    # Save the invalid rows for debugging
    if not invalid_rows.empty:
        invalid_rows.to_excel(invalid_output, index=False)
        print(f"Invalid rows saved to '{invalid_output}' for review.")

# Run the clean-up on the filtered sheet produced earlier in the notebook
input_excel = "/kaggle/working/filter-new.xlsx"  # sheet to clean
output_excel = "final_new.xlsx"  # destination for the cleaned sheet

clean_excel_file(input_excel=input_excel, output_excel=output_excel)
