In [None]:
import boto3
import pandas as pd
import io
import json
from botocore.exceptions import ClientError
import os


# The target S3 bucket name is stored as plain text on the notebook instance;
# edit that file to point the script at a different bucket.
with open('/home/ec2-user/SageMaker/bucket_name.txt', 'r') as file:
    bucket_name = file.read().strip()

# S3 object (Excel workbook) to process
file_name = 'sfdc_test.xlsx'
boto3.setup_default_session()


# Read the comma-separated custom category list from the file.
# Every entry is trimmed and empty entries are dropped: the model's reply is
# stripped before being compared to this list, so an entry such as ' B'
# (from "A, B") could otherwise never match.
with open('/home/ec2-user/SageMaker/custom_list.txt', 'r') as file:
    custom_list = [
        entry.strip()
        for entry in file.read().split(',')
        if entry.strip()
    ]

print(f"Bucket name: {bucket_name}")
print(f"Custom list: {custom_list}")



# Create the S3 and Bedrock runtime clients shared by the functions below.
# Any failure is reported and then re-raised so the script stops here.
try:
    s3_client = boto3.client(service_name='s3')
    bedrock_client = boto3.client(service_name='bedrock-runtime')
    print("Successfully initialized AWS clients")
except Exception as e:
    print(f"Error initializing AWS clients: {str(e)}")
    raise

def read_excel_from_s3(bucket, key):
    """Download an Excel object from S3 and parse it into a DataFrame.

    Args:
        bucket: Name of the S3 bucket holding the workbook.
        key: Object key of the workbook inside the bucket.

    Returns:
        The DataFrame produced by ``pd.read_excel`` with the openpyxl engine,
        using row 0 as the header.

    Raises:
        ClientError: Reported and re-raised when the S3 request fails.
    """
    try:
        s3_object = s3_client.get_object(Bucket=bucket, Key=key)
        workbook_stream = io.BytesIO(s3_object['Body'].read())
        # Row 0 supplies the column names
        frame = pd.read_excel(workbook_stream, header=0, engine='openpyxl')
        print(f"Successfully read file {key} from bucket {bucket}")
        return frame
    except ClientError as e:
        print(f"Error reading file from S3: {str(e)}")
        raise

# Load the workbook from S3 and verify it has every column the
# categorization step reads. Any failure is reported and re-raised.
try:
    df = read_excel_from_s3(bucket_name, file_name)
    print("Original DataFrame:")

    # All three of these columns must be present in the sheet
    required_columns = ['Opportunity Name', 'Description', 'Opportunity Details']
    missing_columns = [
        name for name in required_columns if name not in df.columns
    ]
    if len(missing_columns) > 0:
        raise ValueError(f"Missing required columns: {', '.join(missing_columns)}")

except Exception as e:
    print(f"Failed to read Excel file or process DataFrame: {str(e)}")
    raise

# The categories are a flat (single-level) list taken from the custom list
ai_categories = custom_list

def categorize_opportunity(description, opportunity_name, opportunity_details):
    """Ask the Bedrock-hosted Claude model to pick one category for an opportunity.

    The prompt lists the entries of the module-level ``ai_categories`` and
    asks the model to answer with exactly one of them, or 'Not Defined'.

    Args:
        description: Free-text description of the opportunity.
        opportunity_name: Name of the opportunity; also used in the error log.
        opportunity_details: Additional free-text details.

    Returns:
        The matching entry of ``ai_categories`` in its own spelling, or
        'Not Defined' when the reply matches no category or the call fails.
        No exception is propagated; failures are printed instead.
    """
    prompt = f"""Analyze the following opportunity information and categorize it into one of these AI categories:
    {', '.join(ai_categories)}

    If it doesn't fit any category or there's not enough information, respond with 'Not Defined'.
    Be conservative in your categorization. If you're not sure, choose 'Not Defined'.

    Opportunity Name: {opportunity_name}
    Description: {description}
    Opportunity Details: {opportunity_details}

    Respond with only the category name, without any additional text.

    Category:"""

    body = json.dumps({
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 100,
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ],
        "temperature": 0.1,
        "top_p": 0.9,
    })

    def normalize(text):
        # Ignore letter case, surrounding quotes/backticks and a trailing period
        return text.strip().strip('"\'`.').strip().casefold()

    try:
        response = bedrock_client.invoke_model(
            body=body,
            modelId="anthropic.claude-3-sonnet-20240229-v1:0"  # Claude 3 Sonnet model ID
        )

        response_body = json.loads(response.get('body').read())
        # 'content' may be missing or an empty list; fall back to one empty block
        content_blocks = response_body.get('content') or [{}]
        category = content_blocks[0].get('text', '').strip()

        # Exact match first, so previously accepted replies behave as before
        if category in ai_categories:
            return category

        # Otherwise accept a reply that differs only in harmless formatting
        # and return the category in its canonical spelling
        wanted = normalize(category)
        if wanted:
            for candidate in ai_categories:
                if normalize(candidate) == wanted:
                    return candidate

        return 'Not Defined'
    except Exception as e:
        # Best effort: a failed call must not abort processing of other rows
        print(f"Error invoking Bedrock model for {opportunity_name}: {str(e)}")
        return 'Not Defined'

def process_opportunity(row):
    """Categorize one opportunity row and build its hashtag-tagged name.

    Args:
        row: One row of the opportunities DataFrame. 'Opportunity Name' is
            required; 'Description' and 'Opportunity Details' are optional.

    Returns:
        pd.Series with 'Tagged Opportunity Name' (the name followed by a
        lower-case, whitespace-free hashtag of the category) and 'Category'.
        If anything fails, the row and error are printed and the result is
        the name tagged '#error' with category 'Error'.
    """
    def as_text(value):
        # Missing cells (None/NaN) become an empty string instead of the
        # literal text 'None'/'nan' being sent to the model
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ''
        return str(value)

    # Bound before the try block so the error path can always reference it,
    # even when reading the name itself is what failed
    opportunity_name = ''
    try:
        opportunity_name = row['Opportunity Name']
        description = as_text(row.get('Description', ''))
        opportunity_details = as_text(row.get('Opportunity Details', ''))

        category = categorize_opportunity(description, opportunity_name, opportunity_details)

        # e.g. 'Not Defined' -> '#notdefined'
        hashtag = '#' + ''.join(category.lower().split())
        tagged_opportunity_name = f"{opportunity_name} {hashtag}"

        return pd.Series({
            'Tagged Opportunity Name': tagged_opportunity_name,
            'Category': category
        })
    except Exception as e:
        print(f"Error processing row: {row}")
        print(f"Error details: {str(e)}")
        return pd.Series({
            # f-string formatting also copes with non-string names
            'Tagged Opportunity Name': f"{opportunity_name} #error",
            'Category': "Error"
        })

# Run the categorization on every row and attach the two result columns,
# then show the first rows of the outcome. Failures are reported and re-raised.
try:
    new_columns = df.apply(process_opportunity, axis='columns')
    df[['Tagged Opportunity Name', 'Category']] = new_columns

    print("\nUpdated DataFrame:")
    print(
        df[['Opportunity Name', 'Tagged Opportunity Name', 'Category']]
        .head(5)
        .to_string()
    )
except Exception as e:
    print(f"Error processing opportunities: {str(e)}")
    raise

def save_dataframe_to_s3(df, bucket, key):
    """Serialize a DataFrame to CSV (without the index) and upload it to S3.

    Args:
        df: DataFrame to write.
        bucket: Name of the destination S3 bucket.
        key: Object key to store the CSV under.

    Raises:
        Exception: Any serialization or upload error is reported and re-raised.
    """
    try:
        # With no target given, to_csv returns the CSV text directly
        csv_text = df.to_csv(index=False)
        s3_client.put_object(Bucket=bucket, Key=key, Body=csv_text)
        print(f"Successfully saved file to S3 as {key}")
    except Exception as e:
        print(f"Error saving file to S3: {str(e)}")
        raise

# Upload the tagged DataFrame next to the input as 'tagged_<name>.csv'.
# A failed upload is reported but does not stop the script.
try:
    output_file_name = f"tagged_{file_name.rsplit('.', 1)[0]}.csv"
    save_dataframe_to_s3(df, bucket_name, output_file_name)
except Exception as e:
    print(f"Failed to save updated file to S3: {str(e)}")

print("\nScript execution completed.")
