# Tasks 1. Named Entity Recognition (NER)

- Implement a function called perform_ner(text) that takes a string text as input and performs Named Entity Recognition (NER) using a pre-trained model from Hugging Face.
- The function should return a list of tuples, where each tuple contains a named entity and its corresponding entity type.
- Write another function called display_entities(entities) that takes the list of tuples returned by perform_ner(text) and displays the named entities along with their entity types in a readable format.

In [8]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# 모델 및 토크나이저 로드
MODEL_NAME = "xlm-roberta-large-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

# Named Entity Recognition (NER) 함수
def perform_ner(text):
    """
    Hugging Face 모델을 사용해 텍스트에서 Named Entity를 추출합니다.
    :param text: 분석할 입력 텍스트 (string)
    :return: (entity, type) 튜플의 리스트
    """
    try:
        # 텍스트를 토큰화
        tokens = tokenizer(text, return_tensors="pt", truncation=True, is_split_into_words=False)
        input_ids = tokens["input_ids"]
        attention_mask = tokens["attention_mask"]

        # 모델 예측 수행
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        # 토큰 ID를 실제 단어와 매핑
        tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
        labels = predictions[0].tolist()

        # 모델의 레이블 맵핑 가져오기
        label_map = model.config.id2label
        results = []

        for token, label_id in zip(tokens, labels):
            label = label_map[label_id]
            if label != "O":  # "O"는 엔터티가 아님
                results.append((token, label))

        return results
    except Exception as e:
        print(f"Error during NER processing: {e}")
        return []

# Named Entity 결과 출력 함수
def display_entities(entities):
    """
    Named Entity Recognition 결과를 사람이 읽을 수 있는 형태로 출력합니다.
    :param entities: NER 결과로 생성된 (entity, type) 튜플 리스트
    """
    if not entities:
        print("엔터티를 찾을 수 없습니다.")
        return

    print("Named Entities:")
    for entity, entity_type in entities:
        print(f"엔터티: {entity}, 유형: {entity_type}")

# 사용 예제
if __name__ == "__main__":
    sample_text = "Jisoo was born in Seoul, the capital of South Korea. 지수는 27살이다."
    ner_results = perform_ner(sample_text)
    display_entities(ner_results)


Some weights of the model checkpoint at xlm-roberta-large-finetuned-conll03-english were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Named Entities:
엔터티: ▁Ji, 유형: I-PER
엔터티: soo, 유형: I-PER
엔터티: ▁Seoul, 유형: I-LOC
엔터티: ▁South, 유형: I-LOC
엔터티: ▁Korea, 유형: I-LOC


# Tasks 2. Sentiment Analysis

- Implement a function called perform_sentiment_analysis(text) that takes a string text as input and performs sentiment analysis using a pre-trained model from Hugging Face.
- The function should return a dictionary containing the top 3 detected emotions and their corresponding scores.
- Write another function called get_top_emotions(sentiment_dict) that takes the dictionary returned by perform_sentiment_analysis(text) and returns a list of the top 3 emotions.

In [12]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 모델과 토크나이저 로드
model_name = "j-hartmann/emotion-english-distilroberta-base"  # 감정 분석 모델
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# 감정 분석 함수
def perform_sentiment_analysis(text):
    """
    텍스트에서 감정 분석을 수행하고 상위 3개의 감정과 점수를 반환합니다.
    """
    try:
        # 입력 텍스트 토큰화
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        
        # 긴 텍스트 잘림 확인
        if tokenizer.model_max_length and len(text) > tokenizer.model_max_length:
            print(f"Warning: Input text is longer than {tokenizer.model_max_length} tokens. It has been truncated.")

        # 모델 예측 수행
        outputs = model(**inputs)
        logits = outputs.logits

        # 소프트맥스 확률 계산
        probabilities = torch.softmax(logits, dim=-1)

        # 상위 3개 감정 추출
        topk = torch.topk(probabilities, k=3, dim=-1)
        top_indices = topk.indices[0].tolist()
        top_scores = topk.values[0].tolist()

        # 레이블 맵핑
        labels = model.config.id2label  # 모델에 정의된 감정 레이블 가져오기
        top_labels = [labels[idx] for idx in top_indices]

        # 결과 반환
        return [{"label": top_labels[i], "score": top_scores[i]} for i in range(3)]
    except Exception as e:
        print(f"Error during sentiment analysis: {e}")
        return []

# 테스트 실행
if __name__ == "__main__":
    # 테스트 
    test_texts = [
        "I will be happy working with BEN!",
        "I'm feeling very excited and joyful!",
    ]
    
    for text in test_texts:
        print(f"\nInput Text: {text}")
        results = perform_sentiment_analysis(text)
        print("Top 3 Emotions:")
        for res in results:
            print(f"Label: {res['label']}, Score: {res['score']:.4f}")



Input Text: I will be happy working with BEN!
Top 3 Emotions:
Label: joy, Score: 0.9877
Label: surprise, Score: 0.0051
Label: sadness, Score: 0.0026

Input Text: I'm feeling very excited and joyful!
Top 3 Emotions:
Label: joy, Score: 0.9933
Label: surprise, Score: 0.0024
Label: anger, Score: 0.0014


# Tasks 3. API Development

- Create a simple API using FastAPI that accepts a text input and returns the named entities along with their corresponding entity types and the detected emotions.
- Implement proper input validation and error handling in the API.
    - If the text field is missing or empty in the request body, return a 400 Bad Request response with an appropriate error message.
    - If an internal server error occurs during the processing of the request, return a 500 Internal Server Error response with an 
appropriate error message

In [20]:
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModelForSequenceClassification
import torch

In [24]:
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModelForSequenceClassification
import torch

# FastAPI 인스턴스 생성
app = FastAPI()

# NER 모델 및 토크나이저 로드
NER_MODEL_NAME = "xlm-roberta-large-finetuned-conll03-english"
ner_tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_NAME)
ner_model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_NAME)

# Sentiment Analysis 모델 및 토크나이저 로드
SENTIMENT_MODEL_NAME = "j-hartmann/emotion-english-distilroberta-base"
sentiment_tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL_NAME)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_NAME)

# 요청 데이터 형식 정의
class AnalyzeRequest(BaseModel):
    text: str

# NER 함수
def perform_ner(text):
    """
    텍스트에서 Named Entity를 추출합니다.
    """
    tokens = ner_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = ner_model(**tokens)
    predictions = torch.argmax(outputs.logits, dim=-1)
    labels = predictions[0].tolist()
    tokens = ner_tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])
    label_map = ner_model.config.id2label

    results = []
    for token, label_id in zip(tokens, labels):
        label = label_map[label_id]
        if label != "O":  # "O"는 엔터티가 아님
            results.append({"entity": token, "type": label})
    return results

# Sentiment Analysis 함수
def perform_sentiment_analysis(text):
    """
    텍스트에서 감정 분석을 수행하고 상위 3개의 감정을 반환합니다.
    """
    inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = sentiment_model(**inputs)
    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=-1)
    topk = torch.topk(probabilities, k=3, dim=-1)
    top_indices = topk.indices[0].tolist()
    top_scores = topk.values[0].tolist()

    labels = sentiment_model.config.id2label
    top_labels = [labels[idx] for idx in top_indices]

    return [{"label": top_labels[i], "score": top_scores[i]} for i in range(3)]

# API 엔드포인트 정의
@app.post("/analyze")
async def analyze_text(request: AnalyzeRequest):
    if not request.text.strip():
        raise HTTPException(status_code=400, detail="The 'text' field cannot be empty.")

    try:
        # Named Entity Recognition 수행
        entities = perform_ner(request.text)

        # Sentiment Analysis 수행
        sentiments = perform_sentiment_analysis(request.text)

        # 결과 반환
        return {
            "entities": entities,
            "emotions": [sentiment["label"] for sentiment in sentiments],
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")

# uvicorn script_name:app --reload


Some weights of the model checkpoint at xlm-roberta-large-finetuned-conll03-english were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
