In [1]:
import pandas as pd

# Tab-separated result files to compare. The previews printed further down
# show each file carrying an 'original' column and a 'summary' column.
file_path_baseline = 'result_baseline.tsv'
file_path_ratio82 = 'result_ratio_8_2.tsv'

# Both columns are free text that later cells pass to CountVectorizer and
# edit_distance, which require str values. With pandas' defaults, text such as
# "NA", "null", "nan" or an empty field is parsed as float NaN, and
# numeric-looking text may be parsed as a number. dtype=str together with
# keep_default_na=False keeps every cell exactly as written in the file.
baseline_df = pd.read_csv(file_path_baseline, sep='\t', dtype=str, keep_default_na=False)
adjusted_df = pd.read_csv(file_path_ratio82, sep='\t', dtype=str, keep_default_na=False)

In [3]:
# Print the first rows of the baseline frame, then of the adjusted frame.
for preview_frame in (baseline_df, adjusted_df):
    print(preview_frame.head())

                         original             summary
0  describe the taste in one word                 not
1                 favorite coffee  my favorite coffee
2            yummy coconut flavor        great coffee
3                        so yummy         great candy
4    teeccino herbal coffee mocha        great coffee
                         original       summary
0  describe the taste in one word    not stevia
1                 favorite coffee  great coffee
2            yummy coconut flavor       love it
3                        so yummy          zotz
4    teeccino herbal coffee mocha  great coffee


In [2]:
# Rename the adjusted frame's 'summary' column so it cannot clash with the
# baseline frame's column of the same name.
adjusted_df = adjusted_df.rename(columns=dict(summary='summary_adjusted'))

# Place the renamed column next to the baseline columns. concat with a
# column axis pairs rows by index label.
merged_df = pd.concat(
    (baseline_df, adjusted_df['summary_adjusted']),
    axis='columns',
)

# Check the result.
print(merged_df.head())

                         original             summary summary_adjusted
0  describe the taste in one word                 not       not stevia
1                 favorite coffee  my favorite coffee     great coffee
2            yummy coconut flavor        great coffee          love it
3                        so yummy         great candy             zotz
4    teeccino herbal coffee mocha        great coffee     great coffee


In [None]:
#====================================================================#

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np

def calculate_cosine_similarity(vec1, vec2, chunk_size=100):
    """Return the cosine-similarity matrix between the rows of vec1 and vec2.

    Rows of ``vec1`` are processed ``chunk_size`` at a time so that only one
    block of distances is computed per ``pairwise_distances`` call.

    The previous version compared ``vec1[i:end]`` only with ``vec2[i:end]``
    and stacked the resulting square blocks. That equals the full matrix only
    while the input has at most ``chunk_size`` rows; beyond that the stacked
    rows had inconsistent meaning (and possibly inconsistent length), so
    taking ``np.diag`` of the result returned wrong pairs. Each chunk is now
    compared against all of ``vec2``.

    Args:
        vec1: 2-D feature matrix (dense or sparse), one row per document.
        vec2: 2-D feature matrix with the same number of columns as ``vec1``.
        chunk_size: Number of ``vec1`` rows handled per call.

    Returns:
        numpy.ndarray of shape ``(vec1.shape[0], vec2.shape[0])`` where entry
        ``[i, j]`` is ``1 - cosine_distance(vec1[i], vec2[j])``. The diagonal
        holds the similarity of row ``i`` of ``vec1`` with row ``i`` of
        ``vec2``. Note that the result is dense, so memory grows with the
        product of the two row counts.
    """
    n_rows = vec1.shape[0]
    row_blocks = [
        1 - pairwise_distances(vec1[start:start + chunk_size], vec2, metric='cosine')
        for start in range(0, n_rows, chunk_size)
    ]
    if not row_blocks:
        # No rows in vec1: return an empty matrix with the expected width.
        return np.empty((0, vec2.shape[0]))
    return np.vstack(row_blocks)

# Pull the original reviews, the summaries and the adjusted summaries out as lists.
# NOTE(review): these full-size lists are not read anywhere in this cell;
# they are kept in case other cells rely on them.
original_reviews = merged_df['original'].tolist()
summaries = merged_df['summary'].tolist()
adjusted_summaries = merged_df['summary_adjusted'].tolist()

# Shuffle the data and keep only a sample of it.
sample_size = 100  # adjust to the desired sample size
# DataFrame.sample raises ValueError when asked for more rows than the frame
# has (sampling without replacement), so never request more than exist.
merged_df_sampled = merged_df.sample(min(sample_size, len(merged_df)), random_state=42)

original_reviews_sampled = merged_df_sampled['original'].tolist()
summaries_sampled = merged_df_sampled['summary'].tolist()
adjusted_summaries_sampled = merged_df_sampled['summary_adjusted'].tolist()

# Turn the texts into count vectors. The vocabulary is fitted on all three
# sampled text lists together so the three matrices share the same columns.
vectorizer = CountVectorizer().fit(original_reviews_sampled + summaries_sampled + adjusted_summaries_sampled)
original_vectors_sampled = vectorizer.transform(original_reviews_sampled)
summary_vectors_sampled = vectorizer.transform(summaries_sampled)
adjusted_summary_vectors_sampled = vectorizer.transform(adjusted_summaries_sampled)

# Cosine similarity between "original" and "summary".
similarity_scores_summary = calculate_cosine_similarity(original_vectors_sampled, summary_vectors_sampled)

# Cosine similarity between "original" and "adjusted summary".
similarity_scores_adjusted = calculate_cosine_similarity(original_vectors_sampled, adjusted_summary_vectors_sampled)

# Per-review information loss = 1 - similarity of each review with its own
# summary (the diagonal of the similarity matrix). Values closer to 0 mean
# less loss; values closer to 1 mean more loss.
information_loss_summary = 1 - np.diag(similarity_scores_summary)
information_loss_adjusted = 1 - np.diag(similarity_scores_adjusted)

# Print the results.
print("Summary의 샘플 리뷰 평균 정보 손실:", information_loss_summary.mean())
print("Adjusted Summary의 샘플 리뷰 평균 정보 손실:", information_loss_adjusted.mean())


Summary의 샘플 리뷰 평균 정보 손실: 0.866460522502189
Adjusted Summary의 샘플 리뷰 평균 정보 손실: 0.8420345245045597


In [4]:
from nltk import edit_distance

def _distance_to_original(summary_column):
    """Build a row function returning edit_distance(original, row[summary_column])."""
    def _row_distance(row):
        return edit_distance(row['original'], row[summary_column])
    return _row_distance


# Edit distance between each original review and each summary variant.
for _summary_column, _distance_column in (
    ('summary', 'edit_distance_summary'),
    ('summary_adjusted', 'edit_distance_adjusted'),
):
    merged_df[_distance_column] = merged_df.apply(_distance_to_original(_summary_column), axis=1)

# Interpret edit distance as information loss (a larger distance is assumed
# to mean more loss), scaled by the largest distance seen in either column.
max_edit_distance = max(
    merged_df[_distance_column].max()
    for _distance_column in ('edit_distance_summary', 'edit_distance_adjusted')
)
for _distance_column, _loss_column in (
    ('edit_distance_summary', 'information_loss_summary'),
    ('edit_distance_adjusted', 'information_loss_adjusted'),
):
    merged_df[_loss_column] = merged_df[_distance_column] / max_edit_distance

# Print the results.
_mean_loss_summary = merged_df['information_loss_summary'].mean()
_mean_loss_adjusted = merged_df['information_loss_adjusted'].mean()
print("Summary의 전체 리뷰 평균 정보 손실:", _mean_loss_summary)
print("Adjusted Summary의 전체 리뷰 평균 정보 손실:", _mean_loss_adjusted)


Summary의 전체 리뷰 평균 정보 손실: 0.1892795329210602
Adjusted Summary의 전체 리뷰 평균 정보 손실: 0.16006387637817113
