In [4]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch
import numpy as np
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import warnings
warnings.filterwarnings("ignore")

In [5]:
topics_df = pd.read_csv("topics.csv")
opinions_df = pd.read_csv("opinions.csv")

# Göz atalım:
print("Topics:")
print(topics_df.head())

print("\nOpinions:")

Topics:
             id      topic_id  \
0  9704a709b505  007ACE74B050   
1  de347c859ab6  00944C693682   
2  c20937683442  00BD97EA4041   
3  9d03e2bef0ff  00C6E82FE5BA   
4  c2203a58aa5c  013B9AA6B9DB   

                                                text      type effectiveness  
0  On my perspective, I think that the face is a ...  Position      Adequate  
1  With so many things in this world that few peo...  Position     Effective  
2  No because, why should a computer know how you...  Position      Adequate  
3  I think that it wouldn't be valueable to have ...  Position      Adequate  
4  Well, some people believe that it was somethin...  Position      Adequate  

Opinions:


## Aşama 1 - Anlamsal Eşleştirme


In [6]:
model = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')

topic_embeddings = model.encode(topics_df['text'].tolist(), convert_to_tensor=True)
opinion_embeddings = model.encode(opinions_df['text'].tolist(), convert_to_tensor=True)

best_matches = []
for opinion_idx, opinion_embedding in enumerate(opinion_embeddings):
    cosine_scores = util.cos_sim(opinion_embedding, topic_embeddings)
    best_topic_idx = torch.argmax(cosine_scores).item()
    matched_topic_text = topics_df.iloc[best_topic_idx]['text']

    best_matches.append({
        "opinion_text": opinions_df.iloc[opinion_idx]['text'],
        "matched_topic_text": matched_topic_text,
        "similarity_score": cosine_scores[0][best_topic_idx].item()
    })

matches_df = pd.DataFrame(best_matches)
matches_df.head()

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Unnamed: 0,opinion_text,matched_topic_text,similarity_score
0,I think that the face is a natural landform be...,I believe that the Face on Mars is just a natu...,0.84795
1,"If life was on Mars, we would know by now. The...","On my perspective, I think that the face is a ...",0.749434
2,People thought that the face was formed by ali...,The face on mars is a natural landformation. P...,0.796797
3,"though some say that life on Mars does exist, ...",I believe there is life on mars.,0.82847
4,"It says in paragraph 7, on April 5, 1998, Mars...",The face on Mars is a natural landform,0.708627


## Aşama 2 - Sınıflandırma

In [7]:
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
labels = ["Claim", "Counterclaim", "Rebuttal", "Evidence"]

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


In [8]:
sample_texts = matches_df["opinion_text"].tolist()[:10]

classified_opinions = []
for text in sample_texts:
    result = classifier(text, labels)
    top_label = result['labels'][0]
    top_score = result['scores'][0]

    classified_opinions.append({
        "opinion_text": text,
        "predicted_label": top_label,
        "confidence": top_score
    })

classified_df = pd.DataFrame(classified_opinions)
classified_df.head()

Unnamed: 0,opinion_text,predicted_label,confidence
0,I think that the face is a natural landform be...,Counterclaim,0.284834
1,"If life was on Mars, we would know by now. The...",Evidence,0.409653
2,People thought that the face was formed by ali...,Claim,0.388607
3,"though some say that life on Mars does exist, ...",Counterclaim,0.391677
4,"It says in paragraph 7, on April 5, 1998, Mars...",Evidence,0.449056


In [9]:
classified_opinions = []

# Tüm görüşleri al
all_texts = matches_df["opinion_text"].tolist()

# Döngüyle sınıflandır
for i, text in enumerate(all_texts):
    result = classifier(text, labels)
    top_label = result['labels'][0]
    top_score = result['scores'][0]

    classified_opinions.append({
        "opinion_text": text,
        "predicted_label": top_label,
        "confidence": top_score
    })

    # Her 20 adımda bir durum bildir
    if i % 20 == 0:
        print(f"{i}/{len(all_texts)} classified...")

# DataFrame’e dök
classified_df = pd.DataFrame(classified_opinions)
classified_df.head()

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


0/27099 classified...
20/27099 classified...
40/27099 classified...
60/27099 classified...
80/27099 classified...
100/27099 classified...
120/27099 classified...
140/27099 classified...
160/27099 classified...
180/27099 classified...
200/27099 classified...
220/27099 classified...
240/27099 classified...
260/27099 classified...
280/27099 classified...
300/27099 classified...
320/27099 classified...
340/27099 classified...
360/27099 classified...
380/27099 classified...
400/27099 classified...
420/27099 classified...
440/27099 classified...
460/27099 classified...
480/27099 classified...
500/27099 classified...
520/27099 classified...
540/27099 classified...
560/27099 classified...
580/27099 classified...
600/27099 classified...
620/27099 classified...
640/27099 classified...
660/27099 classified...
680/27099 classified...
700/27099 classified...
720/27099 classified...
740/27099 classified...
760/27099 classified...
780/27099 classified...
800/27099 classified...
820/27099 classified..

Unnamed: 0,opinion_text,predicted_label,confidence
0,I think that the face is a natural landform be...,Counterclaim,0.284834
1,"If life was on Mars, we would know by now. The...",Evidence,0.409653
2,People thought that the face was formed by ali...,Claim,0.388607
3,"though some say that life on Mars does exist, ...",Counterclaim,0.391677
4,"It says in paragraph 7, on April 5, 1998, Mars...",Evidence,0.449056


In [15]:
classified_df.to_csv("classified_opinions.csv", index=False)

In [16]:
from google.colab import files
files.download("classified_opinions.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Aşama 3 - Conclusion Üretimi

In [10]:
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")

summarizer = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cuda:0


In [11]:
def generate_summary(topic, opinions):
    # Flan-T5, direkt prompt'u görevli cümleyle sever:
    prompt = f"Summarize social media opinions about this topic: {topic}. Opinions:\n{opinions}"

    output = summarizer(prompt, max_length=100, do_sample=False)
    return output[0]['generated_text']

In [12]:
sample_topic = matches_df.iloc[0]["matched_topic_text"]

relevant = classified_df[classified_df["opinion_text"].isin(
    matches_df[matches_df["matched_topic_text"] == sample_topic]["opinion_text"]
)]

opinions_text = "\n".join([f"{row['predicted_label']}: {row['opinion_text']}" for _, row in relevant.iterrows()])

conclusion = generate_summary(sample_topic, opinions_text)

print("📌 Topic:", sample_topic)
print("🧾 Conclusion:", conclusion)

Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


📌 Topic: I believe that the Face on Mars is just a natural landform 
🧾 Conclusion: This article it also states that the "Face on Mars" is really just a mesa .


In [14]:
conclusions = []

# Benzersiz topic'leri al
unique_topics = matches_df["matched_topic_text"].unique()

for topic in unique_topics:
    relevant = classified_df[classified_df["opinion_text"].isin(
        matches_df[matches_df["matched_topic_text"] == topic]["opinion_text"]
    )]

    # Limit the number of opinions to avoid OutOfMemoryError
    relevant = relevant.head(10)

    opinions_text = "\n".join([f"{row['predicted_label']}: {row['opinion_text']}" for _, row in relevant.iterrows()])

    # Check if there are any opinions to summarize
    if opinions_text:
        summary = generate_summary(topic, opinions_text)
    else:
        summary = "No opinions found for this topic."


    conclusions.append({
        "topic": topic,
        "conclusion": summary
    })

conclusion_df = pd.DataFrame(conclusions)
conclusion_df.head()

Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Unnamed: 0,topic,conclusion
0,I believe that the Face on Mars is just a natu...,"This article it also states that the ""Face on ..."
1,"On my perspective, I think that the face is a ...",I think that the face on Mars is a natural lan...
2,The face on mars is a natural landformation. P...,Thousands of anxious web surfers were waiting ...
3,I believe there is life on mars.,I believe there is life on Mars.
4,The face on Mars is a natural landform,The face on Mars is a natural landform.


In [17]:
final_opinions_df = pd.merge(classified_df, matches_df, on="opinion_text", how="left")
final_opinions_df = final_opinions_df[["opinion_text", "predicted_label", "matched_topic_text"]]
final_opinions_df.head()

Unnamed: 0,opinion_text,predicted_label,matched_topic_text
0,I think that the face is a natural landform be...,Counterclaim,I believe that the Face on Mars is just a natu...
1,"If life was on Mars, we would know by now. The...",Evidence,"On my perspective, I think that the face is a ..."
2,People thought that the face was formed by ali...,Claim,The face on mars is a natural landformation. P...
3,"though some say that life on Mars does exist, ...",Counterclaim,I believe there is life on mars.
4,"It says in paragraph 7, on April 5, 1998, Mars...",Evidence,The face on Mars is a natural landform


In [18]:
final_opinions_df.to_csv("opinions.csv", index=False)

In [19]:
conclusion_df.to_csv("conclusions.csv", index=False)