In [None]:
# 1. Mount Google Drive into the Colab runtime at /content/drive
from google.colab import drive
drive.mount('/content/drive')


In [None]:
!pip install https://huggingface.co/turkish-nlp-suite/tr_core_news_trf/resolve/main/tr_core_news_trf-1.0-py3-none-any.whl

# **TOPIC MODELLING**

In [None]:
import pandas as pd
import gensim
from gensim import corpora
from gensim.models.ldamodel import LdaModel
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import re
from datetime import datetime
import os
import spacy
from spacy.lang.tr.stop_words import STOP_WORDS
import multiprocessing


In [None]:
# Use the "spawn" start method for worker processes; force=True overrides a
# start method that was already set.
multiprocessing.set_start_method("spawn", force=True)
# Ask spaCy for the GPU before the pipeline is loaded.
spacy.require_gpu()
print("GPU kullanılıyor mu?", spacy.prefer_gpu())

# Turkish transformer pipeline (installed by the pip cell above); bound to the global `nlp`.
nlp = spacy.load("tr_core_news_trf")
# NOTE(review): no NLTK tokenizer call is visible in this notebook — confirm 'punkt' is still needed.
nltk.download('punkt')


In [None]:
def load_data(file_path, sheet_name=0):
    """Read one worksheet of an Excel workbook.

    Args:
        file_path: Path of the workbook, passed to ``pandas.read_excel``.
        sheet_name: Sheet selector forwarded to ``pandas.read_excel``
            (defaults to ``0``).

    Returns:
        Whatever ``pandas.read_excel`` returns for the requested sheet.
    """
    frame = pd.read_excel(file_path, sheet_name=sheet_name)
    return frame

data_path = 'data/tweet_dataset.xlsx'

if os.path.exists(data_path):
    print("Veri dosyası mevcut.")
else:
    # Imported lazily: google.colab is only needed when an upload is required.
    from google.colab import files
    print("Lütfen Excel dosyanızı yükleyin.")
    uploaded = files.upload()
    # The target folder may not exist yet; os.rename fails if it is missing.
    os.makedirs(os.path.dirname(data_path), exist_ok=True)
    for filename in uploaded.keys():
        # Every uploaded file is moved onto the same path, so with several
        # uploads the last one processed is the one that remains.
        os.rename(filename, data_path)


In [None]:
# Load the tweets and index them by their timestamp.
df = load_data(data_path)
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')

# Keep the years 2022 through 2024 (inclusive).
filtered_df = df.loc[
    lambda frame: (frame.index.year >= 2022) & (frame.index.year <= 2024)
]

# One group per month of the date index.
monthly_groups = filtered_df.groupby(pd.Grouper(freq='M'))

print("Veri hazırlandı ve aylık gruplandı.")


In [None]:
def preprocess_parallel(texts, batch_size=100, n_process=4):
    """Lemmatise texts with the global spaCy pipeline and drop noise tokens.

    A token is kept when it is purely alphabetic, its lemma is longer than one
    character, and its lower-cased lemma is not a Turkish stop word.

    Args:
        texts: Iterable of raw strings handed to ``nlp.pipe``.
        batch_size: Batch size forwarded to ``nlp.pipe``.
        n_process: Number of worker processes forwarded to ``nlp.pipe``.

    Returns:
        A list containing one list of lemma strings per processed document.
    """
    # Imported locally under its own alias: at this point of the notebook the
    # module-level name `stopwords` is the NLTK corpus loader from the import
    # cell (not a collection of words), and `STOP_WORDS` is rebound to the
    # German list by a later cell.
    from spacy.lang.tr.stop_words import STOP_WORDS as turkish_stop_words

    processed = []
    for doc in nlp.pipe(texts, batch_size=batch_size, n_process=n_process):
        processed.append([
            token.lemma_
            for token in doc
            if token.is_alpha
            and len(token.lemma_) > 1
            # The spaCy stop-word entries are lower-case, so compare in lower case.
            and token.lemma_.lower() not in turkish_stop_words
        ])
    return processed


In [None]:
def perform_topic_modeling(processed_texts, num_topics=5, passes=15, iterations=400, chunksize=2000):
    """Fit an LDA model on tokenised documents.

    Args:
        processed_texts: Sequence of token lists, one per document.
        num_topics: Number of topics passed to ``LdaModel``.
        passes: Number of passes passed to ``LdaModel``.
        iterations: Iteration limit passed to ``LdaModel``.
        chunksize: Chunk size passed to ``LdaModel``.

    Returns:
        ``(lda_model, corpus, dictionary)``, or ``(None, None, None)`` when the
        dictionary built from ``processed_texts`` contains no tokens.
    """
    dictionary = corpora.Dictionary(processed_texts)
    # Nothing to model when no token survived preprocessing.
    if not len(dictionary):
        return None, None, None

    # Bag-of-words representation of every document.
    corpus = list(map(dictionary.doc2bow, processed_texts))

    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=passes,
        iterations=iterations,
        chunksize=chunksize,
        update_every=1,
        alpha='auto',
        per_word_topics=True,
        random_state=100,  # fixed seed
    )
    return lda_model, corpus, dictionary


# **TOPIC MODELLING GÖRSELLEŞTİRME**

In [None]:
import os
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from gensim.models.ldamodel import LdaModel
from itertools import combinations


In [None]:
def extract_topics(model, prefix, num_topics=100, num_words=10):
    """Return a word -> weight mapping for each topic of an LDA model.

    Topic ids are prefixed (``"<prefix>_T<id>"``) so that topics coming from
    different models stay unique when merged into one dictionary.

    The weights are read through ``show_topics(..., formatted=False)`` instead
    of parsing the text produced by ``print_topics``: the formatted text
    rounds every weight to three decimals and cannot be split reliably when a
    word contains ``+`` or ``*``.

    Args:
        model: Object exposing gensim's ``show_topics`` method.
        prefix: String prepended to every topic id.
        num_topics: Number of topics to request from the model.
        num_words: Number of top words requested per topic.

    Returns:
        ``{"<prefix>_T<id>": {word: weight, ...}, ...}`` with ``float`` weights.
    """
    topics = model.show_topics(num_topics=num_topics, num_words=num_words, formatted=False)
    return {
        f"{prefix}_T{tid}": {word: float(weight) for word, weight in word_weights}
        for tid, word_weights in topics
    }

def cosine_sim(d1, d2):
    """Cosine similarity of two word -> weight dictionaries.

    Words missing from one dictionary count as weight 0. Returns ``0.0`` when
    either vector has zero length.
    """
    vocabulary = set(d1).union(d2)
    left = np.array([d1.get(word, 0) for word in vocabulary])
    right = np.array([d2.get(word, 0) for word in vocabulary])

    denominator = np.linalg.norm(left) * np.linalg.norm(right)
    if not denominator:
        return 0.0
    return np.dot(left, right) / denominator

def build_graph(topics, threshold=0.3):
    """Build an undirected topic-similarity graph.

    Args:
        topics: ``{topic_id: {word: weight}}`` mapping.
        threshold: An edge is added only when the cosine similarity of two
            topics is strictly greater than this value.

    Returns:
        A ``networkx.Graph`` with one node per topic id; every edge stores the
        similarity of its two topics in the ``weight`` attribute.
    """
    graph = nx.Graph()
    graph.add_nodes_from(topics.keys())
    for left, right in combinations(topics, 2):
        score = cosine_sim(topics[left], topics[right])
        if not score > threshold:
            continue
        graph.add_edge(left, right, weight=score)
    return graph


In [None]:
# Directory holding the saved monthly LDA models. The training cell of this
# notebook writes to "outputs/lda_models", so fall back to that folder when
# the default location does not exist.
models_dir = "models/lda"
if not os.path.isdir(models_dir):
    models_dir = "outputs/lda_models"

for fname in sorted(os.listdir(models_dir)):
    # Only files ending in ".model" are loaded.
    if not fname.endswith(".model"):
        continue
    # "lda_2023_01.model" -> "2023_01"
    period = fname.replace("lda_", "").replace(".model", "")
    mdl = LdaModel.load(os.path.join(models_dir, fname))
    tp = extract_topics(mdl, prefix=period)
    G = build_graph(tp, threshold=0.3)

    # A graph without nodes has no dominant topic; max() would raise on it.
    if G.number_of_nodes() == 0:
        print(f"{period}: Konu bulunamadı.")
        continue

    # Dominant topic = node with the highest degree centrality
    cent = nx.degree_centrality(G)
    dom = max(cent, key=cent.get)
    print(f"{period}: Dominant Topic = {dom} (score={cent[dom]:.3f})")

    # Visualise; edge width scales with the similarity stored on the edge
    plt.figure(figsize=(8,6))
    pos = nx.spring_layout(G, seed=42, k=0.5)
    widths = [2*d['weight'] for _,_,d in G.edges(data=True)]
    nx.draw(G, pos, with_labels=True, node_size=300, width=widths)
    plt.title(f"{period} Topic Graph")
    plt.show()


In [None]:
# Collect the topics of every saved period into one mapping
all_topics = {}
for fname in sorted(os.listdir(models_dir)):
    if not fname.endswith(".model"):
        continue
    p = fname.replace("lda_", "").replace(".model", "")
    m = LdaModel.load(os.path.join(models_dir, fname))
    all_topics.update(extract_topics(m, prefix=p))

# Global network across all periods
G_global = build_graph(all_topics, threshold=0.3)

# The two topics with the highest degree centrality
cent_glob = nx.degree_centrality(G_global)
top2 = sorted(cent_glob.items(), key=lambda item: item[1], reverse=True)[:2]

print("== Global Dominant Topics ==")
for tid, score in top2:
    print(f"- {tid}: score={score:.3f}")

# Draw the global graph
plt.figure(figsize=(10,8))
pos = nx.spring_layout(G_global, seed=42, k=0.3)
nx.draw(G_global, pos, width=0.5, node_size=50, with_labels=False)
plt.title("Global Topic Network")
plt.show()


In [None]:
output_dir = 'outputs/lda_models'
os.makedirs(output_dir, exist_ok=True)

# Train and persist one LDA model per monthly group.
for month, group in monthly_groups:
    month_str = month.strftime('%Y_%m')
    print(f"\n--- {month_str} ---")

    # Tweets of this month, without missing values
    texts = group['tweet'].dropna().tolist()
    if len(texts) == 0:
        print("Metin bulunamadı.")
        continue

    processed = preprocess_parallel(texts)
    lda_model, corpus, dictionary = perform_topic_modeling(processed, num_topics=100)

    # perform_topic_modeling signals an empty dictionary with None
    if lda_model is None:
        print("Yetersiz veri.")
        continue

    print("Konular:")
    topic_lines = lda_model.print_topics(num_topics=100, num_words=10)
    for idx, topic in topic_lines:
        print(f"Konu #{idx+1}: {topic}")

    # Persist model, dictionary and bag-of-words corpus side by side
    model_path = os.path.join(output_dir, f'lda_{month_str}.model')
    dict_path = os.path.join(output_dir, f'dict_{month_str}.dict')
    corpus_path = os.path.join(output_dir, f'corpus_{month_str}.mm')
    lda_model.save(model_path)
    dictionary.save(dict_path)
    corpora.MmCorpus.serialize(corpus_path, corpus)


# **EN ÇOK KULLANILAN KELİMELER VE ONLARLA BERABER EN ÇOK KULLANILAN KELİMELER**

In [None]:
import pandas as pd
import spacy
from spacy.lang.de.stop_words import STOP_WORDS
from collections import Counter

# German spaCy model; rebinds the global `nlp` (previously the Turkish pipeline)
nlp = spacy.load("de_core_news_sm")


In [None]:
# Lower-case German seed terms, grouped by theme
keyword_groups = dict(
    turk=["türkisch", "türkei"],
    migration=["abwanderung", "auswanderung", "migration", "immigrant", "einwanderer"],
    syrien=["syrien", "syrisch", "syrer"],
)

# One independent Counter per group for tallying co-occurring words
group_counters = {}
for group_name in keyword_groups.keys():
    group_counters[group_name] = Counter()


In [None]:
# NOTE(review): this cell repeats the previous cell verbatim; running it resets
# every counter to empty. No cell visible in this section fills
# `group_counters` before they are exported — confirm whether the counting
# step is missing here.
keyword_groups = {
    "turk":    ["türkisch", "türkei"],
    "migration": ["abwanderung", "auswanderung", "migration", "immigrant", "einwanderer"],
    "syrien":  ["syrien", "syrisch", "syrer"]
}

# One Counter per group for tallying the words that co-occur with its keywords
group_counters = {grp: Counter() for grp in keyword_groups}


In [None]:
output_file = "outputs/keyword_context.xlsx"
# Create the target folder first, as the other export cells of this notebook
# do; the writer does not create missing directories.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

with pd.ExcelWriter(output_file) as writer:
    for grp, counter in group_counters.items():
        # The 25 most frequent context words of this group, one sheet per group
        top25 = counter.most_common(25)
        pd.DataFrame(top25, columns=['context_word', 'frequency']) \
          .to_excel(writer, sheet_name=grp, index=False)

print(f"Kaydedildi: {output_file}")


# **TR HATESPEECH EMOTION**

In [None]:
import os
import warnings
import locale
import pandas as pd
import spacy
import torch
from transformers import pipeline, logging

# Environment and warning configuration
os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "60"
# Report UTF-8 as the preferred encoding. The replacement keeps the optional
# `do_setlocale` argument of the real function so that callers using
# `locale.getpreferredencoding(False)` do not fail with a TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"
warnings.filterwarnings("ignore")
logging.set_verbosity_error()

# Require the GPU for spaCy when CUDA is available
if torch.cuda.is_available():
    spacy.require_gpu()


In [None]:
nlp = spacy.load("tr_core_news_trf")
nlp.disable_pipes("ner", "parser")  # switch off components that are not needed

data_path = "data/turkish_tweets.xlsx"
df = pd.read_excel(data_path)

# Lemmatise every tweet, keeping alphabetic tokens only
preprocessed_texts = []
for doc in nlp.pipe(df['tweet'].astype(str), batch_size=32, n_process=1):
    lemmas = (token.lemma_ for token in doc if token.is_alpha)
    preprocessed_texts.append(" ".join(lemmas))


In [None]:
# Use the first CUDA device when one is available and fall back to the CPU
# (-1) otherwise, so the cell also runs on a machine without a GPU. This
# mirrors the conditional GPU setup of the configuration cell.
pipeline_device = 0 if torch.cuda.is_available() else -1

# Turkish emotion classifier; model and tokenizer come from the same checkpoint
emotion_pipeline = pipeline(
    "text-classification",
    model="maymuni/bert-base-turkish-cased-emotion-analysis",
    tokenizer="maymuni/bert-base-turkish-cased-emotion-analysis",
    truncation=True,
    framework="pt",
    device=pipeline_device,
    batch_size=32
)

# Turkish toxic-language classifier, configured the same way
hate_pipeline = pipeline(
    "text-classification",
    model="barandinho/distilbert-base-turkish-cased-toxic-lang",
    tokenizer="barandinho/distilbert-base-turkish-cased-toxic-lang",
    truncation=True,
    framework="pt",
    device=pipeline_device,
    batch_size=32
)


In [None]:
# Classify all lemmatised tweets with both pipelines
emotion_results = emotion_pipeline(preprocessed_texts, batch_size=32)
hate_results = hate_pipeline(preprocessed_texts, batch_size=32)

# Spread label and score of every prediction over DataFrame columns
for results, label_col, score_col in (
    (emotion_results, "emotion_label", "emotion_score"),
    (hate_results, "hate_speech_label", "hate_speech_score"),
):
    df[label_col] = [prediction["label"] for prediction in results]
    df[score_col] = [prediction["score"] for prediction in results]


In [None]:
output_path = "outputs/tr_emotion_hate_results.xlsx"

# Make sure the parent folder exists before writing
os.makedirs(os.path.split(output_path)[0], exist_ok=True)

df.to_excel(output_path, index=False)
print(f"Sonuçlar kaydedildi: {output_path}")


# **DE HATESPEECH EMOTION**

In [None]:
import pandas as pd
import spacy
from transformers import pipeline

# Load the German spaCy model (rebinds the global `nlp`)
nlp = spacy.load("de_core_news_sm")


In [None]:
def preprocess_text(text):
    """Return ``text`` as a space-separated string of its token lemmas.

    The text is run through the global ``nlp`` pipeline; every token is kept
    (no filtering) and replaced by its ``lemma_``.
    """
    lemmas = (token.lemma_ for token in nlp(text))
    return " ".join(lemmas)


In [None]:
# German hate-speech classifier; model and tokenizer come from the same checkpoint
hate_pipeline = pipeline(
    task="text-classification",
    tokenizer="chrisrtt/gbert-multi-class-german-hate",
    model="chrisrtt/gbert-multi-class-german-hate",
    truncation=True,
)

def predict_hate_speech(text):
    """Classify one text with the German hate-speech pipeline.

    The text is lemmatised with ``preprocess_text`` and the first prediction
    of ``hate_pipeline`` is used.

    Returns:
        ``(label, score)`` of the prediction, or ``("error", None)`` when
        preprocessing or classification raises an ordinary exception.
    """
    try:
        processed = preprocess_text(text)
        result = hate_pipeline(processed)[0]
        return result["label"], result["score"]
    except Exception:
        # Best-effort per row, but narrower than a bare `except:` so that
        # KeyboardInterrupt and SystemExit can still stop a long-running batch.
        return "error", None


In [None]:
# German emotion classifier; model and tokenizer come from the same checkpoint
emotion_pipeline = pipeline(
    task="text-classification",
    tokenizer="visegradmedia-emotion/Emotion_RoBERTa_german6_v7",
    model="visegradmedia-emotion/Emotion_RoBERTa_german6_v7",
    truncation=True,
)

def predict_emotion(text):
    """Classify one text with the German emotion pipeline.

    The text is lemmatised with ``preprocess_text`` and the first prediction
    of ``emotion_pipeline`` is used.

    Returns:
        ``(label, score)`` of the prediction, or ``("error", None)`` when
        preprocessing or classification raises an ordinary exception.
    """
    try:
        processed = preprocess_text(text)
        result = emotion_pipeline(processed)[0]
        return result["label"], result["score"]
    except Exception:
        # Best-effort per row, but narrower than a bare `except:` so that
        # KeyboardInterrupt and SystemExit can still stop a long-running batch.
        return "error", None


In [None]:
data_path = "data/german_tweets.xlsx"
df = pd.read_excel(data_path)

def classify_row(text):
    """Return hate label/score followed by emotion label/score as a Series."""
    hate_prediction = predict_hate_speech(text)
    emotion_prediction = predict_emotion(text)
    return pd.Series([*hate_prediction, *emotion_prediction])

# Column order matches the order of the values produced by classify_row
result_columns = ['hate_speech_label', 'hate_speech_score', 'emotion_label', 'emotion_score']
df[result_columns] = df['tweet'].apply(classify_row)


In [None]:
# Generic model labels -> emotion names.
# NOTE(review): the LABEL_n -> emotion order is taken as given here; confirm
# it against the documentation of the emotion model.
label_mapping = {
    "LABEL_0": "anger",
    "LABEL_1": "fear",
    "LABEL_2": "disgust",
    "LABEL_3": "sadness",
    "LABEL_4": "joy",
    "LABEL_5": "none"
}

# `replace` leaves values without a mapping untouched (for example the
# "error" marker returned by predict_emotion). `map` turned those into NaN,
# and turned the whole column into NaN when this cell was run a second time.
df["emotion_label"] = df["emotion_label"].replace(label_mapping)


In [None]:
output_path = "outputs/de_emotion_hate_results.xlsx"

# Make sure the parent folder exists before writing
os.makedirs(os.path.split(output_path)[0], exist_ok=True)

df.to_excel(output_path, index=False)
print(f"Sonuçlar kaydedildi: {output_path}")


# **KELİME FREKANSLARI**

In [None]:
import pandas as pd
import spacy
from collections import Counter
import warnings
import os

warnings.filterwarnings("ignore")
spacy.prefer_gpu()

# Load the Turkish transformer-based pipeline
nlp = spacy.load("tr_core_news_trf")

# Stop-word collection. A set gives constant-time lookups for the per-token
# `not in stopwords` checks below; a list is scanned linearly each time.
from spacy.lang.tr.stop_words import STOP_WORDS
stopwords = set(STOP_WORDS)


In [None]:
df = pd.read_excel("data/turkish_tweets.xlsx")
df['date'] = pd.to_datetime(df['date'], errors='coerce')
# Rows whose date could not be parsed cannot be assigned to a month.
df = df.dropna(subset=['date'])
df['year_month'] = df['date'].dt.strftime('%Y-%m')

tweets = df['tweet'].astype(str).tolist()
year_months = df['year_month'].tolist()

# One token list per month, filled while streaming the tweets through spaCy
monthly_tokens = {ym: [] for ym in set(year_months)}

for ym, doc in zip(year_months, nlp.pipe(tweets, batch_size=50)):
    tokens = [token.lemma_.lower() for token in doc if token.is_alpha and token.lemma_.lower() not in stopwords]
    monthly_tokens[ym].extend(tokens)

# 50 most frequent lemmas of every month
monthly_top50 = {
    ym: Counter(tokens).most_common(50) for ym, tokens in monthly_tokens.items()
}

output_path = "outputs/tr_top50_by_month.xlsx"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# NOTE(review): the original cell broke off in the middle of the makedirs call
# above. The export below is a reconstruction (one sheet per month, same
# column names as the keyword-context export) — confirm the intended layout.
if monthly_top50:
    with pd.ExcelWriter(output_path) as writer:
        for ym in sorted(monthly_top50):
            pd.DataFrame(monthly_top50[ym], columns=["Kelime", "Frekans"]) \
              .to_excel(writer, sheet_name=ym, index=False)


In [None]:
keyword_groups = {
    "suriye": ["suriye", "suriyeli"],
    "goc": ["göç", "göçmen", "göçmenler"],
    "alman": ["alman", "almanlar", "almanya"]
}

group_counters = {group: Counter() for group in keyword_groups}

# Number of tokens inspected on each side of a keyword hit.
# NOTE(review): the original cell was cut off right after the innermost `for`
# line, so the window size and the counting rule below are a reconstruction —
# confirm them against the intended analysis.
CONTEXT_WINDOW = 5

df = pd.read_excel("data/turkish_tweets.xlsx")
tweets = df["tweet"].astype(str).tolist()

for doc in nlp.pipe(tweets, batch_size=50, n_process=2):
    tokens = list(doc)
    for i, token in enumerate(tokens):
        token_text = token.text.lower()
        for group, keywords in keyword_groups.items():
            if token_text not in keywords:
                continue
            # Tokens around the hit, excluding the keyword itself
            neighbours = tokens[max(0, i - CONTEXT_WINDOW):i] + tokens[i + 1:i + 1 + CONTEXT_WINDOW]
            group_counters[group].update(
                neighbour.lemma_.lower()
                for neighbour in neighbours
                if neighbour.is_alpha and neighbour.lemma_.lower() not in stopwords
            )


In [None]:
output_path = "outputs/tr_keyword_context.xlsx"
# Create the target folder first, as the other export cells of this notebook
# do; the writer does not create missing directories.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with pd.ExcelWriter(output_path) as writer:
    for group, counter in group_counters.items():
        # The 25 most frequent context words of this group, one sheet per group
        df_out = pd.DataFrame(counter.most_common(25), columns=["Kelime", "Frekans"])
        df_out.to_excel(writer, sheet_name=group, index=False)

print(f"Anahtar kelime bağlam analizleri kaydedildi: {output_path}")


In [None]:
df = pd.read_excel("outputs/tr_emotion_hate_results.xlsx")

# Relabel 'surpriz' as 'şaşkın'; every other label stays as it is
surprise_rows = df['emotion_label'].eq('surpriz')
df.loc[surprise_rows, 'emotion_label'] = 'şaşkın'

df.to_excel("outputs/tr_emotion_hate_results_updated.xlsx", index=False)
print("Etiket düzeltmeleri yapıldı ve dosya güncellendi.")


# **ZAMAN SERİSİ ANALİZİ**

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# Input file
# NOTE(review): no cell visible in this notebook writes this file — confirm
# where it is produced.
input_path = "outputs/Hate_Emotion_Analysis_Final.xlsx"
df = pd.read_excel(input_path)

# Parse the date column and keep rows from 2023-06-01 onwards. `.copy()` makes
# the filtered frame independent, so the column added below is not written to
# a slice of the original frame (avoids pandas' SettingWithCopyWarning).
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df = df[df['date'] >= '2023-06-01'].copy()
df['year_month'] = df['date'].dt.to_period('M').astype(str)


In [None]:
def _monthly_label_share(frame, label_col):
    """Percentage of each value of ``label_col`` within every ``year_month``."""
    shares = frame.groupby('year_month')[label_col].value_counts(normalize=True)
    return shares.rename('percentage').mul(100).reset_index()

# Hate-speech percentages
hate_groups = _monthly_label_share(df, 'hate_speech_label')

# Emotion percentages
emotion_groups = _monthly_label_share(df, 'emotion_label')


In [None]:
# Output folders for the two plot families
for plot_dir in ("plots/hate_speech", "plots/emotion"):
    os.makedirs(plot_dir, exist_ok=True)


In [None]:
# One line chart per hate-speech category
for cat in hate_groups['hate_speech_label'].unique():
    # Rows of this category, sorted by year_month
    data = hate_groups[hate_groups['hate_speech_label'] == cat].sort_values('year_month')

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(data['year_month'], data['percentage'], marker='o')
    ax.set_title(f"Hate Speech Oranı: '{cat}'")
    ax.set_xlabel("Yıl-Ay")
    ax.set_ylabel("Yüzde (%)")
    plt.xticks(rotation=45)
    ax.grid(True)
    fig.tight_layout()

    fig.savefig(f"plots/hate_speech/hate_{cat}.png")
    # Close the figure so figures do not pile up across iterations
    plt.close(fig)


In [None]:
# One line chart per emotion category
for cat in emotion_groups['emotion_label'].unique():
    # Rows of this category, sorted by year_month
    data = emotion_groups[emotion_groups['emotion_label'] == cat].sort_values('year_month')

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(data['year_month'], data['percentage'], marker='o')
    ax.set_title(f"Duygu Oranı: '{cat}'")
    ax.set_xlabel("Yıl-Ay")
    ax.set_ylabel("Yüzde (%)")
    plt.xticks(rotation=45)
    ax.grid(True)
    fig.tight_layout()

    fig.savefig(f"plots/emotion/emotion_{cat}.png")
    # Close the figure so figures do not pile up across iterations
    plt.close(fig)
