In [None]:
import sys
import os

# Extend the import path so project-local modules can be imported from this
# notebook. Both entries are resolved relative to the current working
# directory, so they depend on where the kernel was started.
# First entry: the directory two levels above the working directory
# (presumably the repository root, so `scripts.*` resolves — TODO confirm).
sys.path.append(
    os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
)  # Adjust as needed
# Second entry: the `scripts` folder inside that same directory, added to the
# import path directly as well.
sys.path.append(
    os.path.abspath(os.path.join(os.getcwd(), "..", "..", "scripts"))
)  # Adjust as needed
import pandas as pd
import numpy as np
from scripts.my_text_cleaning import clean_dataframe
from scripts.parallel_topic_model import deduplicate_text_and_embeddings
from bertopic import BERTopic
from transformers import pipeline
from bertopic.representation import TextGeneration
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import MaximalMarginalRelevance
from nltk.corpus import stopwords

In [None]:
# Dataset to process. Names used in this notebook:
# "cop26_tweets_en", "covid_tweets_en", "ukraine_tweets_en".
# Assign exactly one value here; every path below is derived from it.
chosen_dataset = "ukraine_tweets_en"

# Raw tweets for the chosen dataset.
ur_df = pd.read_parquet("./../../data/raw/" + chosen_dataset + ".parquet")

# Per-document table: keep only the columns used below and make sure the
# topic id is an integer.
doc_info = pd.read_csv(
    "./../../data/processed/document_info_" + chosen_dataset + ".csv"
)[["Document", "Topic", "Representative_document", "Name"]].astype({"Topic": int})

# Per-topic table. A stray "Unnamed: 0" column (typically a saved index) is
# dropped when present; errors="ignore" makes this a no-op otherwise.
topic_info = pd.read_csv(
    "./../../data/processed/topic_info_" + chosen_dataset + ".csv"
).drop(columns=["Unnamed: 0"], errors="ignore")

# Pre-computed document embeddings stored next to the processed tables.
embeddings = np.load("./../../data/processed/" + chosen_dataset + ".parquet.npy")

# Clean the raw text with the same options every time so that the result lines
# up with the stored embeddings (assumes one embedding row per raw row — TODO confirm).
cln_df = clean_dataframe(
    ur_df,
    'text',
    phrases_to_remove=["&gt;", "&lt;", "&amp;", "RT : "],
    remove_empty=False,
    remove_urls=True,
    normalize_hashtags=True,
    normalize_mentions=True,
    user_placeholder="user",
    strip_punctuation=False,
    lowercase=False,
)

# Collapse duplicate documents together with their embeddings, keyed on the
# 'Cleantext' column (presumably produced by clean_dataframe — verify).
unique_docs, unique_embeddings = deduplicate_text_and_embeddings(
    cln_df, embeddings, 'Cleantext'
)

# Load the fitted topic model that belongs to the chosen dataset. This is the
# only model load in this cell: an earlier hard-coded load of the cop26 model
# was overwritten here before ever being used, so it has been removed.
topic_model = BERTopic.load(
    "./../../models/" + chosen_dataset + ".parquet.topic_model",
    embedding_model="all-mpnet-base-v2",
)

# Quick size check of the loaded artefacts.
print(f"{len(embeddings)=}")
print(f"{len(cln_df)=}")
print(f"{len(unique_docs)=}")
print(f"{len(unique_embeddings)=}")
print(f"{len(doc_info)=}")
print(cln_df.columns)

In [None]:
# Drop the references to both embedding arrays so their memory can be
# reclaimed; the names stay defined (as None) for any later cell.
embeddings = None
unique_embeddings = None

In [None]:
# Generic stopword lists from NLTK (English and Spanish).
stopwords_en = stopwords.words("english")
stopwords_es = stopwords.words("spanish")

# Extra stopwords per dataset. Only the dataset-defining terms are active.
# Further candidates that are deliberately left out for now:
#   covid_tweets_en:   pandemic, virus, people, get, like, one, new, cases,
#                      health, vaccine, vaccines, vaccinated, deaths, time,
#                      year, day, years
#   ukraine_tweets_en: war, russian, people, like, one, get, just, know, time,
#                      day, year, years, donbas, ukrainian, military,
#                      ukrainians, today
#   cop26_tweets_en:   climate, people, like, one, get, just, know, time, day,
#                      year, years, action, change, global, world, new, need
dataset_stopword_table = {
    "covid_tweets_en": ["covid", "covid19", "coronavirus"],
    "ukraine_tweets_en": ["ukraine", "russia"],
    "cop26_tweets_en": ["cop26"],
}
# Unknown dataset names simply get no extra stopwords.
data_specific_stopwords = dataset_stopword_table.get(chosen_dataset, [])

# URL fragments / HTML-entity leftovers, and tweet artefacts ("user" is the
# mention placeholder passed to clean_dataframe, "rt" marks retweets).
web_tokens = ["http", "https", "amp", "www", "com"]
tweet_tokens = ["user", 'rt']

custom_stopwords = set(
    stopwords_en
    + stopwords_es
    + web_tokens
    + tweet_tokens
    + data_specific_stopwords
)
print(custom_stopwords)

In [None]:
# Documents and their previously assigned topic ids, taken from the same
# table so the two lists share one row order.
docs_clean = doc_info["Document"].tolist()
topics = doc_info["Topic"].tolist()

# Unigrams and bigrams, with the custom stopword set filtered out.
vectorizer_model = CountVectorizer(
    stop_words=list(custom_stopwords),
    ngram_range=(1, 2),
)

# Register MMR as an additional representation aspect named "MMR".
representation_models = {"MMR": MaximalMarginalRelevance(diversity=0.7)}

# Recompute the topic representations for the existing topic assignment.
topic_model.update_topics(
    docs=docs_clean,
    topics=topics,
    vectorizer_model=vectorizer_model,
    representation_model=representation_models,
)


In [None]:
def _compose_topic_name(row):
    """Return '<topic id>_<kw1>_<kw2>...' built from a row's MMR keywords."""
    topic_id = str(row['Topic'])
    keyword_part = "_".join(row['MMR'])
    return topic_id + "_" + keyword_part


# Topic overview after the update, extended with a name derived from the
# MMR keywords of each topic.
new_topic_info = topic_model.get_topic_info()
new_topic_info['New_Name'] = new_topic_info.apply(_compose_topic_name, axis=1)

new_topic_info

In [None]:
# Attach the new MMR keywords and names to the stored topic table, matching
# rows on the topic id. Overlapping column names would get the
# "_old"/"_new" suffixes.
mmr_columns = new_topic_info[["Topic", 'MMR', "New_Name"]]
merged_topic_info = pd.merge(
    topic_info,
    mmr_columns,
    on="Topic",
    suffixes=("_old", "_new"),
)
merged_topic_info

In [None]:
# Write the merged topic table next to the other processed files, without
# the DataFrame index.
mmr_topic_info_path = (
    "./../../data/processed/topic_info_" + chosen_dataset + "_with_MMR.csv"
)
merged_topic_info.to_csv(mmr_topic_info_path, index=False)

### Label generation with an LLM

In [None]:
# Prompt template for LLM-based topic labelling; "[KEYWORDS]" is the
# placeholder for a topic's keywords.
# NOTE(review): this prompt is not passed to the Flan representation below;
# within this notebook it is only handed to the Zephyr TextGeneration later on.
prompt = "I have a topic described by the following keywords: [KEYWORDS]. Based on the previous keywords, what is this topic about?"

# Text2text pipeline around Flan-T5 (base), wrapped as a BERTopic
# representation model.
generator = pipeline(task="text2text-generation", model="google/flan-t5-base")
flan_model = TextGeneration(generator)


In [None]:
# Keyword aspect based on Maximal Marginal Relevance.
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# Two named aspects: MMR keywords and the Flan text-generation label.
representation_model = dict(MMR=mmr_model, Flan=flan_model)

In [None]:
# Plain English stopword filtering for this run (not the custom set).
vectorizer_model = CountVectorizer(stop_words="english")

# Topic ids come from doc_info while the documents come from unique_docs;
# assumes both are in the same order and of equal length — TODO confirm.
flan_topic_assignments = doc_info["Topic"].to_list()

topic_model.update_topics(
    docs=unique_docs,
    topics=flan_topic_assignments,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
)


In [None]:
# Print the generated label of every topic. At this point in the notebook the
# model has only been updated with the "MMR" and "Flan" aspects, so the "Flan"
# column is the one that exists; reading "Zephyr" here raised a KeyError on a
# top-to-bottom run because that aspect is only added further down.
for flan_label in topic_model.get_topic_info()["Flan"]:
    print(flan_label)

## Generation with Zephyr-7B-α
This attempt failed with an uncaught error raised inside `transformers.pipeline` (shown below),
possibly because the model was loaded through `ctransformers`.
The Hugging Face model itself is too big.
```python 
AttributeError: 'TextGenerationPipeline' object has no attribute 'assistant_model'
```


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Model and tokenizer are loaded from the same Hugging Face repository id.
zephyr_repo_id = "HuggingFaceH4/zephyr-7b-alpha"
model = AutoModelForCausalLM.from_pretrained(zephyr_repo_id)
tokenizer = AutoTokenizer.from_pretrained(zephyr_repo_id)

# Generation settings shared by every call made through this pipeline.
generation_settings = {"max_new_tokens": 50, "repetition_penalty": 1.1}
generator = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_settings,
)

# Smoke test: generate a short continuation.
print(generator("Once upon a time"))


In [None]:
# NOTE: this AutoModelForCausalLM comes from ctransformers and shadows the
# class of the same name imported from transformers in an earlier cell.
from ctransformers import AutoModelForCausalLM
from transformers import AutoTokenizer, pipeline
from bertopic.representation import MaximalMarginalRelevance, TextGeneration

# Options for loading the GGUF build (Q3_K_M file).
# gpu_layers is the number of layers to offload to the GPU; set it to 0 if no
# GPU acceleration is available on your system.
gguf_load_options = dict(
    model_file="zephyr-7b-alpha.Q3_K_M.gguf",
    model_type="mistral",
    gpu_layers=50,
    hf=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/zephyr-7B-alpha-GGUF", **gguf_load_options
)

# The tokenizer is taken from the original Zephyr repository.
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-alpha")


In [None]:
!pip install ctransformers[cuda]
!pip install --upgrade git+https://github.com/huggingface/transformers

from ctransformers import AutoModelForCausalLM

# from transformers import AutoTokenizer, pipeline
import transformers

# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/zephyr-7B-alpha-GGUF",
    model_file="zephyr-7b-alpha.Q4_K_M.gguf",
    model_type="mistral",
    gpu_layers=50,
    hf=True,
)
tokenizer = transformers.AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-alpha")

# Pipeline
generator = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    max_new_tokens=50,
    repetition_penalty=1.1,
)

In [None]:
# Text-generation aspect driven by the Zephyr pipeline and the labelling
# prompt defined earlier in the notebook.
zephyr = TextGeneration(generator, prompt=prompt)

# Keyword aspect based on Maximal Marginal Relevance.
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# Two named aspects: MMR keywords and the Zephyr-generated label.
representation_model = dict(MMR=mmr_model, Zephyr=zephyr)


In [None]:
# Recompute the representations with the MMR + Zephyr aspects.
# Topic ids come from doc_info while the documents come from unique_docs;
# assumes both are in the same order and of equal length — TODO confirm.
zephyr_topic_assignments = doc_info["Topic"].to_list()
topic_model.update_topics(
    docs=unique_docs,
    topics=zephyr_topic_assignments,
    representation_model=representation_model,
)
