In [None]:
!pip install pandas gensim nltk


In [None]:
pip install pyLDAvis


In [None]:
pip install langchain-openai

In [5]:
pip install langchain

  and should_run_async(code)


Collecting langchain
  Downloading langchain-0.2.0-py3-none-any.whl (973 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m973.7/973.7 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.6-py3-none-any.whl (28 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.0-py3-none-any.whl (23 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain)
  Downloading marshmallow-3.21.2-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.3/49.3 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain)
  Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain)
  Downloading mypy_extens

In [7]:
import pandas as pd
import gensim
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import TfidfModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from typing import List, Tuple, Dict
import pyLDAvis
from pyLDAvis import gensim
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import LLMChain
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning, module="ipykernel.ipkernel")
from openai import OpenAI
import json
from google.colab import userdata #we need this to get our openai secret key from the enviroment variables




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
def load_data(
    usernames_path: str = 'usernames.csv',
    comments_path: str = 'user_comments.csv',
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load the usernames and comments tables from CSV files.

    Parameters:
        usernames_path (str): Location of the usernames CSV. Defaults to
            'usernames.csv' in the current working directory.
        comments_path (str): Location of the comments CSV. Defaults to
            'user_comments.csv' in the current working directory.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: ``(usernames_df, comments_df)``,
        in that order.

    Raises:
        FileNotFoundError: If either file does not exist (raised by pandas).
    """
    usernames_df = pd.read_csv(usernames_path)
    comments_df = pd.read_csv(comments_path)
    return usernames_df, comments_df

In [9]:
# Load both CSVs into notebook-level frames; comments_df is reused by the labelling cells further down.
usernames_df, comments_df = load_data()

In [10]:
def preprocess_text(text: str) -> List[str]:
    """Turn one raw comment into a list of cleaned, lemmatized tokens.

    The text is lower-cased and tokenized with NLTK's ``word_tokenize``.
    Tokens that are English stop words or are not purely alphabetic are
    dropped, and the remaining tokens are lemmatized with ``WordNetLemmatizer``.

    Parameters:
        text (str): The raw comment text.

    Returns:
        List[str]: The lemmatized tokens, in their original order. An empty
        list is returned when ``text`` is not a string.
    """
    # Guard before touching the text: a non-string value (for example the float
    # NaN pandas produces for an empty CSV cell) has no .lower() and would crash.
    if not isinstance(text, str):
        return []

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text.lower())
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words and token.isalpha()
    ]


In [12]:

def create_dictionary_corpus_with_tfidf(df: pd.DataFrame) -> tuple:
    """Build a gensim dictionary and a TF-IDF weighted corpus from the comments.

    Parameters:
        df (pd.DataFrame): Frame with a 'comments' column; every value is
            passed through ``preprocess_text``.

    Returns:
        tuple: ``(dictionary, tfidf_corpus)`` where ``dictionary`` is the
        ``corpora.Dictionary`` built from the tokenized comments and
        ``tfidf_corpus`` is the bag-of-words corpus wrapped by a ``TfidfModel``
        fitted on that same corpus.
    """
    tokenised_docs = df['comments'].apply(preprocess_text)
    vocabulary = corpora.Dictionary(tokenised_docs)

    # One bag-of-words vector per comment, in the frame's row order.
    bow_corpus = list(map(vocabulary.doc2bow, tokenised_docs))

    # Fit TF-IDF on the raw counts and apply it to the same corpus.
    return vocabulary, TfidfModel(bow_corpus)[bow_corpus]


In [13]:
def build_lda_model(corpus, dictionary, num_topics=3, passes=10, random_state=42) -> LdaModel:
    """Train an LDA topic model on the given corpus.

    Parameters:
        corpus: Iterable of bag-of-words documents to train on.
        dictionary: Mapping from token ids to words, passed as ``id2word``.
        num_topics (int): Number of topics to fit. Defaults to 3.
        passes (int): Number of training passes over the corpus. Defaults to 10.
        random_state (int): Seed handed to the model so repeated runs use the
            same random initialisation. Defaults to 42.

    Returns:
        LdaModel: The trained model.
    """
    return LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )


In [16]:
def assign_topics_to_documents(df: pd.DataFrame, dictionary, lda_model) -> pd.DataFrame:
    """Add a 'topic' column holding each comment's most probable topic id.

    NOTE: ``df`` is modified in place (the 'topic' column is written onto it)
    and the same object is returned.
    NOTE(review): a later cell defines another ``assign_topics_to_documents``
    with a different signature; after that cell runs, this version is no
    longer reachable under this name.
    NOTE(review): the topic is inferred from raw bag-of-words counts, while
    ``main`` trains the model on a TF-IDF corpus — confirm this is intended.

    Parameters:
        df (pd.DataFrame): Frame with a 'comments' column.
        dictionary: Dictionary used to convert tokens to bag-of-words.
        lda_model: Trained model exposing ``get_document_topics``.

    Returns:
        pd.DataFrame: ``df`` itself, now carrying the 'topic' column.
    """
    def dominant_topic_of(comment_text):
        tokens = preprocess_text(comment_text)
        weighted_topics = lda_model.get_document_topics(dictionary.doc2bow(tokens))
        # Each entry is (topic_id, probability); keep the id with the largest probability.
        top_entry = max(weighted_topics, key=lambda entry: entry[1])
        return top_entry[0]

    df['topic'] = df['comments'].apply(dominant_topic_of)
    return df


In [17]:
def print_top_words(lda_model, num_words=10):
    """Print one line per topic listing that topic's top words.

    Parameters:
        lda_model: Model exposing ``num_topics`` and ``show_topic``.
        num_words (int): How many words to list per topic. Defaults to 10.
    """
    for topic_id in range(lda_model.num_topics):
        ranked_terms = lda_model.show_topic(topic_id, num_words)
        # show_topic yields (word, weight) pairs; only the words are printed.
        joined_terms = ' '.join(term for term, _weight in ranked_terms)
        print(f"Topic #{topic_id}: {joined_terms}")


In [19]:
def main(num_topics: int = 3):
    """Run the topic-modelling pipeline end to end.

    Loads the CSV data, builds the dictionary and TF-IDF corpus from the
    'comments' column, trains the LDA model, tags every comment with its
    dominant topic, and prints both the tagged frame and the top words of
    each topic.

    Parameters:
        num_topics (int): Number of LDA topics to fit. Defaults to 3.

    Returns:
        tuple: ``(df_with_topics, lda_model, dictionary, tfidf_corpus)``.
    """
    # load_data() also returns the usernames table, which this pipeline does not use.
    _usernames_df, comments_df = load_data()

    # Comments are expected in the 'comments' column; adjust the helpers if that changes.
    dictionary, tfidf_corpus = create_dictionary_corpus_with_tfidf(comments_df)

    # Build and fit the LDA model
    lda_model = build_lda_model(tfidf_corpus, dictionary, num_topics=num_topics)

    # Assign topics to documents. This relies on the (df, dictionary, lda_model)
    # signature; a later cell rebinds the same function name with a different one.
    df_with_topics = assign_topics_to_documents(comments_df, dictionary, lda_model)

    # Print the DataFrame with assigned topics
    print(df_with_topics)

    # Print the top words for each topic
    print_top_words(lda_model, num_words=10)

    return (df_with_topics, lda_model, dictionary, tfidf_corpus)




In [20]:

# Run the whole pipeline once and keep its results for the visualisation and labelling cells below.
df_with_topics, lda_model, dictionary, tfidf_corpus = main()

                  username                                           comments  \
0           LoveAGoodTwist  Female, Kentucky.  4 years out. Work equine on...   
1               wahznooski  As a woman of reproductive age, fuck Texas|As ...   
2     Churro_The_fish_Girl  what makes you want to become a vet?|what make...   
3                 abarthch  I see of course there are changing variables, ...   
4               VoodooKing  I have 412+ and faced issues because wireguard...   
...                    ...                                                ...   
3271            B1u3Chips_  I’m looking into applying for veterinary nursi...   
3272           Daktari2018  Good for you for sticking to standards of care...   
3273               Sheepb1  Yes feel free to ask someone to double check, ...   
3274               Elyrath  Same! Helps massively. Errors can still occur,...   
3275         Real_Use_3216  It’s no different than undergrad. School is sc...   

      topic  
0         2  

In [21]:

def visualize_lda_model(lda_model, corpus, dictionary):
    """Prepare a pyLDAvis visualisation of the model and return its display object.

    Parameters:
        lda_model: The trained LDA model.
        corpus: The corpus the visualisation statistics are computed from.
        dictionary: The dictionary matching ``corpus``.

    Returns:
        The object produced by ``pyLDAvis.display`` for the prepared data.
    """
    # `gensim` here is pyLDAvis's gensim helper: the notebook's import cell runs
    # `from pyLDAvis import gensim` after `import gensim`, rebinding the name.
    vis_data = gensim.prepare(lda_model, corpus, dictionary)
    # Build the display object once; a discarded extra display() call only repeats the work.
    return pyLDAvis.display(vis_data)

In [22]:
# Visualize the LDA model
# Render matplotlib output inline in the notebook.
%matplotlib inline
# Turn on pyLDAvis's notebook integration before building the visualisation.
pyLDAvis.enable_notebook()
# Uses the model, TF-IDF corpus and dictionary returned by main().
vis = visualize_lda_model(lda_model, tfidf_corpus, dictionary)

In [23]:
# Bare expression so the notebook shows the visualisation as this cell's output.
vis

In [24]:
# Not used in the final pipeline; this export was only tried to see how it fared.
# df_with_topics.to_csv("training_data.csv", index=False)

In [25]:
def extract_topics(lda_model: LdaModel, num_topics: int, words_per_topic: int) -> List[List[str]]:
    """
    Collect the top words of the first ``num_topics`` topics of an LDA model.

    Parameters:
        lda_model (LdaModel): The trained LDA model.
        num_topics (int): How many topics to read, starting at topic id 0.
        words_per_topic (int): How many top words to take from each topic.

    Returns:
        List[List[str]]: One list of words per topic, indexed by topic id.
    """
    return [
        # show_topic yields (word, weight) pairs; the weights are dropped.
        [term for term, _weight in lda_model.show_topic(topic_idx, topn=words_per_topic)]
        for topic_idx in range(num_topics)
    ]

In [26]:
def assign_topics_to_documents(lda_model: LdaModel, corpus: List[List[Tuple[int, int]]], num_topics: int) -> List[int]:
    """Return the most probable topic id for every document in ``corpus``.

    NOTE(review): this rebinds the name of the earlier DataFrame-based
    ``assign_topics_to_documents`` defined higher up in the notebook.

    Parameters:
        lda_model (LdaModel): Trained model exposing ``get_document_topics``.
        corpus: Iterable of bag-of-words documents.
        num_topics (int): Accepted but not used by this function.

    Returns:
        List[int]: One topic id per document, in corpus order.
    """
    def pick_dominant(bow_doc):
        scored_topics = lda_model.get_document_topics(bow_doc)
        # Entries are (topic_id, probability); keep the id with the highest probability.
        return max(scored_topics, key=lambda entry: entry[1])[0]

    return [pick_dominant(bow_doc) for bow_doc in corpus]

In [27]:
# Extract the top words for each topic, then assign the most likely topic to each document.

# main() trains the model with 3 topics by default; keep this value in sync with it.
num_topics = 3
words_per_topic = 20
top_words_per_topic = extract_topics(lda_model, num_topics, words_per_topic)

# Assign the most likely topic to each document using the lda model
document_topics = assign_topics_to_documents(lda_model, tfidf_corpus, num_topics)


In [None]:
# Inspect the per-document topic ids (one integer per document of tfidf_corpus).
document_topics

In [28]:

# Notebook-wide OpenAI client; the API key is read from the Colab user secret named 'OPENAI_API_KEY'.
client = OpenAI(api_key=userdata.get('OPENAI_API_KEY'))

# Size limits applied to the inputs when a failed request is retried.
_MAX_COMMENT_CHARS = 60000
_MAX_TOPIC_WORDS = 20

# Returned whenever the model's reply cannot be interpreted.
_UNPARSEABLE_RESULT = ("Unknown", "Could not generate a detailed description.")


def _build_prompt(comment: str, topic_words: List[str]) -> str:
    """Render the classification prompt for one comment and its topic keywords."""
    return f"""
    I have a comment that reads as follows:
    "{comment}"

    Based on the comment above, the main topic has the following keywords:
    {', '.join(topic_words)}

    Please provide a detailed description of the topic discussed in the comment, and categorize it under one of the following labels:
    1. Medical Doctor
    This label should only include practicing doctors and or consultants to doctors/clinics.
    Medical school students, nurses or medical professionals who aren’t doctors should go into the “Other” label (C) instead
    2. Veterinarian
    This label should only include practicing vets and/or consultants to vets/clinics.
    Veterinarian students or veterinarian technicians should go into the “Other” label (C) instead
    3. Other
    Anyone who does not fit within the Medical Doctor, or a Veterinarian label.

    Make sure the response is in the following format and in JSON:
    {{
        "topic": "<topic label>",
        "description": "<detailed topic description>"
    }}
    """


def _request_completion(prompt: str):
    """Send one JSON-mode chat completion request through the module-level ``client``."""
    return client.chat.completions.create(
        model="gpt-3.5-turbo",
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        seed=1111,
    )


def _parse_labelled_description(content) -> Tuple[str, str]:
    """Extract ``(topic, description)`` from the reply text, or the fallback pair."""
    try:
        payload = json.loads(content)
        return payload["topic"], payload["description"]
    except (json.JSONDecodeError, KeyError, TypeError):
        # TypeError covers a missing reply (content is None) and valid JSON that
        # is not an object (for example a list or a bare string).
        return _UNPARSEABLE_RESULT


def generate_detailed_description(comment: str, topic_words: List[str]) -> Tuple[str, str]:
    """
    Labels a comment and describes its topic using the OpenAI API.

    The request is attempted once with the full inputs. If it raises, it is
    retried a single time with the comment cut to ``_MAX_COMMENT_CHARS``
    characters and the keyword list cut to ``_MAX_TOPIC_WORDS`` entries; an
    error from that retry propagates to the caller.

    Parameters:
        comment (str): The comment to describe.
        topic_words (List[str]): The list of topic words.

    Returns:
        Tuple[str, str]: ``(topic_label, description)`` taken from the model's
        JSON reply, or ``("Unknown", "Could not generate a detailed
        description.")`` when the reply is missing, is not valid JSON, or
        lacks the expected keys.
    """
    try:
        response = _request_completion(_build_prompt(comment, topic_words))
    except Exception:
        # Deliberately broad, best-effort retry: the most likely failure is an
        # over-long prompt, so shrink the inputs and try exactly once more.
        response = _request_completion(
            _build_prompt(comment[:_MAX_COMMENT_CHARS], topic_words[:_MAX_TOPIC_WORDS])
        )

    # `n` is not set on the request, so a single choice is expected; an empty
    # list is treated as "no reply" instead of raising.
    content = response.choices[0].message.content if response.choices else None
    return _parse_labelled_description(content)



In [29]:

# Clean the 'comments' column and get the label using llm
# document_topics is positional, so this assumes comments_df has the same rows in the
# same order as the corpus the topics were computed from — TODO confirm.
comments_df['topic_id'] = document_topics
# Strip leading/trailing whitespace and collapse every whitespace run to a single space.
comments_df['comments'] = comments_df['comments'].str.strip().str.replace(r'\s+', ' ', regex=True)
# Look up the keyword list that belongs to each row's topic id.
comments_df['topic_words'] = comments_df['topic_id'].apply(lambda x: top_words_per_topic[x])
# Calls the OpenAI-backed helper for every row. The helper returns a (label, description)
# pair, so each cell of this column holds that pair rather than a plain string.
comments_df['detailed_description'] = comments_df.apply(
    lambda row: generate_detailed_description(row['comments'], row['topic_words']), axis=1
)


In [30]:
# Write the labelled comments, including the columns added above, to CSV without the index column.
comments_df.to_csv("training_data.csv", index = False)