Step 1: Install Required Libraries

Step 2: importing datasets

In [9]:
import pandas as pd
import json

# Load the JSON data
file_path = 'Reddit_data_train.json'  # Replace with the actual file path
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Flatten the JSON data into a list of dictionaries
flattened_data = []
for user_id, posts in data.items():
    for post in posts:
        flattened_data.append({'user_id': user_id, 'user_post': post['text']})

# Convert the list to a DataFrame
df = pd.DataFrame(flattened_data)

# Display the first few rows of the DataFrame
print(df.head())


    user_id                                          user_post
0  S1NCL41R  Here is what I don’t get with this advice.  Wi...
1  S1NCL41R  from a financial standpoint, buy an older hond...
2  S1NCL41R  Correct.  The point is that he can’t take bene...
3  S1NCL41R  The provider can provide you a hardship waiver...
4  S1NCL41R  might i suggest that a key to this problem act...


In [10]:
df.to_csv('user_data.csv', index=False)

Step 3: Topic modelling, finding key topics

In [35]:
import pandas as pd
import openai
from langchain.retrievers import WikipediaRetriever
import os
from dotenv import load_dotenv

load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")

# Set up OpenAI API credentials
openai.api_key = 'api_key'

# Load the dataset with user IDs and posts
data = pd.read_csv('user_data.csv')

# Initialize the WikipediaRetriever
retriever = WikipediaRetriever()

# Define the prompt for topic modeling
topic_modeling_prompt = """
Given the following user post:
"{post}"

Generate up to 10 key topics related to the post.
"""

# Define the prompt for user profiling
user_profiling_prompt = """
Given the following user post:
"{post}"

Based on the generated topics:
{topics}

Assign a topic score to the user for each topic.
"""

# Perform topic modeling and user profiling for each user
user_profiles = []

for index, row in data.iterrows():
    user_id = row['user_id']
    user_post = row['post']
    
    # Generate topics using OpenAI's language model
    topic_modeling_input = topic_modeling_prompt.format(post=post)
    topic_modeling_response = openai.Completion.create(
        engine='gpt-3.5-turbo-1106',
        prompt=topic_modeling_input,
        max_tokens=100,
        n=10,  # Generate up to 10 topics
        stop=None,
        temperature=0.7
    )
    generated_topics = topic_modeling_response.choices[0].text.strip().split('\n')
    
    # Verify topics using the WikipediaRetriever
    verified_topics = []
    for topic in generated_topics:
        docs = retriever.get_relevant_documents(query=topic)
        if docs:
            verified_topics.append(topic)
    
    # Perform user profiling using the verified topics
    user_profiling_input = user_profiling_prompt.format(post=user_post, topics='\n'.join(verified_topics))
    user_profiling_response = openai.Completion.create(
        engine='gpt-3.5-turbo-1106',
        prompt=user_profiling_input,
        max_tokens=100,
        n=1,  # Generate a single response
        stop=None,
        temperature=0.7
    )
    topic_scores = user_profiling_response.choices[0].text.strip().split('\n')
    
    # Create the user profile
    user_profile = {
        'user_id': user_id,
        'topics': verified_topics,
        'topic_scores': topic_scores
    }
    
    user_profiles.append(user_profile)

# Convert user profiles to DataFrame
user_profiles_df = pd.DataFrame(user_profiles)

# Save identified topics to a CSV file
identified_topics_df = pd.DataFrame({'Identified Topics': user_profiles_df['topics'].explode().unique()})
identified_topics_df.to_csv('identified_topics.csv', index=False)

# Save user profiles to a CSV file
user_profiles_df.to_csv('user_profiles.csv', index=False)

APIRemovedInV1: 

You tried to access openai.Completion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742


In [37]:
import pandas as pd
import openai
from langchain.retrievers import WikipediaRetriever
import os
from dotenv import load_dotenv

load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")

# Set up OpenAI API credentials
openai.api_key = api_key  # Use the API key loaded from environment variables

# Load the dataset with user IDs and posts
data = pd.read_csv('user_data.csv')

# Initialize the WikipediaRetriever
retriever = WikipediaRetriever()

# Define the prompt for topic modeling
topic_modeling_prompt = """
Given the following user post:
"{post}"

Generate up to 10 key topics related to the post.
"""

# Define the prompt for user profiling
user_profiling_prompt = """
Given the following user post:
"{post}"

Based on the generated topics:
{topics}

Assign a topic score to the user for each topic.
"""

# Perform topic modeling and user profiling for each user
user_profiles = []

for index, row in data.iterrows():
    user_id = row['user_id']
    user_post = row['user_post']
    
    # Generate topics using OpenAI's language model
    topic_modeling_input = topic_modeling_prompt.format(post=user_post)
    topic_modeling_response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that generates topics based on user posts."},
            {"role": "user", "content": topic_modeling_input},
        ],
        max_tokens=100,
    )
    generated_topics = topic_modeling_response.choices[0].message["content"].strip().split('\n')
    
    # Verify topics using the WikipediaRetriever
    verified_topics = []
    for topic in generated_topics:
        docs = retriever.get_relevant_documents(query=topic)
        if docs:
            verified_topics.append(topic)
    
    # Perform user profiling using the verified topics
    user_profiling_input = user_profiling_prompt.format(post=user_post, topics='\n'.join(verified_topics))
    user_profiling_response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that assigns topic scores to user posts."},
            {"role": "user", "content": user_profiling_input},
        ],
        max_tokens=100,
    )
    topic_scores = user_profiling_response.choices[0].message["content"].strip().split('\n')
    
    # Create the user profile
    user_profile = {
        'user_id': user_id,
        'topics': verified_topics,
        'topic_scores': topic_scores
    }
    
    user_profiles.append(user_profile)

# Convert user profiles to DataFrame
user_profiles_df = pd.DataFrame(user_profiles)

# Save identified topics to a CSV file
identified_topics_df = pd.DataFrame({'Identified Topics': user_profiles_df['topics'].explode().unique()})
identified_topics_df.to_csv('identified_topics.csv', index=False)

# Save user profiles to a CSV file
user_profiles_df.to_csv('user_profiles.csv', index=False)


APIRemovedInV1: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742


In [41]:
import pandas as pd
from openai import OpenAI
from langchain.retrievers import WikipediaRetriever
import os
from dotenv import load_dotenv

# Load the dataset with user IDs and posts
data = pd.read_csv('user_data.csv')

# Initialize the OpenAI client
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Define the prompt for topic modeling
topic_modeling_prompt = """
Given the following user post:
"{post}"

Generate up to 10 key topics related to the post.
"""

# Define the prompt for user profiling
user_profiling_prompt = """
Given the following user post:
"{post}"

Based on the generated topics:
{topics}

Assign a topic score to the user for each topic.
"""

# Perform topic modeling and user profiling for each user
user_profiles = []

for index, row in data.iterrows():
    user_id = row['user_id']
    user_post = row['user_post']
    
    # Generate topics using OpenAI's GPT-3.5 model
    topic_modeling_input = topic_modeling_prompt.format(post=user_post)
    topic_modeling_response = client.chat.completions.create(
        messages=[
            {"role": "user", "content": topic_modeling_input},
        ],
        model="gpt-3.5-turbo",  # Use the GPT-3.5 model
        temperature=0
    )
    generated_topics = topic_modeling_response.choices[0].message.content.strip().split('\n')
    
    # Verify topics using the WikipediaRetriever
    verified_topics = []
    for topic in generated_topics:
        docs = retriever.get_relevant_documents(query=topic)
        if docs:
            verified_topics.append(topic)
    
    # Perform user profiling using the verified topics
    user_profiling_input = user_profiling_prompt.format(post=user_post, topics='\n'.join(verified_topics))
    user_profiling_response = client.chat.completions.create(
        messages=[
            {"role": "user", "content": user_profiling_input},
        ],
        model="gpt-3.5-turbo",  # Use the GPT-3.5 model
        temperature=0,
        timeout=60
    )
    topic_scores = user_profiling_response.choices[0].message.content.strip().split('\n')
    
    # Create the user profile
    user_profile = {
        'user_id': user_id,
        'topics': verified_topics,
        'topic_scores': topic_scores
    }
    
    user_profiles.append(user_profile)

# Convert user profiles to DataFrame
user_profiles_df = pd.DataFrame(user_profiles)

# Save identified topics to a CSV file
identified_topics_df = pd.DataFrame({'Identified Topics': user_profiles_df['topics'].explode().unique()})
identified_topics_df.to_csv('identified_topics.csv', index=False)

# Save user profiles to a CSV file
user_profiles_df.to_csv('user_profiles.csv', index=False)


KeyboardInterrupt: 

In [45]:
import pandas as pd
from openai import OpenAI
import os
from dotenv import load_dotenv
import concurrent.futures

# Load the dataset with user IDs and posts
data = pd.read_csv('user_data2.csv')

# Initialize the OpenAI client
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Define the prompt for topic modeling
topic_modeling_prompt = """
Given the following user post:
"{post}"

Generate up to 10 key topics related to the post.
"""

# Define the prompt for user profiling
user_profiling_prompt = """
Given the following user post:
"{post}"

Based on the generated topics:
{topics}

Assign a topic score to the user for each topic.
"""

# Function to create batches from the dataset
def create_batches(dataframe, batch_size):
    for i in range(0, len(dataframe), batch_size):
        yield dataframe.iloc[i:i + batch_size]

# Set batch size
BATCH_SIZE = 10  # Adjust based on your requirements and API limitations

# Function to process a batch of user posts
def process_batch(batch):
    batch_results = []
    for _, row in batch.iterrows():
        user_id = row['user_id']
        user_post = row['user_post']
        
        # Generate topics using OpenAI's GPT-3.5 model
        topic_modeling_input = topic_modeling_prompt.format(post=user_post)
        topic_modeling_response = client.chat.completions.create(
            messages=[
                {"role": "user", "content": topic_modeling_input},
            ],
            model="gpt-3.5-turbo",  # Use the GPT-3.5 model
            temperature=0
        )
        generated_topics = topic_modeling_response.choices[0].message.content.strip().split('\n')
        
        # Perform user profiling using the generated topics
        user_profiling_input = user_profiling_prompt.format(post=user_post, topics='\n'.join(generated_topics))
        user_profiling_response = client.chat.completions.create(
            messages=[
                {"role": "user", "content": user_profiling_input},
            ],
            model="gpt-3.5-turbo",  # Use the GPT-3.5 model
            temperature=0,
            timeout=60
        )
        topic_scores = user_profiling_response.choices[0].message.content.strip().split('\n')
        
        # Create the user profile
        user_profile = {
            'user_id': user_id,
            'topics': generated_topics,
            'topic_scores': topic_scores
        }
        batch_results.append(user_profile)
    return batch_results

# Perform topic modeling and user profiling in parallel for batches
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Create batches
    batches = list(create_batches(data, BATCH_SIZE))
    
    # Process batches in parallel
    results = executor.map(process_batch, batches)

# Flatten the results from batches
user_profiles = [profile for batch in results for profile in batch]

# Convert user profiles to DataFrame
user_profiles_df = pd.DataFrame(user_profiles)

# Save identified topics to a CSV file
identified_topics_df = pd.DataFrame({'Identified Topics': user_profiles_df['topics'].explode().unique()})
identified_topics_df.to_csv('identified_topics.csv', index=False)

# Save user profiles to a CSV file
user_profiles_df.to_csv('user_profiles.csv', index=False)


In [46]:
import pandas as pd
from openai import OpenAI
import os
from dotenv import load_dotenv
import concurrent.futures

# Load the dataset with user IDs and posts
data = pd.read_csv('user_data2.csv')

# Initialize the OpenAI client
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Define the prompt for topic modeling
topic_modeling_prompt = """
Given the following user post:
"{post}"

Generate up to 10 key topics related to the post.
"""

# Define the prompt for user profiling
user_profiling_prompt = """
Given the following user post:
"{post}"

Based on the generated topics:
{topics}

Assign a topic score to the user for each topic.
"""

# Function to create batches from the dataset
def create_batches(dataframe, batch_size):
    for i in range(0, len(dataframe), batch_size):
        yield dataframe.iloc[i:i + batch_size]

# Set batch size
BATCH_SIZE = 10  # Adjust based on your requirements and API limitations

# Function to process a batch of user posts and collect topics
def process_batch(batch):
    batch_results = []
    all_topics = []  # Collect all topics from each user post
    for _, row in batch.iterrows():
        user_id = row['user_id']
        user_post = row['user_post']
        
        # Generate topics using OpenAI's GPT-3.5 model
        topic_modeling_input = topic_modeling_prompt.format(post=user_post)
        topic_modeling_response = client.chat.completions.create(
            messages=[
                {"role": "user", "content": topic_modeling_input},
            ],
            model="gpt-3.5-turbo",  # Use the GPT-3.5 model
            temperature=0
        )
        generated_topics = topic_modeling_response.choices[0].message.content.strip().split('\n')
        all_topics.extend(generated_topics)  # Add generated topics to the all_topics list
        
        # Perform user profiling using the generated topics
        user_profiling_input = user_profiling_prompt.format(post=user_post, topics='\n'.join(generated_topics))
        user_profiling_response = client.chat.completions.create(
            messages=[
                {"role": "user", "content": user_profiling_input},
            ],
            model="gpt-3.5-turbo",  # Use the GPT-3.5 model
            temperature=0,
            timeout=60
        )
        topic_scores = user_profiling_response.choices[0].message.content.strip().split('\n')
        
        # Create the user profile
        user_profile = {
            'user_id': user_id,
            'topics': generated_topics,
            'topic_scores': topic_scores
        }
        batch_results.append(user_profile)
    return batch_results, all_topics

# Perform topic modeling and user profiling in parallel for batches
all_topics_collected = []  # List to collect topics from all batches
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Create batches
    batches = list(create_batches(data, BATCH_SIZE))
    
    # Process batches in parallel
    batch_results = executor.map(process_batch, batches)
    
    # Flatten the results from batches and collect topics
    user_profiles = []
    for batch_result, batch_topics in batch_results:
        user_profiles.extend(batch_result)
        all_topics_collected.extend(batch_topics)

# Analyze and select the top 20 topics
top_20_topics = pd.Series(all_topics_collected).value_counts().head(20).index.tolist()

# Convert user profiles to DataFrame
user_profiles_df = pd.DataFrame(user_profiles)

# Save top 20 identified topics to a CSV file
identified_topics_df = pd.DataFrame({'Identified Topics': top_20_topics})
identified_topics_df.to_csv('identified_topics_v2.csv', index=False)

# Save user profiles to a CSV file
user_profiles_df.to_csv('user_profiles_v2.csv', index=False)


In [47]:
import pandas as pd
from openai import OpenAI
import os
from dotenv import load_dotenv
import concurrent.futures
from collections import Counter

# Load the dataset with user IDs and posts
data = pd.read_csv('user_data2.csv')

# Initialize the OpenAI client
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Define the prompt for topic modeling
topic_modeling_prompt = """
Given the following user post:
"{post}"

Generate key topics related to the post.
"""

# Set batch size
BATCH_SIZE = 10  # Adjust based on your requirements and API limitations

# Function to create batches from the dataset
def create_batches(dataframe, batch_size):
    for i in range(0, len(dataframe), batch_size):
        yield dataframe.iloc[i:i + batch_size]

# Function to process a batch of user posts and collect topics
def process_batch(batch):
    user_topics = {}  # Dictionary to store topics for each user
    post_profiles = []
    all_topics = []   # List to collect all topics
    for _, row in batch.iterrows():
        user_id = row['user_id']
        user_post = row['user_post']

        # Generate topics using OpenAI's GPT-3.5 model
        topic_modeling_input = topic_modeling_prompt.format(post=user_post)
        topic_modeling_response = client.chat.completions.create(
            messages=[
                {"role": "user", "content": topic_modeling_input},
            ],
            model="gpt-3.5-turbo",
            temperature=0
        )
        generated_topics = topic_modeling_response.choices[0].message.content.strip().split('\n')[:10]  # Limit to 10 topics per post
        single_word_topics = [topic for topic in generated_topics if len(topic.split()) == 1]  # Filter for single-word topics

        # Aggregate topics for each user
        if user_id not in user_topics:
            user_topics[user_id] = []
        user_topics[user_id].extend(single_word_topics)

        # Create post profile with associated topics
        post_profile = {
            'user_id': user_id,
            'user_post': user_post,
            'topics': single_word_topics
        }
        post_profiles.append(post_profile)

        # Collect all single-word topics
        all_topics.extend(single_word_topics)

    return user_topics, post_profiles, all_topics

# Perform topic modeling in parallel for batches
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Create batches
    batches = list(create_batches(data, BATCH_SIZE))

    # Process batches in parallel
    batch_results = executor.map(process_batch, batches)

    # Flatten the results from batches and aggregate user topics
    aggregated_user_topics = {}
    all_post_profiles = []
    all_topics_collected = []
    for user_topics, post_profiles, all_topics in batch_results:
        for user_id, topics in user_topics.items():
            if user_id not in aggregated_user_topics:
                aggregated_user_topics[user_id] = Counter()
            aggregated_user_topics[user_id].update(topics)
        all_post_profiles.extend(post_profiles)
        all_topics_collected.extend(all_topics)

# Create user profiles with the most frequent single-word topics
user_profiles = []
for user_id, topics_counter in aggregated_user_topics.items():
    top_topics = [topic for topic, _ in topics_counter.most_common(10)]  # Get the top 10 topics
    user_profiles.append({'user_id': user_id, 'topics': top_topics})

# Identify the top 20 single-word topics used across all posts
top_20_topics = pd.Series(all_topics_collected).value_counts().head(20).index.tolist()

# Convert to DataFrames
user_profiles_df = pd.DataFrame(user_profiles)
post_profiles_df = pd.DataFrame(all_post_profiles)
top_20_topics_df = pd.DataFrame({'Top 20 Topics': top_20_topics})

# Save user profiles to a CSV file
user_profiles_df.to_csv('user_profiles_merged_v3.csv', index=False)

# Save post profiles with topics to a CSV file
post_profiles_df.to_csv('post_profiles_merged_v3.csv', index=False)

# Save the top 20 topics to a CSV file
top_20_topics_df.to_csv('top_20_topics_merged.csv', index=False)
