In [None]:
import json
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import scipy.stats as stats
import seaborn as sns
from tqdm import tqdm
import torch


In [None]:
# Mount Google Drive at /content/drive; the dataset paths used below live under ./drive/MyDrive.
# NOTE(review): this only works inside Google Colab — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# from zipfile import ZipFile
# with ZipFile('./drive/MyDrive/DevGPT.zip', 'r') as zipObj:
#   zipObj.extractall('./drive/MyDrive/DevGPT')


In [None]:
# Collect the path of every DevGPT snapshot folder.
DEVGPT_ROOT = './drive/MyDrive/DevGPT/DevGPT'

# sorted(): os.listdir returns entries in arbitrary order, and everything loaded
# later (including positional look-ups into the loaded records) inherits this
# order, so fix it to keep re-runs reproducible.
dir_list = [
    f'{DEVGPT_ROOT}/{entry}'
    for entry in sorted(os.listdir(DEVGPT_ROOT))
    if 'snapshot' in entry
]

print(dir_list)


In [None]:
# NOTE(review): Google Drive is already mounted by an identical cell earlier in this
# notebook; this repeated mount looks redundant — confirm it can be removed.
from google.colab import drive
drive.mount('/content/drive')

#### Loading Json File by Path

In [None]:
def load_json(file_path):
    """Read a JSON file and return the parsed Python object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the locale's default
    # encoding, which can mis-decode non-ASCII text on some platforms.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def execute_in_all(filelist, func):
    """Apply ``func`` to every JSON file in the given folders and concatenate the results.

    Args:
        filelist: Iterable of folder paths to scan (non-recursively).
        func: Callable that receives the parsed content of one JSON file and
            returns an iterable of results.

    Returns:
        A single list holding the results of all files, folder by folder and,
        within a folder, in sorted file-name order.
    """
    result_list = []
    for folder in filelist:
        # sorted(): os.listdir gives no ordering guarantee; sorting keeps the
        # result order stable between runs.
        for filename in sorted(os.listdir(folder)):
            if not filename.endswith('.json'):
                continue
            data = load_json(os.path.join(folder, filename))
            result_list.extend(func(data))
    return result_list


### RQ1. What is the distribution of the different issues that developers present to ChatGPT, based on the ChatGPT links shared on GitHub?

#### Method 1: Visualization of the Different Types of GitHub Links

In [None]:
type_list = []

def count_type(data):
    """Return one ``(Type, timestamp)`` tuple per source in ``data['Sources']``.

    The timestamp comes from the source's 'CreatedAt' field when that key is
    present and from its 'AuthorAt' field otherwise.
    """
    return [
        (src['Type'], src['CreatedAt'] if 'CreatedAt' in src else src['AuthorAt'])
        for src in data['Sources']
    ]

type_list += execute_in_all(filelist=dir_list, func=count_type)

In [None]:
# Total number of sources per type. Each value is a one-element list so that the
# dict converts into a single-row DataFrame (row label 0).
# The loop variables are named explicitly instead of `type` / `time`, so the
# builtin `type` is no longer rebound to a tuple after this cell runs.
type_dict = {}
for source_type, _ in type_list:
    type_dict.setdefault(source_type, [0])[0] += 1

print(type_dict)

# Number of sources per month and type: {month: {type: count}}, where the month
# is the first seven characters of the timestamp.
time_dict = {}
for source_type, timestamp in type_list:
    month = timestamp[:7]
    month_counts = time_dict.setdefault(month, {})
    month_counts[source_type] = month_counts.get(source_type, 0) + 1

print(time_dict)

In [None]:
# One column per source type; the single row (label 0) holds that type's total count.
type_df = pd.DataFrame.from_dict(type_dict)
# One column per month key of time_dict, one row per source type; combinations
# that never occurred are NaN.
time_df = pd.DataFrame.from_dict(time_dict)

In [None]:
# Order the month columns by their labels.
time = time_df.sort_index(axis=1)
print(time)

# Sum the row at position 5 across all months.
# NOTE(review): 5 is a hard-coded row position; which source type it selects depends
# on the row order of time_df — confirm the intended category.
row_sum = time.iloc[5].sum()
print(row_sum)

In [None]:
# Bar chart of the total number of sources per type.
ax = type_df.plot.bar(legend=True)
ax.set_ylabel('Counts')
ax.set_title('GPT Share Link')
ax.figure.tight_layout()
plt.show()

In [None]:
# Work on a copy: `df = time_df` would only alias the frame, so assigning
# df.columns would silently rewrite time_df's column labels as well.
df = time_df.copy()

df.columns = pd.to_datetime(df.columns)

# Chronological column order, restricted to 2022 and later.
df = df.sort_index(axis=1)
df = df.loc[:, df.columns.year >= 2022]

# Plot scatter plot: one series per source type (row of df)
fig, ax = plt.subplots(figsize=(10, 6))
for category in df.index:
    ax.scatter(df.columns, df.loc[category], label=category)

# Add legend and labels
ax.legend()
ax.set_xlabel('Date')
ax.set_ylabel('Counts')
ax.set_title('Counts of Categories Over Time')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Find data from 2022 Jan
def find_2022(data):
    """Return the sources in ``data['Sources']`` whose timestamp starts with '2022-01'.

    The timestamp is the source's 'CreatedAt' field when present, otherwise its
    'AuthorAt' field.
    """
    def _timestamp(src):
        return src['CreatedAt'] if 'CreatedAt' in src else src['AuthorAt']

    return [src for src in data['Sources'] if _timestamp(src)[:7] == '2022-01']

share_2022 = execute_in_all(filelist=dir_list, func=find_2022)

In [None]:
import pprint
# Pretty-print the first source collected for January 2022.
pprint.pprint(share_2022[0])
# Keep the same formatted text as a string.
# NOTE(review): pretty_dict_str is not read again in the visible part of the notebook.
pretty_dict_str = pprint.pformat(share_2022[0])

##### Conclusion 1
Users often respond to old posts with ChatGPT links, which makes the creation date of the post an inaccurate estimate of when ChatGPT was actually used.
To do: use the date of the ChatGPT conversation to estimate the time instead.

In [None]:
type_list = []

def count_type(data):
    """Return one ``(Type, DateOfConversation)`` tuple per dated ChatGPT share.

    Every entry of each source's 'ChatgptSharing' list that has a
    'DateOfConversation' key contributes one tuple; shares without that key are
    skipped. This definition replaces the earlier ``count_type`` of the notebook.
    """
    return [
        (source['Type'], share['DateOfConversation'])
        for source in data['Sources']
        for share in source['ChatgptSharing']
        if 'DateOfConversation' in share
    ]

type_list += execute_in_all(filelist=dir_list, func=count_type)

In [None]:
# Number of ChatGPT conversations per month and source type:
# {"Month YYYY": {type: count}}.
time_dict = {}
for source_type, conversation_date in type_list:
    # Only the date parsing is guarded, and only against the errors it is
    # expected to raise, so unrelated bugs are no longer silently swallowed.
    try:
        parsed_date = pd.to_datetime(conversation_date, format="%B %d, %Y")
        month_label = parsed_date.strftime("%B %Y")
    except (ValueError, TypeError):
        # Dates that are missing or not in the "Month D, YYYY" format are skipped.
        continue

    month_counts = time_dict.setdefault(month_label, {})
    month_counts[source_type] = month_counts.get(source_type, 0) + 1

print(time_dict)

In [None]:
# Rebuild time_df from the conversation-date counts: one column per month key of
# time_dict, one row per source type; combinations that never occurred are NaN.
time_df = pd.DataFrame.from_dict(time_dict)

In [None]:
# Work on a copy: `df = time_df` would only alias the frame, so assigning
# df.columns would silently rewrite time_df's column labels as well.
df = time_df.copy()

df.columns = pd.to_datetime(df.columns)

# Chronological column order.
df = df.sort_index(axis=1)
# df = df.loc[:, df.columns.year >= 2022]

# Fixed colour per source type so that all charts use the same palette.
category_colors = {
    'hacker news': 'red',
    'pull request': 'blue',
    'issue': 'green',
    'discussion': 'orange',
    'commit': 'purple',
    'code file': 'brown'
}

# Plot scatter plot: one series per source type (row of df)
fig, ax = plt.subplots(figsize=(10, 6))
for category in df.index:
    color = category_colors.get(category, 'black')  # Default to black if color not defined
    ax.scatter(df.columns, df.loc[category], label=category, color=color)

    # Add lines connecting points
    ax.plot(df.columns, df.loc[category], linestyle='-', alpha=0.5, color=color)

# Add legend and labels
ax.legend()
ax.set_xlabel('Date')
ax.set_ylabel('Counts')
ax.set_title('Counts of Categories Over Time')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

##### Conclusion 2
There is a significant increase in ChatGPT usage in June and July. However, the decrease after August might be an artifact of the fact that more data was collected during August.
Observations from the chart:
 - People were more interested in discussing and solving issues using ChatGPT in May and June.
 - There is a significant increase in PRs involving ChatGPT links in July and August (this might suggest that users are using ChatGPT for coding or code reviewing).
    - This suggests higher confidence in ChatGPT's coding ability.
 - There is a high amount of usage in code files. This covers many different areas, e.g. ChatGPT-related toolkits and demos, code initiated with ChatGPT, ChatGPT-assisted documentation, etc.
    - This means that code files are not directly related to coding questions, since the majority of them are documentation related.
 - There is little discussion; this might be because people don't tend to use GitHub Discussions.
    - In contrast, the high number of issues shows that ChatGPT was adopted for work-related scenarios very quickly.
 - ChatGPT usage in PRs might have different reasons, e.g. coding style, update requests, etc. It can be considered similar to issues.

In [None]:
# Total count per source type over the whole dataset (type_df has a single row, label 0).
category_counts = type_df.loc[0]

# Same colour per category as in the time-series chart; unknown categories are black.
pie_colors = [category_colors.get(cat, 'black') for cat in category_counts.index]

fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(category_counts, labels=category_counts.index, colors=pie_colors, autopct='%1.1f%%', startangle=140)
ax.set_title('Distribution of Categories')
ax.axis('equal')  # Equal aspect ratio keeps the pie circular.
plt.show()

In [None]:
# Counts per category for May 2023 (rows of new_df are months, columns are categories).
new_df = df.T
category_counts = new_df.loc['2023-05-01']

# Same colour per category as in the time-series chart; unknown categories are black.
pie_colors = [category_colors.get(cat, 'black') for cat in category_counts.index]

fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(category_counts, labels=category_counts.index, colors=pie_colors, autopct='%1.1f%%', startangle=140)
ax.set_title('Distribution of Categories')
ax.axis('equal')  # Equal aspect ratio keeps the pie circular.
plt.show()

In [None]:
# Counts per category for July 2023 (rows of new_df are months, columns are categories).
new_df = df.T
category_counts = new_df.loc['2023-07-01']

# Same colour per category as in the time-series chart; unknown categories are black.
pie_colors = [category_colors.get(cat, 'black') for cat in category_counts.index]

fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(category_counts, labels=category_counts.index, colors=pie_colors, autopct='%1.1f%%', startangle=140)
ax.set_title('Distribution of Categories')
ax.axis('equal')  # Equal aspect ratio keeps the pie circular.
plt.show()

In [None]:
# Counts per category for September 2023 (rows of new_df are months, columns are categories).
new_df = df.T
category_counts = new_df.loc['2023-09-01']

# Same colour per category as in the time-series chart; unknown categories are black.
pie_colors = [category_colors.get(cat, 'black') for cat in category_counts.index]

fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(category_counts, labels=category_counts.index, colors=pie_colors, autopct='%1.1f%%', startangle=140)
ax.set_title('Distribution of Categories')
ax.axis('equal')  # Equal aspect ratio keeps the pie circular.
plt.show()

##### Conclusion 3
Observations from the pie charts:
 - In May, most ChatGPT usage is discussion related.
 - In July, there is a significant increase in ChatGPT usage for directly coding-related topics.
 - In September, direct usage in PRs decreased while usage in commits increased.
     - This might suggest that ChatGPT usage became less welcome in code bases with multiple maintainers.
     - Whereas for smaller, independent projects and general coding, ChatGPT is considered a sufficient tool.

In [None]:
# Pairwise correlation between the columns of new_df, i.e. between the monthly
# count series of the source categories (new_df is the transposed df).
correlation_matrix = new_df.corr()
print(correlation_matrix)

#### Conclusion 4
Based on the correlation analysis:
 - hacker news: strong correlation with issues and discussions, considering that they are all discussion related.
 - code file: strong correlation with everything but commits, reflecting that the nature of code files can be very diverse.
 - commit: low correlation with the other categories. The usage of ChatGPT in commits often indicates the direct practice of using ChatGPT for coding (referencing a ChatGPT link in the code).
    - The comparatively low correlation with the other types might suggest that most other types of ChatGPT usage are discussion related.
    - The change in the amount/percentage of commits might show developers' overall confidence level in using ChatGPT as a coding assistant.

# RQ#2

1. Here we want to find a way to tell what symbolizes satisfaction with a response (e.g. no more prompting, a thank you).

2. Use that information to see what was said before.

In [None]:

#get all data

def find_2022(data):
    """Return every source in ``data['Sources']`` as a new list.

    Despite its name, this version applies no date filter; it replaces the
    earlier ``find_2022`` of the notebook and is used to collect all sources.
    """
    return list(data['Sources'])

all_data = execute_in_all(filelist=dir_list, func=find_2022)

In [None]:
pprint.pprint(all_data[1])

In [None]:
pprint.pprint(all_data[1]['ChatgptSharing'][0]['Conversations'][0]['Answer']) #example of answer access

#


In [None]:
pprint.pprint(all_data[1]['ChatgptSharing'][0]['Conversations'])

Note: the data is ordered by when each prompt was asked.


In [None]:
# Summary statistics of 'NumberOfPrompts' over all ChatGPT sharing entries.
max_prompts = 0
total_prompts = 0
num_missing = 0           # sharing entries without a 'NumberOfPrompts' key
num_entries = 0           # sources in all_data
num_sharings = 0          # sharing entries across all sources
list_of_num_prompts = []  # one value per sharing entry that has the key

for entry in all_data:
    num_entries += 1

    for sharing in entry['ChatgptSharing']:
        num_sharings += 1

        # Check if 'NumberOfPrompts' key exists in the dictionary
        if 'NumberOfPrompts' in sharing:
            num_prompts = sharing['NumberOfPrompts']
            list_of_num_prompts.append(num_prompts)
            total_prompts += num_prompts
            max_prompts = max(max_prompts, num_prompts)
        else:
            num_missing += 1
            if 'Conversations' in sharing:
                print(sharing['Conversations'])

# Averages and shares use the population they describe as the denominator
# (sharing entries, not sources) and are guarded against an empty dataset.
if list_of_num_prompts:
    average_prompts = total_prompts / len(list_of_num_prompts)
else:
    average_prompts = 0
if num_sharings:
    missing_percentage = 100 * num_missing / num_sharings
else:
    missing_percentage = 0

print("The maximum number of prompts is:", max_prompts)
print("The total number of prompts is:", total_prompts)
print("The average number of prompts per sharing entry is:", average_prompts)
print("The number of sharing entries missing the 'NumberOfPrompts' key is:", num_missing)
print("The number of sources is:", num_entries)
print("The number of sharing entries is:", num_sharings)
print("The percentage of sharing entries missing the key is:", missing_percentage)



Clean the data: skip sharing entries that have no conversations, and strip the ' and " characters from prompts and answers.

In [None]:

# Strip single and double quote characters from every prompt and answer, in place.
# tqdm wraps the list itself (not enumerate) so the progress bar knows the total.
for entry in tqdm(all_data):
    for sharing in entry['ChatgptSharing']:
        # Sharing entries without conversations have nothing to clean.
        if 'Conversations' not in sharing:
            continue
        for conversation in sharing['Conversations']:
            # Conversations lacking a prompt or an answer are kept unchanged.
            if 'Prompt' in conversation and 'Answer' in conversation:
                conversation['Prompt'] = conversation['Prompt'].replace('\'', '').replace('\"', '')
                conversation['Answer'] = conversation['Answer'].replace('\"', '').replace('\'', '')



pprint.pprint(all_data[1]) #example of cleaned data





In [None]:



# Local import: Counter is only needed for this tally.
from collections import Counter

# Get the frequency of each number of prompts with a single pass over the data
# (calling list.count once per value would rescan the whole list every time).
prompt_count_frequency = Counter(list_of_num_prompts)

# Create the x-axis values (number of prompts)
x = list(range(1, max_prompts+1))
frequency = [prompt_count_frequency[i] for i in x]

# Plot the graph
plt.bar(x, frequency)
plt.xlabel('Number of Prompts')
plt.ylabel('Frequency')
plt.title('Number of Prompts by Frequency')
# The visible range is capped at 100 prompts and a frequency of 5000.
plt.xticks(np.arange(0, 100, step=5))
plt.xlim(1, 100)
plt.ylim(0, 5000)
plt.show()
# One row per sharing entry; the 'Frequency' column holds that entry's number of
# prompts (not a frequency), so describe() summarises the prompt counts.
df_prompts = pd.DataFrame({'Frequency': list_of_num_prompts})

# Get the statistics of the number of prompts
statistics = df_prompts.describe()

print(statistics)



Conclusion 1: the majority of the conversations fall within 5 prompts, so most requests are likely resolved within 5 prompts.

Next, I want to get the average sentiment over the course of a conversation.

In [None]:
%pip install tqdm

In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from tqdm import tqdm
from collections import defaultdict


def _tfidf_with_average_similarity(texts, vectorizer):
    """Fit ``vectorizer`` on ``texts`` and average the pairwise cosine similarity.

    Returns a ``(tfidf_matrix, average_similarity)`` tuple. ``tfidf_matrix`` is
    None when there are no texts or the vectorizer cannot build a vocabulary from
    them; ``average_similarity`` is None whenever fewer than two texts are available.
    """
    if not texts:
        return None, None
    try:
        tfidf_matrix = vectorizer.fit_transform(texts)
    except ValueError:
        # Raised for an empty vocabulary, e.g. when every text at this position
        # is empty or contains no token the vectorizer keeps.
        return None, None

    n_docs = tfidf_matrix.shape[0]
    if n_docs < 2:
        return tfidf_matrix, None

    similarity_matrix = cosine_similarity(tfidf_matrix)
    # Zero the diagonal so a text's similarity with itself is not counted;
    # n_docs * (n_docs - 1) is the number of remaining ordered pairs.
    np.fill_diagonal(similarity_matrix, 0)
    return tfidf_matrix, np.sum(similarity_matrix) / (n_docs * (n_docs - 1))


conversations = {}

# Position of a turn inside its conversation -> list of (Prompt, Answer) tuples
# collected from every conversation that has a turn at that position.
conversations_by_index = defaultdict(list)

for entry in tqdm(all_data):
    for sharing in entry.get('ChatgptSharing', []):
        for idx, conv in enumerate(sharing.get('Conversations', [])):
            # Turns lacking a prompt or an answer are skipped (the cleaning step
            # keeps them in the data), instead of failing with a KeyError.
            if 'Prompt' in conv and 'Answer' in conv:
                conversations_by_index[idx].append((conv['Prompt'], conv['Answer']))

# Initialize TF-IDF vectorizers for prompts and answers
prompt_vectorizer = TfidfVectorizer()
answer_vectorizer = TfidfVectorizer()

# Dictionaries to store TF-IDF matrices for prompts and answers separately
prompt_tfidf_matrices = {}
answer_tfidf_matrices = {}

# Dictionaries to store cosine similarity results for prompts and answers separately
prompt_similarity_averages = {}
answer_similarity_averages = {}

for index, conv_group in tqdm(conversations_by_index.items()):
    # Extract separate lists for prompts and answers
    prompts = [pair[0] for pair in conv_group]
    answers = [pair[1] for pair in conv_group]

    # Prompts and answers go through the same pipeline, each with its own
    # vectorizer and result dictionaries.
    for texts, vectorizer, tfidf_matrices, similarity_averages in (
        (prompts, prompt_vectorizer, prompt_tfidf_matrices, prompt_similarity_averages),
        (answers, answer_vectorizer, answer_tfidf_matrices, answer_similarity_averages),
    ):
        tfidf_matrix, avg_similarity = _tfidf_with_average_similarity(texts, vectorizer)
        if tfidf_matrix is not None:
            tfidf_matrices[index] = tfidf_matrix
        if avg_similarity is not None:
            similarity_averages[index] = avg_similarity

# Output average similarity results for each index for prompts and answers
print("Average Cosine Similarity Scores by Prompt Index:")
for idx, avg_sim in prompt_similarity_averages.items():
    print(f"Prompt Index {idx}: {avg_sim:.4f}")
print("Average Cosine Similarity Scores by Answer Index:")
for idx, avg_sim in answer_similarity_averages.items():
    print(f"Answer Index {idx}: {avg_sim:.4f}")


In [None]:


# Turn the per-index averages into parallel x / y lists, one pair per series.
prompt_indices = list(prompt_similarity_averages.keys())
prompt_similarities = [prompt_similarity_averages[idx] for idx in prompt_indices]

answer_indices = list(answer_similarity_averages.keys())
answer_similarities = [answer_similarity_averages[idx] for idx in answer_indices]

# Both series share one axes so prompts and answers can be compared directly.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(prompt_indices, prompt_similarities, marker='o', linestyle='-', color='blue', label='Prompts')
ax.plot(answer_indices, answer_similarities, marker='s', linestyle='--', color='red', label='Answers')
ax.set_xlabel('Prompt/Answer Index')
ax.set_ylabel('Average Cosine Similarity')
ax.set_title('Average Cosine Similarity Scores by Index for Prompts and Answers (one to all)')
ax.legend()
ax.grid(True)

fig.tight_layout()

# Show the plot
plt.show()


Conclusion 2: the one-to-all similarity rises with the number of prompts until it hits a large drop, and then rises again with the number of prompts until the end. This could be because developers ask questions that are more similar to the previous ones the further they go in a conversation. The big drop is likely due to a switch in the topic of the prompts.

Sentiment analysis: do the same, but compare the sentiment for each response number.

In [None]:
from textblob import TextBlob
from collections import defaultdict

def analyze_sentiments(data):
    """Average the TextBlob polarity of prompts and answers per turn position.

    Args:
        data: Iterable of source dicts. Each may hold a 'ChatgptSharing' list whose
            items may hold a 'Conversations' list of turns with 'Prompt' and
            'Answer' texts.

    Returns:
        A dict mapping the position of a turn inside its conversation to
        ``{'Average Prompt Sentiment': float, 'Average Answer Sentiment': float}``,
        averaged over every conversation that has a complete turn at that position.
    """
    # position -> [sum of prompt polarities, sum of answer polarities, number of turns]
    # Only running totals are kept; the texts themselves are not needed for the averages.
    totals_by_index = {}

    for entry in tqdm(data):
        for sharing in entry.get('ChatgptSharing', []):
            for idx, conv in enumerate(sharing.get('Conversations', [])):
                # Turns lacking a prompt or an answer cannot be scored; skip them
                # instead of failing with a KeyError.
                if 'Prompt' not in conv or 'Answer' not in conv:
                    continue
                totals = totals_by_index.setdefault(idx, [0.0, 0.0, 0])
                totals[0] += TextBlob(conv['Prompt']).sentiment.polarity
                totals[1] += TextBlob(conv['Answer']).sentiment.polarity
                totals[2] += 1

    return {
        idx: {
            'Average Prompt Sentiment': prompt_total / turn_count,
            'Average Answer Sentiment': answer_total / turn_count,
        }
        for idx, (prompt_total, answer_total, turn_count) in totals_by_index.items()
    }



In [None]:


def graph_sentiments(average_sentiment, type_name):
    """Draw two line charts of average sentiment per conversation index.

    Args:
        average_sentiment: Dict mapping a conversation index to a dict with the keys
            'Average Prompt Sentiment' and 'Average Answer Sentiment'.
        type_name: Label appended to both chart titles.

    The first figure shows the prompt averages, the second the answer averages.
    """
    indices = list(average_sentiment.keys())

    # (title word, sentiment key / legend label, marker, line style, colour);
    # both value lists are extracted before anything is drawn.
    series = [
        (role, key, [average_sentiment[idx][key] for idx in indices], marker, line_style, color)
        for role, key, marker, line_style, color in (
            ('Prompts', 'Average Prompt Sentiment', 'o', '-', 'blue'),
            ('Answers', 'Average Answer Sentiment', 's', '--', 'green'),
        )
    ]

    for role, key, values, marker, line_style, color in series:
        plt.figure(figsize=(12, 6))
        plt.plot(indices, values, label=key, marker=marker, linestyle=line_style, color=color)
        plt.xlabel('Conversation Index')
        plt.ylabel('Average Sentiment Score')
        plt.title(f'Average Sentiment Scores by Conversation Index for {role} for {type_name}')
        plt.legend()
        plt.tight_layout()
        plt.show()


In [None]:
# Group the sources by their 'Type'; sources of any other type are ignored.
items_by_type = {
    'issue': [],
    'discussion': [],
    'commit': [],
    'code file': [],
    'pull request': [],
    'hacker news': [],
}

for item in all_data:
    bucket = items_by_type.get(item['Type'])
    if bucket is not None:
        bucket.append(item)

issue = items_by_type['issue']
discussion = items_by_type['discussion']
commit = items_by_type['commit']
code_file = items_by_type['code file']
pull_request = items_by_type['pull request']
hacker_news = items_by_type['hacker news']

types = [issue, discussion, commit, code_file, pull_request, hacker_news]






In [None]:
# Run the sentiment analysis and draw its charts for the whole dataset and then
# for each source type, in this order.
sentiment_by_label = {}
for label, subset in (
    ('All Data', all_data),
    ('Issue', issue),
    ('Discussion', discussion),
    ('Commit', commit),
    ('Code File', code_file),
    ('Pull Request', pull_request),
    ('Hacker News', hacker_news),
):
    sentiment_by_label[label] = analyze_sentiments(subset)
    graph_sentiments(sentiment_by_label[label], label)

# Keep the per-group results available under their individual names.
all_data_sentiment = sentiment_by_label['All Data']
issue_sentiment = sentiment_by_label['Issue']
discussion_sentiment = sentiment_by_label['Discussion']
commit_sentiment = sentiment_by_label['Commit']
code_file_sentiment = sentiment_by_label['Code File']
pull_request_sentiment = sentiment_by_label['Pull Request']
hacker_news_sentiment = sentiment_by_label['Hacker News']


In [None]:


# SentenceTransformer is used below but was not imported anywhere in the visible
# notebook, so the cell failed on a fresh kernel.
from sentence_transformers import SentenceTransformer

# Load the pre-trained sentence-embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# List to hold all texts for batch processing
texts = []
indices = []

# Collect all texts: each complete turn contributes its prompt followed by its answer
for i, item in enumerate(all_data):
    if 'ChatgptSharing' in item:
        for j, sharing in enumerate(item['ChatgptSharing']):
            if 'Conversations' in sharing:
                for k, conversation in enumerate(sharing['Conversations']):
                    if 'Prompt' in conversation and 'Answer' in conversation:
                        indices.append((i, j, k))  # Store index to know where to put back embeddings
                        texts.append(conversation['Prompt'])
                        texts.append(conversation['Answer'])

# Encode all texts at once in batches
embeddings = model.encode(texts, convert_to_tensor=True, batch_size=32)  # Adjust batch size based on your system capabilities

# One host-side NumPy copy of all embeddings: scikit-learn's cosine_similarity
# works on arrays, and a tensor living on the GPU cannot be passed to it directly.
embedding_rows = embeddings.cpu().numpy()

# # Optionally, if you need to free up GPU memory:
# if torch.cuda.is_available():
#     torch.cuda.empty_cache()

# Must exist before the loop appends to it.
similarities = []

# Split embeddings back into individual arrays and update original data structure
for idx, (i, j, k) in enumerate(indices):
    prompt_idx = 2 * idx  # Even index
    answer_idx = 2 * idx + 1  # Odd index
    conversation = all_data[i]['ChatgptSharing'][j]['Conversations'][k]
    conversation['Prompt Embedding'] = embedding_rows[prompt_idx]
    conversation['Answer Embedding'] = embedding_rows[answer_idx]

    # Calculate cosine similarity between current and previous prompt, but only
    # when the previous prompt belongs to the same conversation; the first prompt
    # of every conversation has no predecessor.
    if idx > 0 and indices[idx - 1][:2] == (i, j):
        previous_prompt_idx = 2 * (idx - 1)
        similarity = cosine_similarity(
            embedding_rows[previous_prompt_idx].reshape(1, -1),
            embedding_rows[prompt_idx].reshape(1, -1),
        )
        similarities.append(similarity.item())
    else:
        similarities.append(0)  # No similarity for the first prompt

# Plotting the similarities
plt.figure(figsize=(10, 5))
plt.plot(similarities, marker='o', linestyle='-', color='b')
plt.title('Cosine Similarity between Successive Prompts')
plt.xlabel('Prompt Index')
plt.ylabel('Cosine Similarity')
plt.grid(True)
plt.show()




In [None]:
i, j = 0, 0  # Position of the source (i) and of its ChatGPT sharing entry (j) to inspect

# Turns of the selected conversation
conversation_data = all_data[i]['ChatgptSharing'][j]['Conversations']


def _prompt_vector(turn):
    """Return the stored prompt embedding of ``turn`` as a 1 x dim tensor."""
    return torch.tensor(turn['Prompt Embedding']).unsqueeze(0)


# Similarity of every prompt with the prompt right before it; the leading 0 stands
# for the first prompt, which has no previous prompt to compare with.
similarities = [0] + [
    cosine_similarity(_prompt_vector(conversation_data[k]), _prompt_vector(conversation_data[k - 1])).item()
    for k in range(1, len(conversation_data))
]

# Plotting the similarities
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(similarities, marker='o', linestyle='-', color='b')
ax.set_title('Cosine Similarity between Successive Prompts in a Conversation')
ax.set_xlabel('Prompt Index')
ax.set_ylabel('Cosine Similarity')
ax.grid(True)
plt.show()

Conclusion 3: taking all these artifacts together, we observe that a combination of similarity (correlation) and sentiment analysis is needed to determine whether the developer is satisfied. In general, we would expect a low similarity to the previous prompt together with a high sentiment value to indicate a satisfied developer, since they are likely starting an entirely new topic. At the end of a conversation, it is important to check externally whether the question was answered correctly, which is outside the scope of our research question.

## RQ3 Temporal Change in ChatGPT Interest

In [None]:
# Work on a copy: `df = time_df` would only alias the frame, so assigning
# df.columns would silently rewrite time_df's column labels as well.
df = time_df.copy()

df.columns = pd.to_datetime(df.columns)

# Chronological column order.
df = df.sort_index(axis=1)
# df = df.loc[:, df.columns.year >= 2022]

# Fixed colour per source type.
category_colors = {
    'hacker news': 'red',
    'pull request': 'blue',
    'issue': 'green',
    'discussion': 'orange',
    'commit': 'purple',
    'code file': 'brown'
}

# Plot scatter plot: one series per source type (row of df)
fig, ax = plt.subplots(figsize=(10, 6))
for category in df.index:
    color = category_colors.get(category, 'black')  # Default to black if color not defined
    ax.scatter(df.columns, df.loc[category], label=category, color=color)

    # Add lines connecting points
    ax.plot(df.columns, df.loc[category], linestyle='-', alpha=0.5, color=color)

# Add legend and labels
ax.legend()
ax.set_xlabel('Date')
ax.set_ylabel('Counts')
ax.set_title('Counts of Categories Over Time')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

### Temporal Analysis

From Conclusion 1 of RQ1, we observed that the interest in the different categories of ChatGPT usage changes over time. In this section, we try to test the statistical significance of these changes.
To do so, we compute the rate of change over time.

In [None]:
# Month/category combinations without any record count as zero. The frame is
# transposed for the fill and transposed back, so rows stay categories and
# columns stay months.
roc_df = df.transpose().fillna(0).transpose()

# Percentage change from one month column to the next, per category row.
# Note: a month that follows a zero count yields inf (or NaN for 0 -> 0).
rate_of_change = roc_df.pct_change(axis=1) * 100

print(df)
print("\nRate of Change:")
print(rate_of_change)

### Conclusion 1

The rate of change in the discussion of ChatGPT shows a massive jump in interest after the release of ChatGPT. However, there is also a sharp decrease right afterward, although the table shows that the initial increase in interest carries over into other categories over time. The interest in discussion (hacker news, discussion, etc.) is very high in the first two months and drops very quickly after the initial hype. Meanwhile, more PRs and issues are created using ChatGPT, showing that although the interest of the general community has decreased, many developers do attempt to adopt ChatGPT as part of their programming toolkit or for problem solving. However, the number of commits is heavily reduced. This might suggest that users are less confident using ChatGPT to write code, or it might be because developers use code from ChatGPT very casually without referencing the ChatGPT conversation in the commit. Therefore, the commit statistics are a lot less valuable.