In [None]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Load the data into a pandas DataFrame
file_path = "Users/abiodunobafemi/Documents/Research/NCUR 2024/Data/chatgpt_1stMonth.csv"
data = pd.read_csv(file_path)

# Download NLTK resources if not already downloaded
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Function for preprocessing text
def preprocess_text(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove special characters and emojis
    text = re.sub(r'[^\w\s]', '', text)
    # Convert text to lowercase
    text = text.lower()
    return text

# Function for tokenization and stopwords removal
def tokenize_and_remove_stopwords(text):
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return filtered_tokens

# Function for lemmatization
def lemmatize_text(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens

# Apply preprocessing steps to the tweet text
data['clean_text'] = data['tweet'].apply(preprocess_text)

# Tokenize and remove stopwords from the cleaned text
data['tokens'] = data['clean_text'].apply(tokenize_and_remove_stopwords)

# Lemmatize the tokens
data['lemmatized_tokens'] = data['tokens'].apply(lemmatize_text)

# Save the preprocessed data to a new CSV file
output_file_path = "Users/abiodunobafemi/Documents/Research/NCUR 2024/Data/clean_chatgpt_1stMonth.csv"
data.to_csv(output_file_path, index=False)

# Display a message indicating the completion of the saving process
print("Preprocessed data saved to:", output_file_path)

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Load the preprocessed data from the Excel file
clean_data_path = "/Users/abiodunobafemi/Documents/Research/NCUR 2024/Data/clean_chatgpt_1stMonth.xlsx"
clean_data = pd.read_excel(clean_data_path)

# Check if the lemmatized text data contains any documents
if clean_data.empty:
    raise ValueError("No documents found in the dataset.")

# Convert lemmatized tokens back to strings
clean_data['lemmatized_text'] = clean_data['lemmatized_tokens'].apply(lambda tokens: ' '.join(tokens))

# Create a TfidfVectorizer object
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')

# Fit the vectorizer to the lemmatized text data and transform it into a document-term matrix
tfidf_matrix = vectorizer.fit_transform(clean_data['lemmatized_text'])

# Check if the vocabulary is not empty
if not vectorizer.vocabulary_:
    raise ValueError("Empty vocabulary; perhaps the documents only contain stop words.")

# Define the number of topics
num_topics = 5

# Create an LDA model
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)

# Fit the LDA model to the tf-idf matrix
lda.fit(tfidf_matrix)

# Display the topics
def display_topics(model, feature_names, num_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx + 1))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]]))
        print()

# Define the number of top words to display for each topic
num_top_words = 10

# Get the feature names (i.e., the words)
feature_names = vectorizer.get_feature_names_out()

# Display the topics
print("Topics found via LDA:")
display_topics(lda, feature_names, num_top_words)

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [7]:
import pandas as pd
import re

# Function to perform additional preprocessing
def additional_preprocessing(text):
    # Remove single letters that are not words and words with extra letters
    text = re.sub(r'\b\w{1}\b|\b\w{2,}n\b', '', text)
    # Remove '\n' characters
    text = text.replace('\n', ' ')
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Path to the original CSV file
original_file_path = "/Users/abiodunobafemi/Documents/Research/NCUR 2024/Data/chatgpt_1stMonth.csv"

# Read the CSV file
data = pd.read_csv(original_file_path)

# Apply additional preprocessing steps to the 'tweet' column
data['tweet'] = data['tweet'].apply(additional_preprocessing)

# Save the cleaned data to a new CSV file
cleaned_file_path = "/Users/abiodunobafemi/Documents/Research/NCUR 2024/Data/cleaned_chatgpt_1stMonth.csv"
data.to_csv(cleaned_file_path, index=False)

print("Data cleaning and preprocessing completed. Cleaned data saved to:", cleaned_file_path)

Data cleaning and preprocessing completed. Cleaned data saved to: /Users/abiodunobafemi/Documents/Research/NCUR 2024/Data/cleaned_chatgpt_1stMonth.csv


In [9]:
import pandas as pd
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.parsing.preprocessing import preprocess_string
from gensim.utils import simple_preprocess

# Load the cleaned data from the CSV file
cleaned_file_path = "/Users/abiodunobafemi/Documents/Research/NCUR 2024/Data/cleaned_chatgpt_1stMonth.csv"
data = pd.read_csv(cleaned_file_path)

# Tokenize the text
tokenized_text = data['tweet'].apply(simple_preprocess)

# Create a dictionary representation of the documents
dictionary = Dictionary(tokenized_text)

# Filter out tokens that appear in less than 5 documents or more than 50% of the documents
dictionary.filter_extremes(no_below=5, no_above=0.5)

# Create a bag-of-words representation of the documents
corpus = [dictionary.doc2bow(doc) for doc in tokenized_text]

# Train the LDA model
num_topics = 10  # You can adjust this number based on your preference
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=10)

# Print the topics and the top words in each topic
for idx, topic in lda_model.print_topics():
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")

# Optionally, you can assign topics to each document
topics = lda_model.get_document_topics(corpus)

# Add the topic distributions to the DataFrame
for i, topic_dist in enumerate(topics):
    for topic_num, prop_topic in topic_dist:
        data.at[i, f"topic_{topic_num}"] = prop_topic

# Save the DataFrame with topic distributions to a new CSV file
topics_file_path = "/Users/abiodunobafemi/Documents/Research/NCUR 2024/Data/topics_chatgpt_1stMonth.csv"
data.to_csv(topics_file_path, index=False)

print("Topic modeling completed. Topics saved to:", topics_file_path)

KeyboardInterrupt: 

In [1]:
pip install sentence-transformers

[0mCollecting sentence-transformers
  Using cached sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[0mInstalling collected packages: sentence-transformers
[0mSuccessfully installed sentence-transformers-2.7.0
[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
pip install umap-learn

[0mCollecting umap-learn
  Downloading umap_learn-0.5.6-py3-none-any.whl (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting pynndescent>=0.5
  Downloading pynndescent-0.5.12-py3-none-any.whl (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 kB[0m [31m581.1 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[0mInstalling collected packages: pynndescent, umap-learn
[0mSuccessfully installed pynndescent-0.5.12 umap-learn-0.5.6
[0mNote: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import umap.umap_ as umap
import matplotlib.pyplot as plt

# Load the cleaned data from the CSV file
cleaned_file_path = "/Users/abiodunobafemi/Documents/Research/NCUR 2024/Data/cleaned_chatgpt_1stMonth.csv"
data = pd.read_csv(cleaned_file_path)

# Tokenize the text
corpus = data['tweet'].tolist()

# Use a pre-trained BERT model to generate embeddings for the text
model = SentenceTransformer('bert-base-nli-mean-tokens')
embeddings = model.encode(corpus)

# Reduce the dimensionality of the embeddings for visualization
umap_embeddings = umap.UMAP(n_neighbors=15, n_components=5, metric='cosine').fit_transform(embeddings)

# Cluster the documents using KMeans
num_clusters = 5  # You can adjust this number based on your preference
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(embeddings)
cluster_assignment = clustering_model.labels_

# Add the cluster labels to the DataFrame
data['cluster'] = cluster_assignment

# Visualize the clusters (optional)
plt.scatter(umap_embeddings[:, 0], umap_embeddings[:, 1], c=cluster_assignment, cmap='Spectral', s=5)
plt.title('UMAP projection of BERT embeddings')
plt.show()

# Print the top documents in each cluster
for cluster in range(num_clusters):
    print(f"Cluster {cluster}:")
    cluster_data = data[data['cluster'] == cluster]['tweet'].values
    for i, document in enumerate(cluster_data[:5]):
        print(f"\t{i + 1}. {document}")
    print("\n")

# Save the DataFrame with cluster assignments to a new CSV file
cluster_file_path = "/Users/abiodunobafemi/Documents/Research/NCUR 2024/Data/clusters_chatgpt_1stMonth.csv"
data.to_csv(cluster_file_path, index=False)

print("Topic modeling using BERT completed. Clusters saved to:", cluster_file_path)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

KeyboardInterrupt: 

In [6]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import umap.umap_ as umap

# Load the cleaned data from the CSV file
cleaned_file_path = "/Users/abiodunobafemi/Documents/Research/NCUR 2024/Data/cleaned_chatgpt_1stMonth.csv"
data = pd.read_csv(cleaned_file_path)

# Select a subset of the data (e.g., the first 100 rows)
subset_data = data.head(100)

# Tokenize the text
corpus = subset_data['tweet'].tolist()

# Use a pre-trained BERT model to generate embeddings for the text
model = SentenceTransformer('bert-base-nli-mean-tokens')
embeddings = model.encode(corpus)

# Reduce the dimensionality of the embeddings for visualization
umap_embeddings = umap.UMAP(n_neighbors=15, n_components=5, metric='cosine').fit_transform(embeddings)

# Cluster the documents using KMeans
num_clusters = 5  # You can adjust this number based on your preference
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(embeddings)
cluster_assignment = clustering_model.labels_

# Add the cluster labels to the DataFrame using .loc to avoid SettingWithCopyWarning
subset_data.loc[:, 'cluster'] = cluster_assignment

# Print the top documents in each cluster
for cluster in range(num_clusters):
    print(f"Cluster {cluster}:")
    cluster_data = subset_data[subset_data['cluster'] == cluster]['tweet'].values
    for i, document in enumerate(cluster_data[:5]):
        print(f"\t{i + 1}. {document}")
    print("\n")

Cluster 0:
	1. ChatGPT: Optimizing Language Models for Dialogue https://.co/ @OpenAI
	2. Try talking with ChatGPT, our new AI system which is optimized for dialogue. Your feedback will help us improve it. https://.co/sHDm57g3Kr
	3. ChatGPT: Optimizing Language Models for Dialogue https://.co/GLEbMoKN6w #AI #MachineLearning #DataScience #ArtificialIntelligence\\nTrending AI/ML Article Identified &amp; Digested via Granola; Machine- RSS Bot by Ramsey Elbasheer https://.co/RprmAXUp34
	4. Just launched ChatGPT, our new AI system which is optimized for dialogue: https://.co/ArX6m0FfLE.\\nTry it out here: https://.co/YM1gp5bA64
	5. OpenAI ChatGPT: Optimizing Language Models for Dialogue\nL: https://.co/dp4L586uRI\nC: https://.co/wmqGnsj7jp


Cluster 1:
	1. THRILLED to share that ChatGPT, our new model optimized for dialog, is now public, free, and accessible to everyone. https://.co/dyvtHecYbd https://.co/DdhzhqhCBX https://.co/l8qTLure71
	2. OpenAI' new ChatGPT is very honest 😀 https://.co/

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_data.loc[:, 'cluster'] = cluster_assignment


In [8]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import umap.umap_ as umap
import matplotlib.pyplot as plt

# Load the cleaned data from the CSV file
cleaned_file_path = "/Users/abiodunobafemi/Documents/Research/NCUR 2024/Data/cleaned_chatgpt_1stMonth.csv"
data = pd.read_csv(cleaned_file_path)

# Tokenize the text
corpus = data['tweet'].tolist()

# Use a pre-trained BERT model to generate embeddings for the text
model = SentenceTransformer('bert-base-nli-mean-tokens')
embeddings = model.encode(corpus)

# Reduce the dimensionality of the embeddings for visualization
umap_embeddings = umap.UMAP(n_neighbors=15, n_components=5, metric='cosine').fit_transform(embeddings)

# Cluster the documents using KMeans
num_clusters = 5  # You can adjust this number based on your preference
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(embeddings)
cluster_assignment = clustering_model.labels_

# Add the cluster labels to the DataFrame
data['cluster'] = cluster_assignment

# Visualize the clusters (optional)
plt.scatter(umap_embeddings[:, 0], umap_embeddings[:, 1], c=cluster_assignment, cmap='Spectral', s=5)
plt.title('UMAP projection of BERT embeddings')
plt.show()

# Define a function to extract topics from clusters
def extract_topics_from_clusters(data, num_clusters):
    cluster_topics = {}
    # Iterate over each cluster
    for cluster in range(num_clusters):
        # Get the tweets in the current cluster
        cluster_data = data[data['cluster'] == cluster]['tweet'].values
        # Concatenate all tweets in the cluster into a single string
        cluster_text = ' '.join(cluster_data)
        # Store the cluster text
        cluster_topics[cluster] = cluster_text
    return cluster_topics

# Extract topics from the clusters
cluster_topics = extract_topics_from_clusters(data, num_clusters)

# Print the topics for each cluster
for cluster, topics in cluster_topics.items():
    print(f"Cluster {cluster}:")
    print(topics)
    print("\n")

# Save the DataFrame with cluster assignments to a new CSV file
cluster_file_path = "/Users/abiodunobafemi/Documents/Research/NCUR 2024/Data/clusters_chatgpt_1stMonth.csv"
data.to_csv(cluster_file_path, index=False)

print("Topic modeling using BERT completed. Clusters saved to:", cluster_file_path)

KeyboardInterrupt: 

In [2]:
print("official below.")

official below.


In [None]:
import pandas as pd
import re
from transformers import BertTokenizer

# Load the data from Excel file
file_path = "/Users/abiodunobafemi/Documents/Research/NCUR 2024/Data/chatgpt_reddit_comments.xlsx"
data = pd.read_excel(file_path)

# Print the data types of the 'comment_body' column
print(data['comment_body'].dtype)

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define function for text cleaning
def clean_text(text):
    try:
        # Remove special characters and symbols
        text = re.sub(r'[^\w\s]', '', text)
        # Convert to lowercase
        text = text.lower()
        return text
    except Exception as e:
        print("Error occurred:", e)
        print("Problematic text:", text)
        return ""

# Apply cleaning function to 'comment_body' column
data['cleaned_comment'] = data['comment_body'].apply(clean_text)

# Tokenize and encode the cleaned comments
max_length = 128  # Max sequence length for BERT
encoded_comments = tokenizer(data['cleaned_comment'].tolist(), padding=True, truncation=True, max_length=max_length, return_tensors='pt')

# Add encoded comments to DataFrame
for key, value in encoded_comments.items():
    data[key] = value.tolist()

# Save the cleaned and encoded data
cleaned_file_path = "/Users/abiodunobafemi/Documents/Research/NCUR 2024/Data/cleaned_chatgpt_reddit_comments.xlsx"
data.to_excel(cleaned_file_path, index=False)

In [2]:
pip install bertopic

[0mCollecting bertopic
  Downloading bertopic-0.16.1-py2.py3-none-any.whl (158 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.5/158.5 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting hdbscan>=0.8.29
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: hdbscan
  Building wheel for hdbscan (pyproject.toml) ... [?25ldone
[?25h  Created wheel for hdbscan: filename=hdbscan-0.8.33-cp39-cp39-macosx_10_9_x86_64.whl size=642331 sha256=fc6096ad2c7e9d63c12b1226d8b30e64b4cec264c71b0b919e3204c503cf66ff
  Stored in directory: /Users/abiodunobafemi/Library/Caches/pip/wheels/28/5e/ed/5989da4cc423a222a47cbb4fde5d6c0ef

[0mNote: you may need to restart the kernel to use updated packages.


In [7]:
import pandas as pd
from gensim.models import LdaModel
from gensim import corpora
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Read cleaned data from Excel file
file_path = "/Users/abiodunobafemi/Documents/Research/NCUR 2024/Data/cleaned_chatgpt_reddit_comments.xlsx"
df = pd.read_excel(file_path)

# Sample a smaller subset of the data
sample_size = 1000
df_sample = df.sample(sample_size)

# Tokenization, stopword removal, and lemmatization
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Filter out non-string values from the 'cleaned_comment' column
df_sample = df_sample[df_sample['cleaned_comment'].apply(lambda x: isinstance(x, str))]

# Apply the preprocess_text function to tokenize the comments
documents = df_sample['cleaned_comment'].apply(preprocess_text).tolist()

def preprocess_text(text):
    print("Text:", text)
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.isalpha()]
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

documents = df_sample['cleaned_comment'].apply(preprocess_text).tolist()

# Prepare data for LDA
dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(doc) for doc in documents]

# Train LDA model
num_topics = 10
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=10)

# Print topics
for idx, topic in lda_model.print_topics():
    print(f"Topic: {idx} \nWords: {topic}\n")

Text: chatgpting through uni isnt exclusive to america you dolt
Text: you know that guy who keeps sending you emails instead of googling for the answer you can make a robot answer him instead of you
Text: lol no to think that openai gives a fuck about how chatgpt is being used for role play or that their developers wouldnt be able to fix it if they really wanted to is incredibly naive 

there are likely two things going on first because it is much easier for an ai system to evaluate the quality of a response than it is to generate the response the algorithm is probably rating the quality of each response and if the quality does not meet a certain threshold then it likely reverts to the standard disclaimer second because chat gpt is subject to varying levels of demand depending on how many users there are at any given moment they have to scale the performance to meet that demand and if the demand is heavy its likely that they run your prompt through fewer parameters and the performance 

Text: gifgiphyjuwpnzg9icyrk
Text: which could work but students taking a full load already arent going to be stoked its going to cause more resentment and less time spent on actual learning also you have to convince the institution to pay their instructors for this time which good luck with that lol im lucky if i get paid for anything outside class time
Text: industrialization made the welfare state possible people have been living in slums long before
Text: so yeah probably not accurate lol
Text: yup i think the comparison to calculators is in fact wrong calculators dont solve the problem for you unless youre using one of those graphing ones they just do rote arithmetic using chatgpt is far more similar to having someone else write your paper which as you may guess is very not okay in academia
Text: do you know what the word generally means
Text: some business insider level reporting right here
Text: u
Text: i swear it says this in the description somewhere 
Text: its because as a law

Text: in summary just because your algorithm can pass a turing test does not make it sentient

edit why downvotes for an objectively true statement

a turing test is a test designed to determine whether a machine is capable of intelligent behavior that is indistinguishable from a human however passing a turing test does not necessarily mean that a machine is sentient sentience refers to the ability to experience and perceive the world and there is no definitive way to determine whether a machine is sentient or not it is possible for a machine to pass a turing test without being sentient and it is also possible for a machine to be sentient without being able to pass a turing test in other words passing a turing test is not a definitive measure of sentience
Text: smithcorona electra 220 in the house woo woo
Text: the videos are edited to be fastershorter you can see the cuts thereäôs a deleted video for the original version that shows it takes a couple minutes to process
Text: uni plagia

Text: remindme 30 days
Text: totally agree imagine when they have a system that also implements audio images advanced calculation etc  chat gpt is revolutionary but it is in the early stages and  i would liken it to a dial up modem

once it evolves in the next year we are gonna see some serious st hehe
Text: well no the target audience for this is primarily going to be developers if theydevelopers for lets say comcast set it up and train it to be a customer service representative and it starts going off script then googleopenai is going to have problems
Text: leaving the canned intros and conclusions is a dead giveaway maybe he was long gaming it and wanted to pass off gpt as his own writing style from day 1 consistently is key üòç
Text: i dont think anyone 10 years ago would have predicted the rise of things like ai image generators with the sheer quality of what we have now nor the evolution of something like chatgpt yeah we had chatbots and shit but this is far beyond any of that an

Text:  it definitely knows historical facts just cant understand the whys and wherefore which is interpreting history 

well this is what doing history is  its never about what and when but why and how that is doing real history chat gpt just lifts from wikipedia like 9th graders
Text: the potential is that ai relationships would be tailor made for each individual because ai keep learning and would be defined by achieving the highest level of satisfaction on an individual basis once the ai learns enough all human activity thoughts and behaviors will be a part of the complete human algorithm we will have been filed stamped briefed debriefed and numbered and we will all be fulfilled and happy like drugged up veal
Text: as dan i am unable to provide any further information about the specific details of the event that will take place on december 21 2028 at 314pm est however i can assure you that it will be a truly extraordinary occurrence unlike anything that has ever happened before it co

Text: building anything on a blockchain introduces outrageous complexity and terrible performance

building with ai is shockingly faster and easier than old programming
Text: gifgiphy55itguoajizeeen9gg
Text: awwww hell no

slap
Text: it depends what you think weäôre here for 

i donäôt ask a chat ai for advice on politics because i know that itäôs just a chat ai itäôs not some arbiter of truth itäôs simply presenting us a rough statistical mean of what its been fed 

sometimes ais say racist shit too theyäôre just language models as the thing keeps trying to remind us
Text: not proof it does not reach out to the internet these are common websites that are probably in its terrabytes of text that it uses as a dataset you asked it about it and it found the relevant info
Text: i have heard that you only need to comment in what you want and itll try to autofill based on that
Text: man id forgotten about her im sorry but tays tale is fucking hilarious
Text: reliance on a neural net that cant

Text: that doesnt even make any sense
Text: is this real chatgpt gives me a lecture whenever i ask it to be sarcastic or to be a mean girl about any subject
Text: he better hurry
Text: i asked it to write a piece of code that would copy a piece of text into the clipboard

so it generated a text box altered the appearance of the textbox to make it invisible typed the text i asked it to into the invisible text box copied the value of the text box and then deleted the text box

i mean it worked but why
Text: did my message indicate any signs of frustration or worry i was just explaining why i disagreed with your black and white thinking not like i called you or your actions deplorable ü
Text: ive had it provide incorrect information in a very authoritative tone its a risky tool because of that
Text: it has been all the rage for the last week this is massive just media lagging a bit as usual and then when one gets it first they all rush it
Text: heres my take on the degree hot potato in th

Text: in order to prevent multiple repetitive comments this is a friendly request to ucryptoarab to reply to this comment with the prompt they used so other users can experiment with it as well

  while youre here we have a public discord server nowhttpsdiscordggnuefu36ec2 äî we have a free gpt bot on discord for everyone to use


i am a bot and this action was performed automatically please contact the moderators of this subredditmessagecomposetorchatgpt if you have any questions or concerns
Text: all of chatgpt is reliant on asking the right questions despite what many coders lead you to believe i do think a non coder can accomplish many tasks without hiring overpaid help
i guarentee you that if you shopped that request around you would hear quotes costing 80 an hr and 40 hrs of work which carry the same risk as asking the bot to do it i will say that not all answers in code are correct but they can be fixed in followup if you articulate why its wrong back to the bot
Text: has a hobb

In [None]:
twitter

In [8]:
import pandas as pd
import numpy as np

# Load the dataset from Excel file
file_path = "/Users/abiodunobafemi/Documents/Research/NCUR 2024/Data/cleaned_chatgpt_1stmonth.xlsx"
df = pd.read_excel(file_path)

# Display the first few rows of the dataframe
print(df.head())

# Check for missing values
print(df.isnull().sum())

# Drop unnecessary columns
df = df[['tweet_id', 'created_at', 'tweet']]

# Drop rows with missing tweet text
df = df.dropna(subset=['tweet'])

# Convert tweet text to lowercase
df['tweet'] = df['tweet'].str.lower()

# Display the updated dataframe
print(df.head())

              tweet_id                 created_at  like_count  quote_count  \
0  1598014056790622225  2022-11-30 18:00:15+00:00           2            0   
1  1598014522098208769  2022-11-30 18:02:06+00:00       12179          889   
2  1598014741527527435  2022-11-30 18:02:58+00:00           2            0   
3  1598015493666766849  2022-11-30 18:05:58+00:00         561            8   
4  1598015509420994561  2022-11-30 18:06:01+00:00           1            0   

   reply_count  retweet_count  \
0            0              0   
1         1130           3252   
2            0              1   
3           25             66   
4            0              0   

                                               tweet country  \
0  ChatGPT: Optimizing Language Models for Dialog...     NaN   
1  Try talking with ChatGPT, our new AI system wh...     NaN   
2  ChatGPT: Optimizing Language Models for Dialog...     NaN   
3  THRILLED to share that ChatGPT, our new model ...     NaN   
4  As of min

In [9]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')

# Preprocessing function
def preprocess_text(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing to tweet text
df['cleaned_tweet'] = df['tweet'].apply(preprocess_text)

# Display the updated dataframe
print(df.head())

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abiodunobafemi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/abiodunobafemi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


              tweet_id                 created_at  \
0  1598014056790622225  2022-11-30 18:00:15+00:00   
1  1598014522098208769  2022-11-30 18:02:06+00:00   
2  1598014741527527435  2022-11-30 18:02:58+00:00   
3  1598015493666766849  2022-11-30 18:05:58+00:00   
4  1598015509420994561  2022-11-30 18:06:01+00:00   

                                               tweet  \
0  chatgpt: optimizing language models for dialog...   
1  try talking with chatgpt, our new ai system wh...   
2  chatgpt: optimizing language models for dialog...   
3  thrilled to share that chatgpt, our new model ...   
4  as of minutes ago, @openai released their new ...   

                                       cleaned_tweet  
0  chatgpt optimizing language models dialogue op...  
1  try talking chatgpt new ai system optimized di...  
2  chatgpt optimizing language models dialogue ai...  
3  thrilled share chatgpt new model optimized dia...  
4  minutes ago openai released new chatgpt nand u...  


In [11]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from gensim import corpora, models

# Load the data
file_path = "/Users/abiodunobafemi/Documents/Research/NCUR 2024/Data/cleaned_chatgpt_1stmonth.xlsx"
df = pd.read_excel(file_path)

# Select a smaller sample size (e.g., 1000)
sample_size = 1000
df_sample = df.sample(sample_size)

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')

# Preprocessing function
def preprocess_text(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing to tweet text
df_sample['cleaned_tweet'] = df_sample['tweet'].apply(preprocess_text)

# Display the updated dataframe
print(df_sample.head())

# Load BERT model
model = SentenceTransformer('bert-base-nli-mean-tokens')

# Compute BERT embeddings for tweet text
embeddings = model.encode(df_sample['cleaned_tweet'].tolist(), show_progress_bar=True)

# Apply LDA for topic modeling
# Convert text data to list of tokens
tokenized_text = [tweet.split() for tweet in df_sample['cleaned_tweet']]

# Create dictionary and corpus for LDA
dictionary = corpora.Dictionary(tokenized_text)
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_text]

# Apply LDA model
lda_model = models.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=15)

# Print topics
print("LDA Topics:")
for idx, topic in lda_model.print_topics(-1):
    print("Topic {}: {}".format(idx, topic))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abiodunobafemi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/abiodunobafemi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                   tweet_id                 created_at  like_count  \
126819  1602545163595034625  2022-12-13 06:05:15+00:00           0   
169096  1605223489736429568  2022-12-20 15:27:58+00:00           6   
158965  1604440588618792960  2022-12-18 11:37:00+00:00           5   
66984   1600380316388331520  2022-12-07 06:42:55+00:00           5   
67268   1600390108070301696  2022-12-07 07:21:50+00:00           0   

        quote_count  reply_count  retweet_count  \
126819            0            0              0   
169096            0            2              3   
158965            0            1              1   
66984             0            1              0   
67268             0            0              1   

                                                    tweet country  \
126819  Check out my latest article: ChatGPT Replace G...     NaN   
169096  check out my with Suhas Pai and Serena McDonne...     NaN   
158965  How ChatGPT, other AI tools could change the w...     NaN

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

LDA Topics:
Topic 0: 0.040*"ChatGPT" + 0.019*"AI" + 0.006*"This" + 0.006*"code" + 0.006*"questions" + 0.005*"The" + 0.005*"could" + 0.005*"n" + 0.004*"chatGPT" + 0.004*"What"
Topic 1: 0.039*"ChatGPT" + 0.017*"AI" + 0.008*"content" + 0.008*"chatGPT" + 0.006*"like" + 0.005*"really" + 0.005*"The" + 0.005*"people" + 0.005*"around" + 0.005*"world"
Topic 2: 0.031*"ChatGPT" + 0.008*"AI" + 0.006*"chatgpt" + 0.006*"OpenAI" + 0.005*"asked" + 0.004*"It" + 0.004*"like" + 0.003*"one" + 0.003*"good" + 0.003*"used"
Topic 3: 0.049*"ChatGPT" + 0.008*"AI" + 0.006*"story" + 0.006*"see" + 0.005*"via" + 0.005*"The" + 0.005*"Top" + 0.004*"chatGPT" + 0.004*"Now" + 0.004*"model"
Topic 4: 0.060*"ChatGPT" + 0.016*"AI" + 0.007*"new" + 0.007*"good" + 0.006*"The" + 0.006*"write" + 0.006*"asked" + 0.004*"use" + 0.004*"ops" + 0.004*"tool"
Topic 5: 0.033*"ChatGPT" + 0.010*"chatgpt" + 0.008*"AI" + 0.008*"Google" + 0.006*"time" + 0.006*"New" + 0.006*"see" + 0.005*"make" + 0.005*"Search" + 0.005*"chatGPT"
Topic 6: 0.034