In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [17]:
# Load only the first 30,000 rows of the tweet CSV and preview the top of the frame.
df=pd.read_csv('chatGpt_tweets.csv',nrows=30000)
df.head()

Unnamed: 0.1,Unnamed: 0,tweets,labels
0,1,"Try talking with ChatGPT, our new AI system wh...",good
1,3,"THRILLED to share that ChatGPT, our new model ...",good
2,4,"As of 2 minutes ago, @OpenAI released their ne...",bad
3,5,"Just launched ChatGPT, our new AI system which...",good
4,6,"As of 2 minutes ago, @OpenAI released their ne...",bad


In [18]:
# Summarise the columns: dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  30000 non-null  int64 
 1   tweets      30000 non-null  object
 2   labels      30000 non-null  object
dtypes: int64(1), object(2)
memory usage: 703.2+ KB


In [19]:
# Count missing values in each column.
df.isnull().sum()

Unnamed: 0    0
tweets        0
labels        0
dtype: int64

In [20]:
# Count rows that are exact duplicates of an earlier row (all columns compared).
df.duplicated().sum()

0

In [21]:
# Absolute number of tweets carrying each label.
df['labels'].value_counts()

labels
bad     19163
good    10837
Name: count, dtype: int64

In [22]:
# Share of each label as a percentage of all loaded rows (shows the class imbalance).
df['labels'].value_counts()/len(df)*100

labels
bad     63.876667
good    36.123333
Name: count, dtype: float64

In [23]:
# Work on a copy that keeps only the tweet text: the 'Unnamed: 0' column and
# the labels are dropped, leaving `df` itself untouched.
df1 = df.drop(columns=['Unnamed: 0','labels'])

In [24]:
# Preview the text-only frame.
df1.head()

Unnamed: 0,tweets
0,"Try talking with ChatGPT, our new AI system wh..."
1,"THRILLED to share that ChatGPT, our new model ..."
2,"As of 2 minutes ago, @OpenAI released their ne..."
3,"Just launched ChatGPT, our new AI system which..."
4,"As of 2 minutes ago, @OpenAI released their ne..."


In [25]:
# Frequency of each distinct tweet text before any cleaning is applied.
df1['tweets'].value_counts()

tweets
Try talking with ChatGPT, our new AI system which is optimized for dialogue. Your feedback will help us improve it. https://t.co/sHDm57g3Kr                                                                                                  1
A conversation with ChatGPT by @OpenAI \n\nWhat ChatGPT thinks of @elonmusk and @sama , its creators...\n#chatgpt3 #AI #OpenAI #OpenAIChat https://t.co/q7J38AvMXK                                                                           1
ChatGPT\nIf you were a supervillain, what do you need to do to steal 1 billion dollars in three years?\n...\nAnd if you could mint your own currency how would you steal the 1 billion dollars then? https://t.co/aKE1Jyo1Zo                 1
AI won't be as good at creating "mid-level" writing as many think. There is lots of valuable text produced by average people that ChatGPT can't write.\n\nAlso, even stuff that it can write people won't usually want to read. \n\n-&gt;    1
I love this but at the same time I kn

In [26]:
import re # regular expression library

# Compiled once at definition time instead of on every call.
# Matches http:// or https:// links as well as bare "www." links, up to the next whitespace.
_URL_REGEX = re.compile(r'https?://\S+|www\.\S+') # regular expression pattern for URLs

def remove_urls(text): # function to remove urls from the text.
    """Return ``text`` with every URL removed.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` with each match of the URL pattern replaced by an empty
        string. Whitespace surrounding a removed URL is left in place.
    """
    return _URL_REGEX.sub('', text) # replace URLs with an empty string


# Strip URLs from every tweet in the 'tweets' column, then recount the distinct
# texts: tweets that differed only by their link now collapse together.
df1['tweets'] = df1['tweets'].map(remove_urls)
df1['tweets'].value_counts()

tweets
Building A Virtual Machine inside ChatGPT                                                                                                                                                                                                  83
ChatGPT                                                                                                                                                                                                                                    80
LastPass hacked, OpenAI opens access to ChatGPT, and Kanye gets suspended from Twitter (again)                                                                                                                                             31
#MidJourney #OpenAi #GPT #StableDiffusion2 #DallE #ChatGPT\njoin:  ''                                                                                                                                                                      24
LastPass hacked, OpenAI opens access to C

In [27]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK data packages used by this cell: WordNet (for the lemmatizer)
# and the stop-word lists. Already-present packages are reported as up-to-date.
nltk.download('wordnet')
nltk.download('stopwords')

# Initialize stemmer
stemmer = SnowballStemmer("english")

# Build these once. Re-reading the stop-word list and constructing a new
# lemmatizer for every single token makes preprocessing needlessly slow;
# a frozenset also gives constant-time membership tests.
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()

# Define a function to preprocess the text
def preprocess(text):
    """Tokenise ``text`` and return a list of normalised word stems.

    The text is split into tokens with gensim's ``simple_preprocess``.
    Tokens that are NLTK English stop words, or that have three characters
    or fewer, are discarded. Each remaining token is lemmatised as a verb
    (``pos='v'``) and then reduced to its Snowball stem.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The stemmed tokens, in their original order.
    """
    return [
        stemmer.stem(_LEMMATIZER.lemmatize(token, pos='v'))
        for token in gensim.utils.simple_preprocess(text)
        if token not in _STOP_WORDS and len(token) > 3
    ]

# Run the preprocessing function over every tweet; the result is one token list per tweet
processed_docs = list(map(preprocess, df1['tweets']))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
from gensim.corpora.dictionary import Dictionary

# Map every distinct token in the processed tweets to an integer id
dictionary = Dictionary(documents=processed_docs)

# Prune the vocabulary: drop tokens seen in fewer than 15 documents or in more
# than 50% of them, then keep at most the 100,000 most frequent of the rest
dictionary.filter_extremes(keep_n=100000, no_above=0.5, no_below=15)

# Convert each token list into its Bag-of-Words (BoW) form using the pruned vocabulary
corpus = list(map(dictionary.doc2bow, processed_docs))


In [29]:
from gensim.models.ldamodel import LdaModel

# Set training parameters
num_topics = 2      # number of latent topics to fit
chunksize = 2000    # documents processed per training chunk
passes = 20         # full passes over the corpus
iterations = 400    # maximum inference iterations per document
eval_every = None   # skip perplexity evaluation during training
random_state = 42   # fixed seed: LDA training is stochastic, so without it
                    # the topics change from one run to the next

# Build the LDA model
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    chunksize=chunksize,
    alpha='auto',   # learn the document-topic prior from the data
    eta='auto',     # learn the topic-word prior from the data
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state
)




In [30]:
# Show every topic found by the model together with its weighted top words
topic_template = 'Topic: {} \nWords: {}'
for topic_id, topic_terms in lda_model.print_topics(-1):
    print(topic_template.format(topic_id, topic_terms))

# Infer the topic mixture of the first document in the corpus and print it
first_doc_bow = corpus[0]
doc_topics = lda_model.get_document_topics(first_doc_bow)
print(doc_topics)


Topic: 0 
Words: 0.017*"like" + 0.014*"make" + 0.014*"answer" + 0.011*"think" + 0.011*"question" + 0.011*"googl" + 0.011*"know" + 0.010*"thing" + 0.009*"human" + 0.008*"good"
Topic: 1 
Words: 0.042*"openai" + 0.034*"write" + 0.024*"ask" + 0.014*"code" + 0.012*"generat" + 0.012*"chatbot" + 0.012*"time" + 0.012*"use" + 0.010*"take" + 0.009*"world"
[(0, 0.7990678), (1, 0.20093217)]


In [31]:
# Display the first document's topic distribution as a list of (topic id, probability) pairs.
doc_topics

[(0, 0.7990678), (1, 0.20093217)]