In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


In [2]:
df=pd.read_csv('chatGpt_tweets.csv',nrows=20000)
df.head()

Unnamed: 0.1,Unnamed: 0,tweets,labels
0,1,"Try talking with ChatGPT, our new AI system wh...",good
1,3,"THRILLED to share that ChatGPT, our new model ...",good
2,4,"As of 2 minutes ago, @OpenAI released their ne...",bad
3,5,"Just launched ChatGPT, our new AI system which...",good
4,6,"As of 2 minutes ago, @OpenAI released their ne...",bad


In [3]:
df.shape

(20000, 3)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  20000 non-null  int64 
 1   tweets      20000 non-null  object
 2   labels      20000 non-null  object
dtypes: int64(1), object(2)
memory usage: 468.9+ KB


In [5]:
df.isnull().sum()

Unnamed: 0    0
tweets        0
labels        0
dtype: int64

In [6]:
df.duplicated().sum()

0

In [7]:
df.shape

(20000, 3)

In [8]:
df['labels'].value_counts()

labels
bad     12697
good     7303
Name: count, dtype: int64

In [10]:
df['labels'].value_counts()/len(df)*100

labels
bad     63.485
good    36.515
Name: count, dtype: float64

In [11]:

x = df['tweets']
y = df['labels']


In [12]:
df1=pd.DataFrame(x)
df1.head()

Unnamed: 0,tweets
0,"Try talking with ChatGPT, our new AI system wh..."
1,"THRILLED to share that ChatGPT, our new model ..."
2,"As of 2 minutes ago, @OpenAI released their ne..."
3,"Just launched ChatGPT, our new AI system which..."
4,"As of 2 minutes ago, @OpenAI released their ne..."


In [13]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweets  20000 non-null  object
dtypes: object(1)
memory usage: 156.4+ KB


In [14]:
df1['tweets'].value_counts()

tweets
Try talking with ChatGPT, our new AI system which is optimized for dialogue. Your feedback will help us improve it. https://t.co/sHDm57g3Kr                                                                                                                                                          1
An important point in peace between Israel and Palestine within the framework of one or two countries, according to OpenAI. Can we call this a logical, impartial proposal? #ChatGPT #OpenAI https://t.co/lUE1t8IrvE                                                                                 1
"In music, less is more. Simplicity can be the key to unlocking greatness" - chatGPT\n\nprompt: write a one-liner in the style of Rick Ruben\n\nNaild it!                                                                                                                                            1
Just ask any question to @OpenAI ChatGPT, and you will feel like talking to a person. This is a great enhanc

In [15]:
import re # regular expression library

def remove_urls(text): # function to remove urls from the text.
    url_regex = re.compile(r'https?://\S+|www\.\S+') # regular expression pattern for URLs
    return url_regex.sub('', text) # replace URLs with an empty string


df1['tweets'] = [remove_urls(review) for review in df1['tweets']] # apply the function to the 'tweets' column of the DataFrame


In [16]:
df1['tweets'].value_counts()

tweets
ChatGPT                                                                                                                                                                                                                                                                           63
Building A Virtual Machine inside ChatGPT                                                                                                                                                                                                                                         45
LastPass hacked, OpenAI opens access to ChatGPT, and Kanye gets suspended from Twitter (again)                                                                                                                                                                                    30
LastPass hacked, OpenAI opens access to ChatGPT, and Kanye gets suspended from Twitter (again) • TechCrunch                                                       

In [17]:
from nltk.stem.porter import PorterStemmer
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# Create an instance of the PorterStemmer class
ps = PorterStemmer()
corpus = []

# Preprocess the tweets
for tweet in df1['tweets']:
    review = re.sub('[^a-zA-Z]', ' ', tweet)
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
from tensorflow.keras.preprocessing.text import one_hot

# One hot encoding
voc_size = 5000
onehot_repr = [one_hot(words, voc_size) for words in corpus]

In [19]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Padding sequences
sent_length = 55
embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)

In [21]:
# Check the shape of df1 and embedded_docs
print("Shape of df1['tweets]:", df1['tweets'].shape)
print("Shape of embedded_docs:", embedded_docs.shape)

from sklearn.cluster import KMeans
n_clusters = 2  # You can choose the number of clusters based on your needs
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(embedded_docs)
df1['target'] = kmeans.labels_



Shape of df1['tweets]: (20000,)
Shape of embedded_docs: (20000, 55)


In [22]:
df1['target'].value_counts()

target
0    13184
1     6816
Name: count, dtype: int64

In [23]:
df1['target'].value_counts()/len(df1)*100

target
0    65.92
1    34.08
Name: count, dtype: float64

In [24]:
# Convert the target column to numpy array
y = np.array(df1['target'])

from sklearn.model_selection import train_test_split
x=embedded_docs
y=y
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [25]:

# Build the LSTM model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

model = Sequential()
model.add(Embedding(voc_size, 40, input_length=sent_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [26]:

# Train the model
model.fit(x_train, y_train, epochs=10, batch_size=32)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f518a875240>

In [27]:
# Evaluate the model
loss, accuracy = model.evaluate(x_train,y_train)
print(f"Loss: {loss}")
print(f"Accuracy: {accuracy}")

Loss: 0.0051692514680325985
Accuracy: 0.9984328150749207


In [28]:
from sklearn.metrics import classification_report

y_pred = model.predict(x_test)
y_pred = (y_pred > 0.5).astype(int)

# Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97      4384
           1       0.98      0.91      0.95      2216

    accuracy                           0.96      6600
   macro avg       0.97      0.95      0.96      6600
weighted avg       0.97      0.96      0.96      6600

