**Importing the necessary Libraries**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df = pd.read_csv('chatGpt_tweets.csv')   # loading the Dataset.
df.head()  # printing the first five rows of the dataset.

Unnamed: 0.1,Unnamed: 0,tweets,labels
0,1,"Try talking with ChatGPT, our new AI system wh...",good
1,3,"THRILLED to share that ChatGPT, our new model ...",good
2,4,"As of 2 minutes ago, @OpenAI released their ne...",bad
3,5,"Just launched ChatGPT, our new AI system which...",good
4,6,"As of 2 minutes ago, @OpenAI released their ne...",bad


In [3]:
df.drop(columns='Unnamed: 0',inplace=True)  #drop the column that has no value

In [4]:
df.shape # checking the size and shape of the dataset.

(162449, 2)

In [5]:
df.info() # checking the information of the dataset.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162449 entries, 0 to 162448
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   tweets  162449 non-null  object
 1   labels  162449 non-null  object
dtypes: object(2)
memory usage: 2.5+ MB


In [6]:
df.describe(include='all') # checking the statistics of the dataset.

Unnamed: 0,tweets,labels
count,162449,162449
unique,162448,2
top,#NAME?,bad
freq,2,106695


In [7]:
df.isnull().sum() # checking the null values in the dataset.

tweets    0
labels    0
dtype: int64

In [8]:
df.duplicated().sum() # checking the duplicate values in the dataset.

0

In [9]:
df.columns # checking the columns of the dataset.

Index(['tweets', 'labels'], dtype='object')

In [10]:
df['tweets'].value_counts() # checking the unique values in the tweets column.

tweets
#NAME?                                                                                                                                                                                                                                   2
Google vs. ChatGPT: Here’s what happened when I swapped services for a day https://t.co/PMxWim6T2Y #CNBC https://t.co/5KlLGyN5cY                                                                                                         1
ChatGPT is pretty amazing. https://t.co/FzaOUBlPkp\n\nI asked it to write an essay comparing jazz and blues, and it did it in 30 seconds.                                                                                                1
Uhh #chatgpt can write in Danish too.                                                                                                                                                                                                    1
#Storytime with  #ChatGPT https://t.co/Au3GYmA3UF    

In [11]:
df['labels'].value_counts() # checking the unique values in the labels column.

labels
bad     106695
good     55754
Name: count, dtype: int64

**Text Pre-Processing**

In [12]:
import re # regular expression library

def remove_urls(text): # function to remove urls from the text.
    url_regex = re.compile(r'https?://\S+|www\.\S+') # regular expression pattern for URLs
    return url_regex.sub('', text) # replace URLs with an empty string


df['tweets'] = [remove_urls(review) for review in df['tweets']] # apply the function to the 'tweets' column of the DataFrame


In [13]:
df['tweets'].value_counts() # checking the unique values in the tweets column.

tweets
#MidJourney #OpenAi #GPT #StableDiffusion2 #DallE #ChatGPT\njoin:  ''                                                                                                                               461
ChatGPT                                                                                                                                                                                             297
I‘m participating in the #Pisces #AIGC Campaign to win $300 and #Freemint #NFT, thanks to @PiscesBaishui ’s #giveaway!  #ChatGPT #OpenAI                                                            163
Building A Virtual Machine inside ChatGPT                                                                                                                                                           131
🎉 #Giveaway #Airdrop #nft #eth #doge #usdt #btc #web3 #ChatGPT New Giveaway:PeopleDAO (📜,🤝) Marketing Campaign🪂I have received $6USDT💵, come to complete the activity and get it for free! 👉🏿👉🏿  

In [14]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer # stemming library

In [15]:
nltk.download('stopwords') # download the stopwords library

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [16]:
ps = PorterStemmer() # create an instance of the PorterStemmer class
corpus = [] # create an empty list to store the preprocessed tweets
# Iterate over the values in the 'tweets' column directly
for tweet in df['tweets']: # iterate over the 'tweets' column of the DataFrame
    review = re.sub('[^a-zA-Z]', ' ', tweet) # remove non-alphabetic characters
    review = review.lower() # convert to lowercase
    review = review.split() # split into words

    review = [ps.stem(word) for word in review if not word in stopwords.words('english')] # apply stemming and remove stopwords
    review = ' '.join(review) # join the words back into a single string
    corpus.append(review) # add the preprocessed tweet to the corpus list

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
corpus[:5] # printing the first five preprocessed tweets.

['tri talk chatgpt new ai system optim dialogu feedback help us improv',
 'thrill share chatgpt new model optim dialog public free access everyon',
 'minut ago openai releas new chatgpt n nand use right',
 'launch chatgpt new ai system optim dialogu',
 'minut ago openai releas new chatgpt n nand use right n n']

**Importing Necessary Libraries for Model Building**

In [18]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout

In [19]:
### Vocabulary size
voc_size=5000 # setting the vocabulary size to 5000.

In [20]:
onehot_repr=[one_hot(words,voc_size)for words in corpus] # one-hot encoding the preprocessed tweets.

In [21]:
onehot_repr[:5] # printing the first five one-hot encoded tweets.

[[164, 326, 1039, 2812, 585, 464, 4870, 4127, 3063, 3688, 4716, 4988],
 [2020, 768, 1039, 2812, 4765, 4870, 3674, 2773, 3534, 1906, 1021],
 [2230, 1859, 374, 4951, 2812, 1039, 4267, 1881, 422, 54],
 [4722, 1039, 2812, 585, 464, 4870, 4127],
 [2230, 1859, 374, 4951, 2812, 1039, 4267, 1881, 422, 54, 4267, 4267]]

In [22]:
len(max(onehot_repr,key=len)) # checking the length of the longest one-hot encoded tweet.

53

In [23]:
sent_length=55 # setting the maximum sentence length to 55.
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length) # padding the one-hot encoded tweets to the maximum sentence length.

In [24]:
embedded_docs # printing the first five embedded tweets.

array([[   0,    0,    0, ..., 3688, 4716, 4988],
       [   0,    0,    0, ..., 3534, 1906, 1021],
       [   0,    0,    0, ..., 1881,  422,   54],
       ...,
       [   0,    0,    0, ...,  786, 3027,   51],
       [   0,    0,    0, ..., 1039, 4612,  654],
       [   0,    0,    0, ..., 2472,  220, 3447]], dtype=int32)

In [25]:
embedded_docs[0] # printing the first embedded tweet.

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,  164,
        326, 1039, 2812,  585,  464, 4870, 4127, 3063, 3688, 4716, 4988],
      dtype=int32)

In [26]:
y=df['labels'] # assigning the labels column to the y variable.

In [27]:

len(embedded_docs),y.shape # checking the length of the embedded tweets and the shape of the labels.

(162449, (162449,))

In [28]:
X_final=np.array(embedded_docs) # converting the embedded tweets to a numpy array.
y_final=np.array(y) # converting the labels to a numpy array.

In [29]:
X_final.shape,y_final.shape # checking the shape of the embedded tweets and the labels.

((162449, 55), (162449,))

In [30]:
from sklearn.model_selection import train_test_split # importing the train_test_split function from scikit-learn.
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42) # splitting the data into training and testing sets.

In [31]:
print("X_train data type:", X_train.dtype)
print("y_train data type:", y_train.dtype)
print("X_test data type:", X_test.dtype)
print("y_test data type:", y_test.dtype)

print("Unique values in y_train:", np.unique(y_train))
print("Unique values in y_test:", np.unique(y_test))

X_train data type: int32
y_train data type: object
X_test data type: int32
y_test data type: object
Unique values in y_train: ['bad' 'good']
Unique values in y_test: ['bad' 'good']


In [32]:
from sklearn.preprocessing import LabelEncoder # importing the LabelEncoder class from scikit-learn.
le = LabelEncoder() # creating an instance of the LabelEncoder class.
y_train = le.fit_transform(y_train) # fitting the LabelEncoder to the training labels and transforming them.
y_test = le.transform(y_test) # transforming the testing labels.

In [33]:
## Creating model
embedding_vector_features=40 # setting the embedding vector size to 40.
model=Sequential() # creating a sequential model.
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length)) # adding an embedding layer.
model.add(LSTM(100)) # adding an LSTM layer.
model.add(Dense(1,activation='sigmoid')) # adding a dense layer with sigmoid activation function.
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy']) # compiling the model.
print(model.summary()) # printing the model summary.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 55, 40)            200000    
                                                                 
 lstm (LSTM)                 (None, 100)               56400     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 256501 (1001.96 KB)
Trainable params: 256501 (1001.96 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [34]:
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=64) # fitting the model to the training data.

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7dca1daf3670>

In [35]:
from sklearn.metrics import classification_report # importing the classification_report function from scikit-learn.


y_pred=model.predict(X_test) # predicting the labels for the testing data.
y_pred = (y_pred > 0.5).astype(int)  # converting the predicted probabilities to binary labels.


print(classification_report(y_test,y_pred)) # printing the classification report.

              precision    recall  f1-score   support

           0       0.94      0.94      0.94     35230
           1       0.88      0.89      0.88     18379

    accuracy                           0.92     53609
   macro avg       0.91      0.91      0.91     53609
weighted avg       0.92      0.92      0.92     53609



In [36]:
from sklearn.metrics import f1_score # importing the f1_score function from scikit-learn.
f1_score(y_test, y_pred) # printing the f1 score.


0.8849158979924038