In [43]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import requests

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [2]:
tf.__version__

'2.9.0'

In [3]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [4]:
tf.test.is_gpu_available()

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
Metal device set to: Apple M1 Pro


2023-02-22 12:54:09.843550: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-02-22 12:54:09.843867: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


True

# get data

In [5]:
def get_ds():
    url = "https://datasets-server.huggingface.co/parquet?dataset=aadityaubhat%2FGPT-wiki-intro"
    response = requests.get(url)
    if response.status_code !=200:
        return f"error during dataset request: {response.status_code}"
    
    url_parquet = [files['url'] for files in response.json()['parquet_files']]
    
    
    df = [pd.read_parquet(url_) for url_ in url_parquet]
    
    return pd.concat(df)

In [6]:
df = get_ds()
df.set_index('id',inplace=True)

In [7]:
df['random']=np.random.random(len(df))

In [8]:
# reorganize ds and randomize samples wiki/generated
df.loc[df['random']<.5,'text']=df['generated_intro']
df.loc[df['random']<.5,'label']='generated'
df.loc[df['random']>=.5,'text']=df['wiki_intro']
df.loc[df['random']>=.5,'label']='wiki'

In [9]:
df.label.value_counts()

wiki         75039
generated    74961
Name: label, dtype: int64

In [10]:
#check distribution is similar accross classes

In [11]:
df[df['label']=='generated'].describe()

Unnamed: 0,title_len,wiki_intro_len,generated_intro_len,prompt_tokens,generated_text_tokens,random
count,74961.0,74961.0,74961.0,74961.0,74961.0,74961.0
mean,2.215472,196.005389,129.293739,28.956297,165.494964,0.250315
std,0.630953,41.422459,56.985389,5.026033,77.129319,0.144519
min,1.0,150.0,7.0,21.0,1.0,1e-05
25%,2.0,164.0,86.0,26.0,106.0,0.125203
50%,2.0,184.0,122.0,28.0,154.0,0.250413
75%,3.0,216.0,170.0,31.0,220.0,0.375184
max,3.0,350.0,278.0,148.0,300.0,0.499988


In [12]:
df[df['label']=='wiki'].describe()

Unnamed: 0,title_len,wiki_intro_len,generated_intro_len,prompt_tokens,generated_text_tokens,random
count,75039.0,75039.0,75039.0,75039.0,75039.0,75039.0
mean,2.215288,196.004837,129.644372,28.959221,165.986287,0.749148
std,0.628077,41.466319,57.095142,5.063986,77.312752,0.144812
min,1.0,150.0,7.0,21.0,2.0,0.500002
25%,2.0,164.0,86.0,26.0,107.0,0.623386
50%,2.0,184.0,122.0,28.0,155.0,0.748501
75%,3.0,216.0,171.0,31.0,221.0,0.874417
max,3.0,350.0,274.0,141.0,300.0,0.999998


In [13]:
X_train,X_test,y_train,y_test = train_test_split(df['text'],df['label'],test_size=.2)

In [14]:
y_train

id
22367823         wiki
67428769    generated
68402160    generated
2642042     generated
4341348     generated
              ...    
58444157    generated
12569126         wiki
16694673    generated
1291347     generated
5080244          wiki
Name: label, Length: 120000, dtype: object

# Baseline

In [15]:
# vectorize text ; using tfidf
vectorize = TfidfVectorizer(ngram_range=(1,1),max_df=.95,min_df=.05)

#transform
X_train_vect = vectorize.fit_transform(X_train)
X_test_vect = vectorize.transform(X_test)

In [16]:
X_train_vect.shape

(120000, 213)

In [17]:
#NB
model = MultinomialNB()
model.fit(X_train_vect,y_train)

In [18]:
# accuracy of NB model
model.score(X_test_vect,y_test)

0.7497

In [19]:
# classification report 
y_pred = model.predict(X_test_vect)
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

   generated       0.73      0.77      0.75     14354
        wiki       0.77      0.73      0.75     15646

    accuracy                           0.75     30000
   macro avg       0.75      0.75      0.75     30000
weighted avg       0.75      0.75      0.75     30000



# Features engineering

In [20]:
#WIP

# Models

## Simple LSTM

In [21]:
try:
    X_train_vect = X_train_vect.toarray()
    X_test_vect = X_test_vect.toarray()
except AttributeError:
    pass

if len(X_train_vect.shape)<3:
    X_train_vect = np.expand_dims(X_train_vect,-1)
    X_test_vect = np.expand_dims(X_test_vect,-1)

y_train = np.where(y_train=='generated',1,0)
y_test = np.where(y_test=='generated',1,0)

In [42]:
def lstm_model():
    inputs = tf.keras.layers.Input(shape=(X_train_vect[0].shape))
#     inputs = tf.keras.layers.Input(shape=(200,1))
#     x = tf.keras.layers.LSTM(128,return_sequences=True,activation='tanh')(inputs)
    x = tf.keras.layers.LSTM(32,return_sequences=False,activation='tanh')(inputs)
    x = tf.keras.layers.Dense(16,activation='relu')(x)
    outputs = tf.keras.layers.Dense(1,activation='sigmoid')(x)
    
    return tf.keras.Model(inputs=inputs,outputs=outputs)

In [55]:
#compile
lstm = lstm_model()
lstm.compile(loss='binary_crossentropy',optimizer='adam',metrics='accuracy')

In [62]:
#callbacks

#early stopping
es = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy',patience=4,restore_best_weights=True)

#tensorboard
log_dir = os.path.join(os.path.abspath(os.path.pardir),'logs','simple_lstm')
tb = tf.keras.callbacks.TensorBoard(log_dir=log_dir,write_graph=False,)

#learning rate
lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss',factor=.5,patience=3,verbose=1)

In [None]:
history = lstm.fit(x=X_train_vect,
                   y=np.expand_dims(y_train,-1),
                   validation_split=.2,
                   callbacks=[es,tb,lr],
                   batch_size=32,
                   epochs=50,)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50

In [64]:
lstm.save(os.path.join(os.path.abspath(os.path.pardir),'models','simple_lstm'))



INFO:tensorflow:Assets written to: /Users/arthurcollard/code/arthurcol/gpt_vs_human/models/simple_lstm/assets


INFO:tensorflow:Assets written to: /Users/arthurcollard/code/arthurcol/gpt_vs_human/models/simple_lstm/assets


## Using Bidirectional LSTM

In [65]:
def bilstm_model():
    inputs = tf.keras.layers.Input(shape=(X_train_vect[0].shape))
#     inputs = tf.keras.layers.Input(shape=(200,1))
#     x = tf.keras.layers.LSTM(128,return_sequences=True,activation='tanh')(inputs)
    x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32,return_sequences=False,activation='tanh'))(inputs)
    x = tf.keras.layers.Dense(16,activation='relu')(x)
    outputs = tf.keras.layers.Dense(1,activation='sigmoid')(x)
    
    return tf.keras.Model(inputs=inputs,outputs=outputs)

In [69]:
#compile
bilstm = bilstm_model()
bilstm.compile(loss='binary_crossentropy',optimizer='adam',metrics='accuracy')

In [70]:
#callbacks

#early stopping
es = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy',patience=4,restore_best_weights=True)

#tensorboard
log_dir = os.path.join(os.path.abspath(os.path.pardir),'logs','bi_lstm')
tb = tf.keras.callbacks.TensorBoard(log_dir=log_dir,write_graph=False,)

#learning rate
lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss',factor=.5,patience=3,verbose=1)

In [None]:
history = bilstm.fit(x=X_train_vect,
                   y=np.expand_dims(y_train,-1),
                   validation_split=.2,
                   callbacks=[es,tb,lr],
                   batch_size=32,
                   epochs=50,)

Epoch 1/50


2023-02-22 15:13:06.858445: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-02-22 15:13:07.012362: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-02-22 15:13:07.012416: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-02-22 15:13:07.604241: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-02-22 15:13:07.618011: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2023-02-22 15:17:53.699522: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-02-22 15:17:53.764278: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-02-22 15:17:53.764332: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
 204/3000 [=>............................] - ETA: 4:38 - loss: 0.5084 - accuracy: 0.7518

## Unleash the power of attention models