In [162]:
import tensorflow as tf
import pandas as pd 
import numpy as np 
import os
import requests
import gensim.downloader

# Loading in memory

In [8]:
def load_data_in_memory():
    """
    Load dataset from hugging face servers in memory
    """
    url = "https://datasets-server.huggingface.co/parquet?dataset=aadityaubhat%2FGPT-wiki-intro"
    response = requests.get(url)
    if response.status_code !=200:
        return f"error during dataset request: {response.status_code}"
    
    url_parquet = [files['url'] for files in response.json()['parquet_files']]
    
    
    df = [pd.read_parquet(url_) for url_ in url_parquet]
    
    return pd.concat(df)

In [9]:
def get_ds(path_data):
    """
    Load dataset and basic transformation for our task
    """
    if 'data.csv' in os.listdir(path_data):
        print('Loading dataset from local...')
        df = pd.read_csv(os.path.join(path_data,'data.csv'),index_col='id')
    else:
        df = load_data_in_memory()
        df.set_index('id',inplace=True)
        df['random']=np.random.random(len(df))

        # reorganize ds and randomize samples wiki/generated
        df.loc[df['random']<.5,'text']=df['generated_intro']
        df.loc[df['random']<.5,'label']='generated'
        df.loc[df['random']>=.5,'text']=df['wiki_intro']
        df.loc[df['random']>=.5,'label']='wiki'
        # dump csv
        df.to_csv(os.path.join(path_data,'data.csv'))
        
    df['label']=df['label'].replace({'generated':1,'wiki':0})
    
    display(df.head())
    
    return df

In [10]:
PATH_DATA = os.path.join(os.path.dirname(os.path.abspath(os.path.curdir)),'raw_data')

df = get_ds(PATH_DATA)

Loading dataset from local...


Unnamed: 0_level_0,url,title,wiki_intro,generated_intro,title_len,wiki_intro_len,generated_intro_len,prompt,generated_text,prompt_tokens,generated_text_tokens,random,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
63064638,https://en.wikipedia.org/wiki/Sexhow%20railway...,Sexhow railway station,Sexhow railway station was a railway station b...,Sexhow railway station was a railway station l...,3,174,78,200 word wikipedia style introduction on 'Sexh...,"located in the town of Sexhow, on the Cumbria...",25,88,0.170068,Sexhow railway station was a railway station l...,1
279621,https://en.wikipedia.org/wiki/Eti%C3%A4inen,Etiäinen,"In Finnish folklore, all places and things, an...","In Finnish folklore, all places and things, an...",1,187,80,200 word wikipedia style introduction on 'Etiä...,"animate or inanimate, have a spirit or ""etiäi...",26,101,0.839452,"In Finnish folklore, all places and things, an...",0
287229,https://en.wikipedia.org/wiki/Inverse%20functi...,Inverse function theorem,"In mathematics, specifically differential calc...","In mathematics, specifically differential calc...",3,170,59,200 word wikipedia style introduction on 'Inve...,function theorem states that for every real-v...,26,65,0.532203,"In mathematics, specifically differential calc...",0
26712375,https://en.wikipedia.org/wiki/Stepping%20on%20...,Stepping on Roses,is a Japanese shōjo manga series written and i...,is a Japanese shōjo manga series written and i...,3,335,121,200 word wikipedia style introduction on 'Step...,and illustrated by Maki Fujii. The series fol...,26,150,0.715507,is a Japanese shōjo manga series written and i...,0
38894426,https://en.wikipedia.org/wiki/Rob%20Bradley,Rob Bradley,"Robert Milner ""Rob"" Bradley, Jr. (born August ...","Robert Milner ""Rob"" Bradley, Jr. (born August ...",2,170,136,200 word wikipedia style introduction on 'Rob ...,"29, 1973) is an American former professional ...",28,162,0.395063,"Robert Milner ""Rob"" Bradley, Jr. (born August ...",1


In [14]:
# compute # sentences
df['nsentences'] = df['text'].apply(lambda x : len(x.split('.')))

In [12]:
def word_per_sentence(text):
    """
    Compute the mean and variance of the number of words per sentences of a text.
    """
    sentences = text.split('.')
    lengths = []
    for s in sentences : 
        lengths.append(len(s.split()))
    return [np.mean(np.array(lengths)),np.std(np.array(lengths))]

In [13]:
df_e = pd.concat([
    df,
    df['text'].apply(word_per_sentence)\
                .apply(pd.Series)\
                .rename({0:'mean_w_p_s',1:'var_w_p_s'},axis=1)]
                 ,axis=1)

In [113]:
X = df_e[['text','nsentences','mean_w_p_s','var_w_p_s']]
y = df_e[['label']]

# Serialize dataset

In [35]:
# The following functions can be used to convert a value to a type compatible
# with tf.train.Example.

def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy() # BytesList won't unpack a string from an EagerTensor.
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
    """Returns a float_list from a float / double."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _int64_feature(value):
    """Returns an int64_list from a bool / enum / int / uint."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


def _nonscaler_feature(value):
    serialized_nonscalar = tf.io.serialize_tensor(value)
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[serialized_nonscalar.numpy()]))

In [None]:
print(_bytes_feature(b'test_string'))
print(_bytes_feature(u'test_bytes'.encode('utf-8')))

print(_float_feature(np.exp(1)))

print(_nonscaler_feature([1,2,3]))

print(_int64_feature(True))
print(_int64_feature(1))

In [235]:
def serialize_example(id_,text,vect, nsentences, mean_w_p_s ,var_w_p_s, label):
    """
    Creates a tf.train.Example message ready to be written to a file.
    """
    # Create a dictionary mapping the feature name to the tf.train.Example-compatible
    # data type.
    feature = {
        'id': _int64_feature(int(id_)),
        'text': _bytes_feature(text),
        'vect':_nonscaler_feature(vect),
        'nsentences': _int64_feature(int(nsentences)),
        'mean_w_p_s': _float_feature(float(mean_w_p_s)),
        'var_w_p_s': _float_feature(float(var_w_p_s)),
        'label': _int64_feature(int(label)),
    }

    # Create a Features message using tf.train.Example.

    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()

# add word2vec

In [163]:
wv = gensim.downloader.load('glove-wiki-gigaword-100')        

In [187]:
%%time
for i in range(20000):
    if 'hello' in wv: 
        wv.get_vector('hello')

CPU times: user 28.5 ms, sys: 536 µs, total: 29 ms
Wall time: 28.7 ms


In [186]:
%%time
for i in range(20000):
    if 'hello' in wv.index_to_key: 
        wv.get_vector('hello')

CPU times: user 1.97 s, sys: 9.45 ms, total: 1.98 s
Wall time: 1.98 s


In [191]:
def embed_sentence_pretrained(w2v, sentence):
    """
    Embed a sentence given a trained Word2Vec
    """
    embedded_sentence = []
    for word in sentence:
        if word in w2v:
            embedded_sentence.append(w2v.get_vector(word))

    return np.array(embedded_sentence)

In [194]:
sentences = X.text[:100]  
LEN=len(sentences)
vect=[]
for i,x in enumerate(sentences):
    vect.append(embed_sentence_pretrained(wv,x))
    if i%50==0:
        print(f'{i*100/LEN:.2f}%')

0.00%
50.00%


In [196]:
vect_pad=tf.keras.utils.pad_sequences(vect,truncating='post',padding='post',maxlen=256,dtype=float)

## full ds

In [236]:
#useless - for tf.dataset usecase

ds = tf.data.Dataset.from_tensor_slices((X[:100].index,
                                        X[:100].text,
                                         vect_pad,
                                         X[:100].nsentences,
                                         X[:100].mean_w_p_s,
                                         X[:100].var_w_p_s,
                                         y[:100].label))

def tf_serialize_example(f0,f1,f2,f3,y):
    tf_string = tf.py_function(
    serialize_example,
    (f0, f1, f2, f3,y),  # Pass these args to the above function.
    tf.string)      # The return type is `tf.string`.
    return tf.reshape(tf_string, ()) # The result is a scalar.




def generator():
    for features in ds:
        yield serialize_example(*features)

serialized_features_dataset = tf.data.Dataset.from_generator(
    generator, output_types=tf.string, output_shapes=())


In [237]:
## Pur python

In [238]:
with tf.io.TFRecordWriter('hey.tfrecord') as writer:
    for i in range(100):
        example = serialize_example(
            X.index[i],
#             X.reset_index().id[i],
            X.text.str.encode('utf-8').iloc[i],
            vect_pad[i],
            X.nsentences.iloc[i],
            X.mean_w_p_s.iloc[i],
            X.var_w_p_s.iloc[i],
            y.label.iloc[i])
        writer.write(example)

In [239]:
#test reading
filenames = ['hey.tfrecord']
raw_dataset = tf.data.TFRecordDataset(filenames)
raw_dataset

<TFRecordDatasetV2 element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [240]:
a=0
for i in raw_dataset:
    example = tf.train.Example()
    example.ParseFromString(i.numpy())
    a+=1

In [241]:
a

100

In [245]:
feature_description = {
    'id': tf.io.FixedLenFeature([], tf.int64, default_value=0),
    'text': tf.io.FixedLenFeature([], tf.string, default_value=''),
    'vect': tf.io.FixedLenFeature([], tf.string, default_value=''),
    'nsentences': tf.io.FixedLenFeature([], tf.int64, default_value=0),
    'mean_w_p_s': tf.io.FixedLenFeature([], tf.float32, default_value=0.0),
    'var_w_p_s': tf.io.FixedLenFeature([], tf.float32, default_value=0.0),
    'label': tf.io.FixedLenFeature([], tf.int64, default_value=0),

}

def _parse_function(example_proto):
  # Parse the input `tf.train.Example` proto using the dictionary above.
    parsed_ex = tf.io.parse_single_example(example_proto, feature_description)
    label = parsed_ex.pop('label')
    vect = parsed_ex.pop('vect')
    parsed_ex['vect'] = tf.io.parse_tensor(vect,tf.double)
    return parsed_ex,label

In [246]:
parsed_dataset = raw_dataset.map(_parse_function)
parsed_dataset

<MapDataset element_spec=({'id': TensorSpec(shape=(), dtype=tf.int64, name=None), 'mean_w_p_s': TensorSpec(shape=(), dtype=tf.float32, name=None), 'nsentences': TensorSpec(shape=(), dtype=tf.int64, name=None), 'text': TensorSpec(shape=(), dtype=tf.string, name=None), 'var_w_p_s': TensorSpec(shape=(), dtype=tf.float32, name=None), 'vect': TensorSpec(shape=<unknown>, dtype=tf.float64, name=None)}, TensorSpec(shape=(), dtype=tf.int64, name=None))>

In [247]:
next(iter(parsed_dataset))

({'id': <tf.Tensor: shape=(), dtype=int64, numpy=63064638>,
  'mean_w_p_s': <tf.Tensor: shape=(), dtype=float32, numpy=13.0>,
  'nsentences': <tf.Tensor: shape=(), dtype=int64, numpy=6>,
  'text': <tf.Tensor: shape=(), dtype=string, numpy=b'Sexhow railway station was a railway station located in the town of Sexhow, on the Cumbrian Coast Line in North West England. The station was opened by the Lancashire and Yorkshire Railway on 7 October 1870. It was closed to passengers on 5 January 1950, and to goods on 12 May 1965. \n\nThe station building is now a private residence. There is a small amount of trackage remaining near the building, used currently by a local agricultural business.'>,
  'var_w_p_s': <tf.Tensor: shape=(), dtype=float32, numpy=7.187953>,
  'vect': <tf.Tensor: shape=(256, 100), dtype=float64, numpy=
  array([[-0.52605999, -0.066991  , -0.17351   , ..., -0.79123002,
           0.047581  ,  0.084428  ],
         [-0.67211998,  1.14579999,  0.12519   , ..., -0.57916999,
   