In [1]:
!pip install datasets
!pip install tokenizers



In [2]:
from datasets import load_dataset

data_files = { "file_1": "file-000000000006.json.gz" }
dataset = load_dataset("codeparrot/codeparrot-clean", data_files=data_files)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/246M [00:00<?, ?B/s]

Generating file_1 split: 0 examples [00:00, ? examples/s]

In [3]:
df = dataset["file_1"].to_pandas()
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   repo_name      100000 non-null  object 
 1   path           100000 non-null  object 
 2   copies         100000 non-null  object 
 3   size           100000 non-null  object 
 4   content        100000 non-null  object 
 5   license        100000 non-null  object 
 6   hash           100000 non-null  int64  
 7   line_mean      100000 non-null  float64
 8   line_max       100000 non-null  int64  
 9   alpha_frac     100000 non-null  float64
 10  autogenerated  100000 non-null  bool   
dtypes: bool(1), float64(2), int64(2), object(6)
memory usage: 7.7+ MB
None


In [4]:
def memory_usage(_dataframe):
    usage_in_bytes = _dataframe.memory_usage(deep=True).sum()
    usage_in_megabytes = usage_in_bytes/(1024 ** 2)
    return "{:03.2f} MB".format(usage_in_megabytes)

In [5]:
memory_usage(df), memory_usage(df[["content"]])

('1092.40 MB', '1055.30 MB')

In [6]:
import tensorflow as tf
import pandas as pd
from tokenizers import Tokenizer

In [7]:
star_coder_tokenizer = Tokenizer.from_pretrained("bigcode/starcoder2-3b")

In [8]:
ex0_encoded = star_coder_tokenizer.encode(df.iloc[0]["content"])
print("token length of first example:", len(ex0_encoded.ids))
print(ex0_encoded.ids)

token length of first example: 3199
[40, 8571, 10633, 63, 6471, 50, 61, 8571, 222, 1097, 38180, 51, 2324, 1220, 7181, 100, 2324, 641, 7181, 222, 1097, 38180, 51, 1219, 1220, 3100, 222, 1097, 38180, 51, 123, 55, 1220, 2426, 12071, 222, 1097, 8532, 51, 1219, 1220, 4590, 499, 222, 842, 18572, 45, 768, 12071, 731, 465, 684, 43370, 45, 803, 49, 46397, 731, 310, 1547, 1121, 2102, 456, 649, 39985, 2656, 349, 3014, 100, 333, 100, 9130, 100, 1097, 44, 347, 310, 1877, 349, 9903, 16591, 12101, 18801, 2503, 603, 1776, 341, 349, 3014, 100, 333, 100, 9130, 100, 1097, 44, 2656, 804, 391, 2838, 18217, 1532, 310, 1421, 341, 15139, 2502, 1221, 51, 310, 1547, 310, 2666, 5833, 100, 12670, 100, 9130, 100, 333, 299, 46397, 51, 9903, 16591, 12101, 18801, 51, 5754, 51, 2049, 45, 343, 15139, 100, 333, 100, 697, 35895, 310, 848, 310, 456, 18217, 347, 2666, 5833, 100, 12670, 100, 9130, 100, 333, 63, 343, 607, 1408, 341, 2838, 18217, 451, 341, 1275, 642, 341, 349, 9130, 100, 333, 100, 697, 44, 343, 391, 100, 3014

In [9]:
tokens_tensor = tf.constant([], dtype=tf.int32)

In [10]:
num_examples_list = [0, 20000, 40000, 60000, 80000]

for num_examples in num_examples_list:
    tokens_list = []
    for i in range(num_examples, num_examples + 20000):
        example_encoded = star_coder_tokenizer.encode(df.iloc[i]["content"])
        tokens_list += example_encoded.ids
        tokens_list += [0]

    tf_tokens = tf.constant(tokens_list, dtype=tf.int32)
    tokens_tensor = tf.concat([tokens_tensor, tf_tokens], axis=-1)

print("tokens_tensor shape:", tokens_tensor.shape)

tokens_tensor shape: (286885147,)


In [18]:
print((tokens_tensor.shape[0] - 283) / 1024)
print(tokens_tensor.shape[0] - 283)

280161.0
286884864


In [19]:
sized_tokens_tensor = tokens_tensor[:-283]
sized_tokens_tensor

<tf.Tensor: shape=(286884864,), dtype=int32, numpy=array([   40,  8571, 10633, ...,   678,   419,  4378], dtype=int32)>

In [20]:
print(tokens_tensor[-293:-283][-10:])
print(sized_tokens_tensor[-10:])

tf.Tensor([  943   459 15083 23175  5373   347  1840   678   419  4378], shape=(10,), dtype=int32)
tf.Tensor([  943   459 15083 23175  5373   347  1840   678   419  4378], shape=(10,), dtype=int32)


In [21]:
train_tensor = tf.reshape(sized_tokens_tensor, shape=(-1, 1024))
train_tensor.shape

TensorShape([280161, 1024])

In [22]:
slided_tokens_tensor = sized_tokens_tensor[1:]

print(sized_tokens_tensor.shape)
print(slided_tokens_tensor.shape)
print(sized_tokens_tensor[5:15])
print(slided_tokens_tensor[5:15])

tf_zero = tf.constant([0], dtype=tf.int32)
padded_slided_tokens = tf.concat([slided_tokens_tensor, tf_zero], axis=-1)
print("padded_slided_tokens shape:", padded_slided_tokens.shape)
target_tensor = tf.reshape(padded_slided_tokens, shape=(-1, 1024))
target_tensor.shape

(286884864,)
(286884863,)
tf.Tensor([   50    61  8571   222  1097 38180    51  2324  1220  7181], shape=(10,), dtype=int32)
tf.Tensor([   61  8571   222  1097 38180    51  2324  1220  7181   100], shape=(10,), dtype=int32)
padded_slided_tokens shape: (286884864,)


TensorShape([280161, 1024])

In [23]:
train_df = pd.DataFrame(data=train_tensor)
print(train_df.info())
train_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 280161 entries, 0 to 280160
Columns: 1024 entries, 0 to 1023
dtypes: int32(1024)
memory usage: 1.1 GB
None


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,40,8571,10633,63,6471,50,61,8571,222,1097,...,444,389,5742,349,5714,100,36230,972,11616,482
1,745,100,1194,389,349,1184,1444,5742,349,1491,...,5538,5095,1472,100,1340,972,349,58,53,389
2,349,5021,972,349,1844,45286,343,349,750,100,...,51,1219,51,3404,51,4169,51,10213,389,5538
3,5095,1710,972,349,1844,389,349,5021,972,349,...,10954,45,105,2009,49,2918,66,123,46,733
4,456,851,49,373,347,362,2009,51,3053,9768,...,2200,575,2172,509,465,655,299,349,471,50


In [24]:
target_df = pd.DataFrame(data=target_tensor)
print(target_df.info())
target_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 280161 entries, 0 to 280160
Columns: 1024 entries, 0 to 1023
dtypes: int32(1024)
memory usage: 1.1 GB
None


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,8571,10633,63,6471,50,61,8571,222,1097,38180,...,389,5742,349,5714,100,36230,972,11616,482,745
1,100,1194,389,349,1184,1444,5742,349,1491,100,...,5095,1472,100,1340,972,349,58,53,389,349
2,5021,972,349,1844,45286,343,349,750,100,2093,...,1219,51,3404,51,4169,51,10213,389,5538,5095
3,1710,972,349,1844,389,349,5021,972,349,1844,...,45,105,2009,49,2918,66,123,46,733,456
4,851,49,373,347,362,2009,51,3053,9768,303,...,575,2172,509,465,655,299,349,471,50,897


In [25]:
from google.colab import auth

auth.authenticate_user()

In [26]:
PROJECT_ID = "large-language-model-424901"
!gcloud config set project {PROJECT_ID}

Updated property [core/project].


In [27]:
save_bucket = "gs://python-data-bucket"

train_df.to_csv(save_bucket + "/file_6_x.csv", index=False)
target_df.to_csv(save_bucket + "/file_6_y.csv", index=False)