# **Mounting Drive and Changing Directory**

In [1]:
# Mount Google Drive at /content/drive; the project checkout and the dataset
# archive used by later cells are both read from paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Work from the project checkout on Drive so that the relative references used
# later (requirements.txt, data_extract.py, the GPT package) resolve.
%cd "/content/drive/MyDrive/AI Projects/GPT-from-scratch"

/content/drive/MyDrive/AI Projects/GPT-from-scratch


In [3]:
# Sanity check: list the current directory to confirm the project files are present.
!ls

data_extract.py  dataset  GPT  README.md  requirements.txt


In [4]:
!pip install -r requirements.txt

Collecting colorama==0.4.6 (from -r requirements.txt (line 1))
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Collecting fsspec==2024.5.0 (from -r requirements.txt (line 3))
  Downloading fsspec-2024.5.0-py3-none-any.whl (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.1/316.1 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting intel-openmp==2021.4.0 (from -r requirements.txt (line 4))
  Downloading intel_openmp-2021.4.0-py2.py3-none-manylinux1_x86_64.whl (9.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
Collecting mkl==2021.4.0 (from -r requirements.txt (line 7))
  Downloading mkl-2021.4.0-py2.py3-none-manylinux1_x86_64.whl (280.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.9/280.9 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting networkx==3.2.1 (from -r requirements.txt (line 9))
  Downloading networkx-3.2.1-py3-none-any.

# **Preprocessing Data**

In [5]:
!unzip "/content/drive/MyDrive/AI Projects/openwebtext/openwebtext.zip" -d "/content/"

Archive:  /content/drive/MyDrive/AI Projects/openwebtext/openwebtext.zip
   creating: /content/openwebtext/
  inflating: /content/openwebtext/urlsf_subset08.tar  
  inflating: /content/openwebtext/urlsf_subset06.tar  
  inflating: /content/openwebtext/urlsf_subset15.tar  
  inflating: /content/openwebtext/urlsf_subset20.tar  
  inflating: /content/openwebtext/urlsf_subset07.tar  
  inflating: /content/openwebtext/urlsf_subset01.tar  
  inflating: /content/openwebtext/urlsf_subset10.tar  
  inflating: /content/openwebtext/urlsf_subset13.tar  
  inflating: /content/openwebtext/urlsf_subset03.tar  
  inflating: /content/openwebtext/urlsf_subset19.tar  
  inflating: /content/openwebtext/urlsf_subset09.tar  
  inflating: /content/openwebtext/urlsf_subset00.tar  
  inflating: /content/openwebtext/urlsf_subset16.tar  
  inflating: /content/openwebtext/urlsf_subset11.tar  
  inflating: /content/openwebtext/urlsf_subset02.tar  
  inflating: /content/openwebtext/urlsf_subset14.tar  
  inflating:

In [6]:
import os
import tarfile

def _extract_tar_safely(tar, output_dir):
    """Extract every member of an open tar archive without leaving ``output_dir``.

    Args:
        tar: An open ``tarfile.TarFile`` in read mode.
        output_dir: Destination directory for the archive contents.

    Raises:
        tarfile.TarError: If a member would be written outside ``output_dir``.
    """
    if hasattr(tarfile, 'data_filter'):
        # Extraction filters are available: 'data' refuses members whose path
        # lands outside the destination, links pointing outside it, and
        # device files.
        tar.extractall(path=output_dir, filter='data')
        return

    # Older interpreter without extraction filters: validate member names by
    # hand before extracting (only names are checked here, not link targets).
    root = os.path.realpath(output_dir)
    for member in tar.getmembers():
        target = os.path.realpath(os.path.join(root, member.name))
        if os.path.commonpath([root, target]) != root:
            raise tarfile.TarError(
                f'Refusing to extract {member.name!r}: path escapes {output_dir}'
            )
    tar.extractall(path=output_dir)


def extract_all_tar_files(input_dir, output_dir):
    """Extract every ``.tar`` archive found directly inside ``input_dir``.

    Archives are processed in sorted filename order so repeated runs behave
    the same way, and all of them are unpacked into the same ``output_dir``.

    Args:
        input_dir: Directory that contains the ``.tar`` files (not searched
            recursively).
        output_dir: Directory to extract into; created, including parents,
            if it does not exist yet.

    Raises:
        FileNotFoundError: If ``input_dir`` does not exist.
        tarfile.TarError: If an archive is unreadable or contains a member
            that would be written outside ``output_dir``.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # Only regular files count: a directory that happens to end in '.tar'
    # cannot be opened as an archive.
    tar_files = sorted(
        name for name in os.listdir(input_dir)
        if name.endswith('.tar') and os.path.isfile(os.path.join(input_dir, name))
    )

    for tar_file in tar_files:
        tar_path = os.path.join(input_dir, tar_file)
        with tarfile.open(tar_path, 'r') as tar:
            _extract_tar_safely(tar, output_dir)
        print(f'Extracted {tar_file} to {output_dir}')

In [7]:
# Adjust these two paths for your setup: the first is the directory holding the
# .tar files, the second is where their contents should be written.
input_dir, output_dir = '/content/openwebtext', '/content/extracted_openwebtext'

# Unpack every archive found in input_dir into output_dir.
extract_all_tar_files(input_dir=input_dir, output_dir=output_dir)

Extracted urlsf_subset12.tar to /content/extracted_openwebtext
Extracted urlsf_subset09.tar to /content/extracted_openwebtext
Extracted urlsf_subset08.tar to /content/extracted_openwebtext
Extracted urlsf_subset01.tar to /content/extracted_openwebtext
Extracted urlsf_subset10.tar to /content/extracted_openwebtext
Extracted urlsf_subset19.tar to /content/extracted_openwebtext
Extracted urlsf_subset07.tar to /content/extracted_openwebtext
Extracted urlsf_subset15.tar to /content/extracted_openwebtext
Extracted urlsf_subset06.tar to /content/extracted_openwebtext
Extracted urlsf_subset04.tar to /content/extracted_openwebtext
Extracted urlsf_subset20.tar to /content/extracted_openwebtext
Extracted urlsf_subset13.tar to /content/extracted_openwebtext
Extracted urlsf_subset14.tar to /content/extracted_openwebtext
Extracted urlsf_subset18.tar to /content/extracted_openwebtext
Extracted urlsf_subset02.tar to /content/extracted_openwebtext
Extracted urlsf_subset16.tar to /content/extracted_open

In [8]:
# Run the repo's data_extract.py over the extracted corpus; its output folder,
# /content/dataset, is the same path later assigned to dataset_path for training.
# NOTE(review): --sample_rate 0.06 presumably keeps about 6% of the input files —
# confirm against data_extract.py.
!python data_extract.py --folder_path "/content/extracted_openwebtext/openwebtext" --output_folder "/content/dataset" --sample_rate 0.06

100% 1112/1112 [01:32<00:00, 12.05it/s]
100% 123/123 [00:10<00:00, 12.21it/s]


# **Model Training**

In [1]:
# WARNING: destructive — throws away all uncommitted changes to tracked files in
# the checkout, resetting the working tree and index to the current commit.
!git reset --hard HEAD

/content/drive/MyDrive/AI Projects/GPT-from-scratch


In [4]:
# Fetch and integrate the latest remote commits before importing from the GPT package.
!git pull

Already up to date.


In [2]:
from GPT.LanguageModel import GPTLanguageModel

In [3]:
# Example configuration, grouped by where each value is consumed.

# Values passed to the GPTLanguageModel constructor
n_embd, n_layer, n_head = 768, 12, 12
block_size, batch_size = 128, 64
dropout = 0.1

# Values passed to model.train_model(...)
max_iters, eval_iters = 5000, 200
learning_rate = 5e-4

# Folder holding the dataset files
dataset_path = "/content/dataset"

In [4]:
# Build the model from the configuration values; the arguments are positional,
# so their order must match the GPTLanguageModel constructor signature.
# The recorded output shows construction printing a banner with the selected device.
model = GPTLanguageModel(n_embd, n_layer, n_head, dropout, block_size, batch_size, dataset_path)

🚀 Welcome!! I'm your GPT, developed by Ahmed Shafiq. 🚀
🚀 I'm using cuda as a device


In [None]:
# Start training with the iteration count, eval_iters setting, and learning rate
# from the configuration cell. Long-running: the recorded run shows a progress
# bar over 5000 steps at roughly 1.6 s per step, with train/val loss printed at step 0.
# NOTE(review): the exact role of eval_iters is defined inside train_model — confirm there.
model.train_model(max_iters, eval_iters, learning_rate)

  0%|          | 0/5000 [00:00<?, ?it/s]

step: 0, train loss: 9.374, val loss: 9.378


  2%|▏         | 97/5000 [06:26<2:12:12,  1.62s/it]

In [None]:
# NOTE(review): no visible cell in this notebook defines `torch`, `encode`,
# `decode`, `device`, or `m` (the object built above is bound to `model`), so as
# written this cell raises NameError on a fresh kernel — TODO bring these names
# into scope or route the calls through `model`.
#
# Flow as written: encode the prompt into a long tensor on `device`, add a
# leading dimension with unsqueeze(0), call generate with max_new_tokens=100,
# then decode the first returned sequence and print it.
prompt = 'Hello! Can you see me?'
context = torch.tensor(encode(prompt), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context.unsqueeze(0), max_new_tokens=100)[0].tolist())
print(generated_chars)