In [13]:
# this is transformer model from hugging face library, GPT2 specifically, this model is basically trained on two different articles on Artificial intelligence which encompasses education and economy.
# install these libraries and tools 
!pip install transformers torch pandas numpy nltk scikit-learn



In [14]:
# import these required libararies
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split

In [15]:
# this nltk library is crucial for text preprocessing it will remove all unnecessary marks within the data
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
# preprocess text data
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = text.lower()  # Convert to lowercase
    text = ' '.join(word for word in text.split() if word not in stop_words)  # Remove stopwords
    return text

def load_and_preprocess_text(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        raw_text = file.read()
    cleaned_text = preprocess_text(raw_text)
    return cleaned_text

file_path = 'Role of AI in education.txt'
cleaned_text = load_and_preprocess_text(file_path)

In [17]:
# did preparation of dataset for GPT-2 model

from transformers import TextDataset, DataCollatorForLanguageModeling

def save_cleaned_text_to_file(text, file_path):
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(text)

cleaned_file_path = 'cleaned_role_of_ai_in_education.txt'
save_cleaned_text_to_file(cleaned_text, cleaned_file_path)

def load_dataset(file_path, tokenizer, block_size=512):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size
    )

def load_data_collator(tokenizer):
    return DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
dataset = load_dataset(cleaned_file_path, tokenizer)
data_collator = load_data_collator(tokenizer)

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]



In [18]:
# training and intiliazation of model

model = GPT2LMHeadModel.from_pretrained('gpt2')

training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

trainer.train()


Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

***** Running training *****
  Num examples = 3
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 1
  Number of trainable parameters = 124439808


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1, training_loss=4.994874954223633, metrics={'train_runtime': 808.7292, 'train_samples_per_second': 0.004, 'train_steps_per_second': 0.001, 'total_flos': 783876096000.0, 'train_loss': 4.994874954223633, 'epoch': 1.0})

In [23]:
# generate any text on the based of trained dataset
def generate_text(prompt, model, tokenizer, max_length=100):
    inputs = tokenizer.encode(prompt, return_tensors='pt')
    outputs = model.generate(inputs, max_length=max_length, num_return_sequences=1)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

prompt = "The role of AI in economy"
generated_text = generate_text(prompt, model, tokenizer)
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The role of AI in economy is to provide a better understanding of the economic and political context of the economy.

The role of AI in economy is to provide a better understanding of the economic and political context of the economy.

The role of AI in economy is to provide a better understanding of the economic and political context of the economy.

The role of AI in economy is to provide a better understanding of the economic and political context of the economy.

The role of AI


In [26]:
def generate_text(prompt, model, tokenizer, max_length=200, temperature=0.7, top_k=50, top_p=0.9, repetition_penalty=1.2):
    inputs = tokenizer.encode(prompt, return_tensors='pt')
    outputs = model.generate(
        inputs,
        max_length=max_length,
        num_return_sequences=1,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        no_repeat_ngram_size=2  # Prevents repeating n-grams of specified length
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

prompt = "The role of education"
generated_text = generate_text(prompt, model, tokenizer, max_length=200)
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The role of education in the economy
In this article, we will examine how a number factors affect economic growth. We also discuss why there is it important to look at educational attainment and what are some examples that can be taken into account when considering whether or not an individual should have access public school tuition assistance (PAS). In addition:
