In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'amazon-fine-food-reviews:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F18%2F2157%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240419%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240419T200550Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D45146c94a7428f0e4a7f74b14d97fad29167edc83f8f8d3847493cd8cfb0f155883d3fc62356060acc388a07fb26528cbce5c757c9cbb6e4d7a1c3991d5368e41bd64f888f62e102ff3eab93baaa4acb59ef4b5e3d02e76584c7a8b3cef416301b09a9477cf369e2e1e2ef7615f88f4ce26d663e38a31efec96bff048a4a17dc5b4e78a209e2c6dd05bea55578f33ac190b1a494e4ae11302874701f7a1f455b81a8fba8772dcc8c2e9ac8fc02f39e9c1de8faddf371ab01a0f57f7a66dc24dda5deb733fee1bf96d79e3afa79f37bca76d272d1ed6db0e4fd837e60489366f9be4d41fc70f35346449f5bfc76daa3ca4da795d41e895376c315d00b3dcb2c58'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading amazon-fine-food-reviews, 253873708 bytes compressed
Downloaded and uncompressed: amazon-fine-food-reviews
Data source import complete.


In [2]:
import pandas as pd
data =pd.read_csv("/kaggle/input/amazon-fine-food-reviews/Reviews.csv")

In [3]:
data

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...,...,...
568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
568450,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
568451,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
568452,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...


In [4]:
data_now = data.iloc[:20000]

In [5]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string
import re

# Load NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

data_now = data_now.dropna(subset=['Text'])

# Lowercasing
data_now['processed_text'] = data_now['Text'].apply(lambda x: x.lower())


# Tokenization using NLTK
# data['processed_text'] = data['processed_text'].apply(lambda x: word_tokenize(x))

# Removing punctuation
punctuation = string.punctuation
data_now['processed_text'] = data_now['processed_text'].apply(lambda x: [word for word in x.split() if word not in punctuation])

# Stop word removal using NLTK
stop_words = set(stopwords.words('english'))
data_now['processed_text'] = data_now['processed_text'].apply(lambda x: [word for word in x if word not in stop_words])

# Join tokens back into text
data_now['processed_text'] = data_now['processed_text'].apply(lambda x: ' '.join(x))

import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('wordnet')

def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized_tokens)

# Apply stemming to the 'processed_text' column
data_now['processed_text'] = data_now['processed_text'].apply(lemmatize_text)

# Remove duplicate entries based on the stemmed text
data_now = data_now.drop_duplicates(subset='processed_text')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string
import re

# Load NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

data_now = data_now.dropna(subset=['Summary'])

# Lowercasing
data_now['processed_summary'] = data_now['Summary'].apply(lambda x: x.lower())


# Tokenization using NLTK
# data['processed_text'] = data['processed_text'].apply(lambda x: word_tokenize(x))

# Removing punctuation
punctuation = string.punctuation
data_now['processed_summary'] = data_now['processed_summary'].apply(lambda x: [word for word in x.split() if word not in punctuation])

# Stop word removal using NLTK
stop_words = set(stopwords.words('english'))
data_now['processed_summary'] = data_now['processed_summary'].apply(lambda x: [word for word in x if word not in stop_words])

# Join tokens back into text
data_now['processed_summary'] = data_now['processed_summary'].apply(lambda x: ' '.join(x))

import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('wordnet')

def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized_tokens)

# Apply stemming to the 'processed_text' column
data_now['processed_summary'] = data_now['processed_summary'].apply(lemmatize_text)

# Remove duplicate entries based on the stemmed text
data_now = data_now.drop_duplicates(subset='processed_summary')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
data_now

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,processed_text,processed_summary
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,bought several vitality canned dog food produc...,good quality dog food
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled jumbo salted peanut .....,advertised
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,"confection around century . light , pillowy ci...",`` delight '' say
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,looking secret ingredient robitussin believe f...,cough medicine
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,great taffy great price . wide assortment yumm...,great taffy
...,...,...,...,...,...,...,...,...,...,...,...,...
19973,19974,B001G0NKVO,A3CKOV3WEYEQDJ,"G. Dalton ""Eddie2358""",0,0,4,1349740800,Kitties eat it every time,I usually buy avoderm but this food is an exce...,usually buy avoderm food excellent way stretch...,kitty eat every time
19974,19975,B001G0NKVO,A3S6UHNZL8TPZ7,"Heather R. Wilson ""meeko77""",0,0,5,1348876800,My picky cats actually eat this,"I have 2 ""picky-eater"" cats. They aren't reall...",2 `` picky-eater '' cat . really fan canned fo...,picky cat actually eat
19975,19976,B001G0NKVO,AWM9OVTK6GAXV,ashrh,0,0,5,1347580800,my cats love this!,I have 2 female cats. I have given them other ...,"2 female cat . given friskies food before , fi...",cat love this !
19977,19978,B001G0NKVO,A1HRYC60VTMYC0,Ace,0,0,5,1339459200,Mmm! Cee Cee's FAVORITE!!!,I have the enviable job of getting two pills i...,enviable job getting two pill cee cee daily . ...,mmm ! cee cee 's favorite ! ! !


In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
def add_end_first(text):
  print(text)
  return "TEXT:"+text+"\nEND"+"\nSUMMARY"

data_now["combined_text"] = "TEXT:  "+ (data_now['processed_text'].str.split().str[:100].str.join(' ') + "  SUMMARY:  " + data_now['processed_summary']).astype(str) + "  END"

X = data_now["combined_text"]
y = data_now['processed_summary']
# print(X)
from sklearn.model_selection import train_test_split
# Splitting the data into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Creating train and test DataFrames
train_data = pd.DataFrame({'processed_text': X_train, 'processed_summary': y_train})
test_data = pd.DataFrame({'processed_text': X_test, 'processed_summary': y_test})

# Optionally, you can reset the index of the DataFrames if you want
train_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)

In [10]:
train_data["processed_text"][0]

'TEXT:  great way help dieting keeping track . tasty crunchy great sandwich salad even themselves .  SUMMARY:  popchips  END'

In [11]:
# Assuming 'train_data' is your pandas DataFrame and 'column_name' is the name of the column containing the samples
column_name = 'processed_text'

# Open a text file in write mode
with open('samples.txt', 'w') as file:
    index = 0
    while index < len(train_data[column_name]):
        sample = train_data[column_name][index]
        file.write(f"Sample {index}: " + sample + '\n')
        index += 1

In [12]:
!pip install transformers



In [13]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [14]:
import torch
from torch.utils.data import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Define your custom dataset class
class CustomDataset(Dataset):
    def __init__(self, file_path, tokenizer, block_size=128):
      self.examples = []
      with open(file_path, "r", encoding="utf-8") as f:
          text = f.read()
      tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
      i = 0
      while i < len(tokenized_text) - block_size + 1:
          self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size]))
          i += block_size

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, index):
        return torch.tensor(self.examples[index], dtype=torch.long)

# Modify the load_dataset function to use your custom dataset
def load_dataset(file_path, tokenizer, block_size=128):
    dataset = CustomDataset(
        file_path=file_path,
        tokenizer=tokenizer,
        block_size=block_size,
    )
    return dataset

# Rest of the code remains the same
def load_data_collator(tokenizer, mlm=False):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=mlm,
    )
    return data_collator

def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    model.save_pretrained(output_dir)
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [None]:
!pip install accelerate -U

In [18]:
# you need to set parameters
train_file_path = "samples.txt"
model_name = 'gpt2'
output_dir = "/content/drive/My Drive/results"
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 5.0
save_steps = 500

# It takes about 30 minutes to train in colab.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [16]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

In [19]:
# # you need to set parameters
# train_file_path = "samples.txt"
# model_name = 'gpt2'
# output_dir = "/content/drive/My Drive/results"
# overwrite_output_dir = False
# per_device_train_batch_size = 8
# num_train_epochs = 5.0
# save_steps = 500

# # It takes about 30 minutes to train in colab.
# train(
#     train_file_path=train_file_path,
#     model_name=model_name,
#     output_dir=output_dir,
#     overwrite_output_dir=overwrite_output_dir,
#     per_device_train_batch_size=per_device_train_batch_size,
#     num_train_epochs=num_train_epochs,
#     save_steps=save_steps
# )

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [None]:
def load_model(model_path):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    return model


def load_tokenizer(tokenizer_path):
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    return tokenizer


def generate_text(sequence, max_length):
    model_path = "/content/drive/My Drive/results"
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)
    ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    return tokenizer.decode(final_outputs[0], skip_special_tokens=True)

In [None]:
predicted_summaries = []
i = 0
while i < len(test_data["processed_text"]):
    sequence = test_data["processed_text"][i]
    max_len = len(sequence.split()) + 100
    output = generate_text(sequence, max_len).split("END")[0].split("SUMMARY:")[1]
    predicted_summaries.append(output)
    i += 1

In [None]:
test_data["processed_text"][50]