## Data Retrieval and Collection through API call

In [1]:
!pip install requests



In [2]:
API_KEY = None # Redacted

In [8]:
from google.colab import userdata
userdata.get('HF_TOKEN')

'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'

In [76]:
# Importing relevant libraries for API calls
import requests
import json

# Set your API key
api_key = API_KEY

# Base URL for the MediaStack API
base_url = "http://api.mediastack.com/v1/news"

# Parameters for the API call
params = {
    'access_key': api_key,
    'languages': 'en',  # to only get news in English
    'categories': 'technology',
    'limit': 100
}

# Make the API request
response = requests.get(base_url, params=params) # API call has been done 6 * 2 + 1 * 3 = 15 times

# Check for successful response
if response.status_code == 200:
    # Parse the response data
    data = response.json()
    # Process the data as needed
    print(json.dumps(data, indent=4))
else:
    # Throw error-code for failure
    print("Failed to retrieve data: ", response.status_code)

{
    "pagination": {
        "limit": 100,
        "offset": 0,
        "count": 100,
        "total": 4615
    },
    "data": [
        {
            "author": "Taylor Hatmaker",
            "title": "Lawsuit against Snap over fentanyl deaths can proceed, judge rules",
            "description": "A lawsuit blaming Snapchat for a series of drug overdoses among young people can proceed, a Los Angeles judge ruled this week. A group of family members related to children and teens who overdosed on fentanyl sued Snapchat maker Snap last year, accusing the social media company of facilitating illicit drug deals involving fentanyl, a [&#8230;]\u00a9 2023 TechCrunch. All rights reserved. For personal use only.",
            "url": "https://techcrunch.com/2024/01/04/snapchat-snap-fentanyl-section-230/",
            "source": "TechCrunch",
            "image": null,
            "category": "technology",
            "language": "en",
            "country": "us",
            "published_at": "2024

In [None]:
data["data"]

In [33]:
import json
import csv

In [None]:
import os
file_path = '/content/output.csv'
# List of JSON objects as strings
data_list = data["data"]

# Check if file exists and has content
file_exists = os.path.isfile(file_path) and os.path.getsize(file_path) > 0

# Writing to a CSV file for aggregation across various categories
with open('output.csv', 'a', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=data_list[0].keys(), quoting=csv.QUOTE_ALL, quotechar='"') # to use the correct quote characters
    if not file_exists:
      writer.writeheader()
    for data in data_list:
        writer.writerow(data)

There are 7 categories in the mediastack API

1. general - Uncategorized News
2. business - Business News
3. entertainment - Entertainment News
4. health - Health News
5. science - Science News
6. sports - Sports News
7. technology - Technology News

The API has been called for each category twice, with number of returned results set to 100 for each call. The only exception is 'general' category which has been called three times! There are a total of 1500 unprocessed records in the dataset.

## Data Preprocessing

The only info. columns which are needed are: author, title, description, URL, source, category, language, and country.

In [None]:
!pip install gdown

In [49]:
import gdown
import pandas as pd
url = 'https://drive.google.com/uc?id=17B0pbjhPZC-Erjx7Gjmf96wRUN_hqfC2'  # Replace file_id with the actual file ID from your Google Drive link
output = 'input_data.csv'
gdown.download(url, output, quiet=False)

# Read the CSV file
df = pd.read_csv(output)

Downloading...
From: https://drive.google.com/uc?id=17B0pbjhPZC-Erjx7Gjmf96wRUN_hqfC2
To: /content/input_data.csv
100%|██████████| 844k/844k [00:00<00:00, 106MB/s]


In [50]:
# import pandas as pd

# # The path to the CSV file
# file_path = '/content/output.csv'

# # Read the CSV file
# df = pd.read_csv(file_path)

In [51]:
df.head(2)

Unnamed: 0,author,title,description,url,source,image,category,language,country,published_at
0,,"Gaza, Saudi, Iran, Venezuela And More – OpEd",It would have been outlandish to suggest that ...,https://api.follow.it/track-rss-story-click/v3...,eurasiareview,,general,en,us,2024-01-04T23:21:59+00:00
1,Post Editorial Board,Israel and the US must face the future of the ...,Yet the exigencies of war must be acknowledged...,https://nypost.com/2024/01/04/opinion/israel-m...,Post,https://nypost.com/wp-content%2Fuploads%2Fsite...,general,en,zm,2024-01-04T23:21:55+00:00


Since, we already know the language to be English and (image, published_at) columns are not relevant, so we will drop them. <br><b>ASSUMPTION-1:</b> English is only language used for the purpose of simplicity.

In [52]:
df = df.drop(['image', 'language', 'published_at', 'url'], axis=1)

In [53]:
df.head(5)

Unnamed: 0,author,title,description,source,category,country
0,,"Gaza, Saudi, Iran, Venezuela And More – OpEd",It would have been outlandish to suggest that ...,eurasiareview,general,us
1,Post Editorial Board,Israel and the US must face the future of the ...,Yet the exigencies of war must be acknowledged...,Post,general,zm
2,Etim Etim,Plateau killings: matters arising,The cold-blooded murder of over 200 people by ...,businessdayonline,general,us
3,Clayton Edwards,Fans Wonder How Blake Shelton Was in Two Place...,Country music fans across the United States tu...,americansongwriter,general,us
4,Rachael O&#039;Connor,ITV viewers ‘disgusted’ at true story Mr Bates...,A truly harrowing watch.,Metro,general,gb


Now, we will check for the

In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   author       1046 non-null   object
 1   title        1500 non-null   object
 2   description  1498 non-null   object
 3   source       1500 non-null   object
 4   category     1500 non-null   object
 5   country      1500 non-null   object
dtypes: object(6)
memory usage: 70.4+ KB


1. Preprocessing Step 1: We can see two of the sources don't have descriptions, which would be essential for text-generation. Hence, we delete those two records.

In [55]:
# This will remove all rows where 'description' is empty
df = df.dropna(subset=['description'])

2. Preprocessing Step 2: Upon checking the CSV file, I found that there are a lot of titles and descriptions which are duplicate (generally written by the same author for multiple websites) and hence would not be useful for training the text-generation model. Hence, I will only retain the rows with unique titles and descriptions.

In [56]:
# This will drop all rows where 'title' and 'description' are duplicate
df = df.drop_duplicates(subset='description', keep='first')

In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 794 entries, 0 to 1402
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   author       561 non-null    object
 1   title        794 non-null    object
 2   description  794 non-null    object
 3   source       794 non-null    object
 4   category     794 non-null    object
 5   country      794 non-null    object
dtypes: object(6)
memory usage: 43.4+ KB


In [58]:
df = df.drop_duplicates(subset='title', keep='first')

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 790 entries, 0 to 1402
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   author       559 non-null    object
 1   title        790 non-null    object
 2   description  790 non-null    object
 3   source       790 non-null    object
 4   category     790 non-null    object
 5   country      790 non-null    object
dtypes: object(6)
memory usage: 43.2+ KB


In [60]:
# Repalcing NaN in authors with some default text like 'Unknown'
df['author'] = df['author'].fillna('Unknown')

In [61]:
# HTML decoding to correct some wrong encodings
import html
df['author'] = df['author'].apply(html.unescape)
df['title'] = df['title'].apply(html.unescape)
df['description'] = df['description'].apply(html.unescape)

In [62]:
df.head()

Unnamed: 0,author,title,description,source,category,country
0,Unknown,"Gaza, Saudi, Iran, Venezuela And More – OpEd",It would have been outlandish to suggest that ...,eurasiareview,general,us
1,Post Editorial Board,Israel and the US must face the future of the ...,Yet the exigencies of war must be acknowledged...,Post,general,zm
2,Etim Etim,Plateau killings: matters arising,The cold-blooded murder of over 200 people by ...,businessdayonline,general,us
3,Clayton Edwards,Fans Wonder How Blake Shelton Was in Two Place...,Country music fans across the United States tu...,americansongwriter,general,us
4,Rachael O'Connor,ITV viewers ‘disgusted’ at true story Mr Bates...,A truly harrowing watch.,Metro,general,gb


In [63]:
# Converting Country Codes to Country names
country_codes = {
    "ar": "Argentina", "au": "Australia", "at": "Austria", "be": "Belgium",
    "br": "Brazil", "bg": "Bulgaria", "ca": "Canada", "cn": "China",
    "co": "Colombia", "cz": "Czech Republic", "eg": "Egypt", "fr": "France",
    "de": "Germany", "gr": "Greece", "hk": "Hong Kong", "hu": "Hungary",
    "in": "India", "id": "Indonesia", "ie": "Ireland", "il": "Israel",
    "it": "Italy", "jp": "Japan", "lv": "Latvia", "lt": "Lithuania",
    "my": "Malaysia", "mx": "Mexico", "ma": "Morocco", "nl": "Netherlands",
    "nz": "New Zealand", "ng": "Nigeria", "no": "Norway", "ph": "Philippines",
    "pl": "Poland", "pt": "Portugal", "ro": "Romania", "sa": "Saudi Arabia",
    "rs": "Serbia", "sg": "Singapore", "sk": "Slovakia", "si": "Slovenia",
    "za": "South Africa", "kr": "South Korea", "se": "Sweden", "ch": "Switzerland",
    "tw": "Taiwan", "th": "Thailand", "tr": "Turkey", "ae": "UAE", "ua": "Ukraine",
    "gb": "United Kingdom", "us": "United States", "ve": "Venezuela"
}

# Replace country code with name if present in the dictionary, else leave it
df['country'] = df['country'].apply(lambda code: country_codes.get(code, code))

In [64]:
df.head(5)

Unnamed: 0,author,title,description,source,category,country
0,Unknown,"Gaza, Saudi, Iran, Venezuela And More – OpEd",It would have been outlandish to suggest that ...,eurasiareview,general,United States
1,Post Editorial Board,Israel and the US must face the future of the ...,Yet the exigencies of war must be acknowledged...,Post,general,zm
2,Etim Etim,Plateau killings: matters arising,The cold-blooded murder of over 200 people by ...,businessdayonline,general,United States
3,Clayton Edwards,Fans Wonder How Blake Shelton Was in Two Place...,Country music fans across the United States tu...,americansongwriter,general,United States
4,Rachael O'Connor,ITV viewers ‘disgusted’ at true story Mr Bates...,A truly harrowing watch.,Metro,general,United Kingdom


In [69]:
df['metadata'] = df.apply(lambda row: f"Author: {row['author']}\nTitle: {row['title']}\nDescription: {row['description']}\nSource of information: {row['source']}\nCountry of origin: {row['country']}", axis=1)

In [67]:
# Convert DataFrame to CSV
df.to_csv('processed_input_data.csv', index=False)

## Feature Extraction

From the dataset consisting of author, title, description, URL, source, category, language, and country, We can:
1. Extract key-entities from the description and title.
2. Extract the sentiment and tone from the description and title
3. Extract external knowledge about the source, language and country.

In [None]:
!pip install transformers

In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm

#### Entity extraction

In [39]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [40]:
def extract_entities(text):
    doc = nlp(text)
    entity_dict = {}
    for ent in doc.ents:
      if ent.label_ in ['PERSON', 'ORG', 'GPE', 'DATE', 'NORP', 'EVENT', 'WORK_OF_ART', 'LAW', 'LANGUAGE']:
        entity_dict[ent.text] = entity_dict.get(ent.text, 0) + 1
    return entity_dict

df['entities'] = df['content'].apply(extract_entities)

In [41]:
# df = df.drop(['entities_title','entities_description'], axis=1)
df.head()

Unnamed: 0,author,title,description,source,category,country,content,entities
0,Unknown,"Gaza, Saudi, Iran, Venezuela And More – OpEd",It would have been outlandish to suggest that ...,eurasiareview,general,us,"Title: Gaza, Saudi, Iran, Venezuela And More –...","{'Gaza': 3, 'Saudi': 1, 'Iran': 1, 'Venezuela'..."
1,Post Editorial Board,Israel and the US must face the future of the ...,Yet the exigencies of war must be acknowledged...,Post,general,zm,Title: Israel and the US must face the future ...,"{'Israel': 1, 'US': 1, 'Hamas': 1}"
2,Etim Etim,Plateau killings: matters arising,The cold-blooded murder of over 200 people by ...,businessdayonline,general,us,Title: Plateau killings: matters arising\nDesc...,"{'Plateau State': 1, 'Christmas eve': 1, 'Plat..."
3,Clayton Edwards,Fans Wonder How Blake Shelton Was in Two Place...,Country music fans across the United States tu...,americansongwriter,general,us,Title: Fans Wonder How Blake Shelton Was in Tw...,"{'Blake Shelton': 3, 'New Year’s Eve Descripti..."
4,Rachael O'Connor,ITV viewers ‘disgusted’ at true story Mr Bates...,A truly harrowing watch.,Metro,general,gb,Title: ITV viewers ‘disgusted’ at true story M...,"{'ITV': 1, 'Mr Bates Vs The Post Office’s': 1}"


#### Relationship Extraction

In [42]:
# Load the language model
nlp = spacy.load("en_core_web_sm")

# Process a document
text = "Apple is looking at buying U.K. startup for $1 billion"
doc = nlp(text)

# Extract entities
for ent in doc.ents:
    print(ent.text, ent.label_)

Apple ORG
U.K. GPE
$1 billion MONEY


In [45]:
def extract_relations(doc):
    relations = []
    for ent in doc.ents:
        # If the entity has a head
        if ent.root.head:
            head = ent.root.head
            if head.pos_ in ['VERB', 'NOUN']:  # You can adjust the POS tags
                relations.append((ent, head, head.dep_))
    return relations

# Process a document
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Extract relations
relations = extract_relations(doc)
for r in relations:
    print(f"Entity: {r[0]}, Relation: {r[1]}, Dependency: {r[2]}")

#### Sentiment Analysis

In [None]:
from transformers import pipeline

model_id = "ProsusAI/finbert"

# Load pre-trained sentiment-analysis pipeline
sentiment_pipeline = pipeline('sentiment-analysis', model=model_id)

In [19]:
def analyze_sentiment(text):
    return sentiment_pipeline(text)[0]

df['sentiment'] = df['content'].apply(lambda x: analyze_sentiment(x)) # Combination of title and description will lead to better sentiment analysis

In [20]:
df['sentiment'] = df['sentiment'].apply(lambda x: x['label'])

In [73]:
df.head(10)

Unnamed: 0,author,title,description,source,category,country,content,metadata
0,Unknown,"Gaza, Saudi, Iran, Venezuela And More – OpEd",It would have been outlandish to suggest that ...,eurasiareview,general,United States,"Title: Gaza, Saudi, Iran, Venezuela And More –...","Author: Unknown\nTitle: Gaza, Saudi, Iran, Ven..."
1,Post Editorial Board,Israel and the US must face the future of the ...,Yet the exigencies of war must be acknowledged...,Post,general,zm,Title: Israel and the US must face the future ...,Author: Post Editorial Board\nTitle: Israel an...
2,Etim Etim,Plateau killings: matters arising,The cold-blooded murder of over 200 people by ...,businessdayonline,general,United States,Title: Plateau killings: matters arising\nDesc...,Author: Etim Etim\nTitle: Plateau killings: ma...
3,Clayton Edwards,Fans Wonder How Blake Shelton Was in Two Place...,Country music fans across the United States tu...,americansongwriter,general,United States,Title: Fans Wonder How Blake Shelton Was in Tw...,Author: Clayton Edwards\nTitle: Fans Wonder Ho...
4,Rachael O'Connor,ITV viewers ‘disgusted’ at true story Mr Bates...,A truly harrowing watch.,Metro,general,United Kingdom,Title: ITV viewers ‘disgusted’ at true story M...,Author: Rachael O'Connor\nTitle: ITV viewers ‘...
5,Toye Sobande,Leading in a year of unbound potential,As the final chimes of 2023 fade into the dawn...,businessdayonline,general,United States,Title: Leading in a year of unbound potential\...,Author: Toye Sobande\nTitle: Leading in a year...
6,Tyler Dumas,El Paso County seeks public feedback on draft ...,"EL PASO COUNTY, Colo. (KRDO) -- El Paso County...",krdo,general,United States,Title: El Paso County seeks public feedback on...,Author: Tyler Dumas\nTitle: El Paso County see...
7,Unknown,Haley faces backlash from caucus-goers after t...,Ambassador Nikki Haley is facing heat on the c...,FOX News - Most Popular,general,United States,Title: Haley faces backlash from caucus-goers ...,Author: Unknown\nTitle: Haley faces backlash f...
9,Hilary Pollack,"The Best Deals This Week, From Hoka Sneakers t...","New year, new you? Upgrade with big deals on B...",Vice News,general,United States,"Title: The Best Deals This Week, From Hoka Sne...",Author: Hilary Pollack\nTitle: The Best Deals ...
10,Martha Grevatt,Protest hits military profiteer TransDigm,Cleveland Clevelanders came out for Palestine ...,workers,general,United States,Title: Protest hits military profiteer TransDi...,Author: Martha Grevatt\nTitle: Protest hits mi...


In [None]:
df = df.drop(['content'], axis=1)

#### Leveraging External Knowledge

In prelimnary stage, we will use Wikipedia to get external information about the entities. *** Under Construction ***

In [36]:
!pip install wikipedia

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11678 sha256=4787cc0b8ba86c88d93529385904c17d625fc77c36daf6c7af34b5c3f173b151
  Stored in directory: /root/.cache/pip/wheels/5e/b6/c5/93f3dec388ae76edc830cb42901bb0232504dfc0df02fc50de
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [46]:
import wikipedia
summary = wikipedia.summary("Hamas", sentences=5)
print(summary)

PageError: Page id "ha as" does not match any pages. Try another id!

#### Finetuning Dataset-Generation Examples Using GPT-2

In [70]:
print(df['metadata'][0])

Author: Unknown
Title: Gaza, Saudi, Iran, Venezuela And More – OpEd
Description: It would have been outlandish to suggest that a small region like Gaza, seemingly bereft of significant natural resources, political will of its own, and let alone sovereignty, would become the world’s most significant geopolitical spot on earth.The ongoing Israeli war on Gaza and the legendary resistance of the Palestinian people, however, have changed our calculation – or perhaps miscalculation – regarding what a besieged nation can achieve, in terms of collective resistance, in fact changing the rules of the game altogether.However, it is still early to fully fathom the surely...
Source of information: eurasiareview
Country of origin: United States


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')
model = GPT2LMHeadModel.from_pretrained('gpt2-large')

In [2]:
# {df['content'][0]}
prompt = f"""USA is the most powerful country in the world. Expand upon the previous statement. Write your response after 'Reponse'. Response:"""

In [3]:
input_ids = tokenizer.encode(prompt, return_tensors="pt")
output = model.generate(input_ids, max_length = 500, temperature= 0.7, pad_token_id=tokenizer.eos_token_id,do_sample=True)
output_text = tokenizer.decode(output[0], skip_special_tokens=True)

In [70]:
input_ids.shape

torch.Size([1, 26])

In [65]:
output_text = output_text.split('Response:')[1].strip()

In [4]:
output_text

"USA is the most powerful country in the world. Expand upon the previous statement. Write your response after 'Reponse'. Response:\n\nThe United States is the most powerful country in the world. Expand upon the previous statement. Write your response after 'Response'. Response:\n\nThe United States is the most powerful country in the world. Expand upon the previous statement. Write your response after 'Response'. Response:\n\nThe United States is the most powerful country in the world. Expand upon the previous statement. Write your response after 'Response'. Response:\n\nThe United States is the most powerful country in the world. Expand upon the previous statement. Write your response after 'Response'. Response:\n\nThe United States is the most powerful country in the world. Expand upon the previous statement. Write your response after 'Response'. Response:\n\nThe United States is the most powerful country in the world. Expand upon the previous statement. Write your response after 'Re

From above, we can see, GPT-2 is not capable enough to extract information and summarize the metadata. We will move on to GPT-3.5

#### Finetuning Dataset-Generation Examples Using GPT-3.5-turbo

In [68]:
print(df['metadata'][0])

Author: Unknown
Title: Gaza, Saudi, Iran, Venezuela And More – OpEd
Description: It would have been outlandish to suggest that a small region like Gaza, seemingly bereft of significant natural resources, political will of its own, and let alone sovereignty, would become the world’s most significant geopolitical spot on earth.The ongoing Israeli war on Gaza and the legendary resistance of the Palestinian people, however, have changed our calculation – or perhaps miscalculation – regarding what a besieged nation can achieve, in terms of collective resistance, in fact changing the rules of the game altogether.However, it is still early to fully fathom the surely...
Source of information: eurasiareview


In [None]:
!pip install openai==0.28.1

In [6]:
# Setting up the OpenAI API key
api_key= None # Redacted
import openai
openai.api_key = api_key

In [83]:
import requests
import time

# OpenAI API call handling for timout issues
def get_completion_from_GPT(prompt, timeout=600, model="gpt-3.5-turbo", max_retries=3, retry_delay=2):
    for attempt in range(max_retries):
        try:
            messages = [{"role": "user", "content": prompt}]
            response = openai.ChatCompletion.create(
                model=model,
                messages=messages,
                temperature=1,  # Degree of randomness of the model's output
                request_timeout=timeout,  # Set the timeout for the OpenAI API request
                seed = 42
            )
            return response.choices[0].message["content"]
        except (requests.exceptions.Timeout, requests.exceptions.RequestException) as e:
            # print(f"Attempt {attempt + 1} failed. Retrying after {retry_delay} seconds.")
            time.sleep(retry_delay)
    # If all attempts fail, raise the last exception
    raise e

In [94]:
for index, row in df.iterrows():
  # Using GPT-3.5-turbo for response evaluation:
  prompt = f"""{row['metadata']}.
            Extract useful information from the metadata above which contains data about 1. The author of the news 2. title of the news 3. Brief description of the news 4. Source of the news. 5. Category of news 6. Country of Origin
            Write down a concise summary about the information being disseminated. Keep in mind the tone, sentiment and nuances of the information.
            Ignore describing any feature (for e.g, if the author is 'Unknown') if it is blank/unknown.
            """
  # OpenAI API call with a timeout value
  try:
      response_text = get_completion_from_GPT(prompt, timeout=60)  # Set a lower timeout value
      df.at[index, 'summary'] = response_text
      summary = response_text.split('Summary:')[1].strip()
      print(str(index) + ": " + summary)
      print()
  except Exception as e:
      None
  #             print(f"An error occurred: {e}")


This op-ed reflects on the remarkable geopolitical importance of the Gaza region, despite its lack of resources and sovereignty. It highlights the ongoing Israeli war on Gaza and praises the resilience of the Palestinian people, suggesting that their collective resistance has challenged previous notions of what a besieged nation can achieve. The article has been published on Eurasia Review, with a United States origin.

The Post Editorial Board highlights the need for Israel and the US to tackle the ongoing war against Hamas. The article emphasizes that intelligence is as important as using overwhelming force in this long battle. The source of the information is a publication referred to as "Post," but no further details about the specific publication are provided. The country of origin is unknown.

The news article, written by Etim Etim, discusses the horrifying massacre of more than 200 individuals by terrorists in Plateau State during Christmas eve. The incident remains a significan

In [95]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 790 entries, 0 to 1402
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   author       790 non-null    object
 1   title        790 non-null    object
 2   description  790 non-null    object
 3   source       790 non-null    object
 4   category     790 non-null    object
 5   country      790 non-null    object
 6   content      790 non-null    object
 7   metadata     790 non-null    object
 8   summary      790 non-null    object
dtypes: object(9)
memory usage: 94.0+ KB


While looking though the generated texts, I found that the word 'Summary: ' (which serves as a separator) is not present in every generation. Hence, the below code is written to prepend it, in case it is missing.

In [98]:
def ensure_summary_prefix(text):
    if 'Summary:' not in text:
        return 'Summary: ' + text
    return text

df['summary'] = df['summary'].apply(ensure_summary_prefix)

Hence, we have created out prelimnary dataset which basically contains a prompt (containing the relevant input information) and its appropriate response, containing the meaningful and coherent summarization of the informatio, generated by GPT-3.5! This dataset can be used to finetune (through SFT) an open-source Causal LLM (like Llama-2-7B or Mistral-7B), For now I will choose Llama-2-7B

In [99]:
df.to_csv('generation_aug_input_data.csv', index=False)

## Creating the instruction-tuning dataset

In [105]:
textgen_df = pd.read_csv('generation_aug_input_data.csv')

In [106]:
textgen_df.head(5)

Unnamed: 0,author,title,description,source,category,country,content,metadata,summary
0,Unknown,"Gaza, Saudi, Iran, Venezuela And More – OpEd",It would have been outlandish to suggest that ...,eurasiareview,general,United States,"Title: Gaza, Saudi, Iran, Venezuela And More –...","Author: Unknown\nTitle: Gaza, Saudi, Iran, Ven...","Author: Unknown\nTitle: ""Gaza, Saudi, Iran, Ve..."
1,Post Editorial Board,Israel and the US must face the future of the ...,Yet the exigencies of war must be acknowledged...,Post,general,zm,Title: Israel and the US must face the future ...,Author: Post Editorial Board\nTitle: Israel an...,Author: Post Editorial Board\nTitle: Israel an...
2,Etim Etim,Plateau killings: matters arising,The cold-blooded murder of over 200 people by ...,businessdayonline,general,United States,Title: Plateau killings: matters arising\nDesc...,Author: Etim Etim\nTitle: Plateau killings: ma...,Author: Etim Etim\n\nTitle: Plateau killings: ...
3,Clayton Edwards,Fans Wonder How Blake Shelton Was in Two Place...,Country music fans across the United States tu...,americansongwriter,general,United States,Title: Fans Wonder How Blake Shelton Was in Tw...,Author: Clayton Edwards\nTitle: Fans Wonder Ho...,Author: Clayton Edwards\nTitle: Fans Wonder Ho...
4,Rachael O'Connor,ITV viewers ‘disgusted’ at true story Mr Bates...,A truly harrowing watch.,Metro,general,United Kingdom,Title: ITV viewers ‘disgusted’ at true story M...,Author: Rachael O'Connor\nTitle: ITV viewers ‘...,Summary: The author of the news is Rachael O'C...


We will only need the 'metadata' column, which will translate to 'input' column and 'summary' column which will translate to the 'output' column

In [107]:
textgen_df = textgen_df.drop(['author', 'title', 'description','source','country','country','content','category'], axis=1)

In [117]:
textgen_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 790 entries, 0 to 789
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   instruction  790 non-null    object
 1   input        790 non-null    object
 2   output       790 non-null    object
dtypes: object(3)
memory usage: 18.6+ KB


In [109]:
textgen_df = textgen_df.rename(columns={'metadata': 'input', 'summary': 'output'})

In [116]:
instruction_text = 'Generate meaningful and coherent text based on the available data. The tone, sentiment and nuances of the information must be taken into consideration'
textgen_df.insert(0, 'instruction', instruction_text)

In [118]:
textgen_df.head()

Unnamed: 0,instruction,input,output
0,Generate meaningful and coherent text based on...,"Author: Unknown\nTitle: Gaza, Saudi, Iran, Ven...","Author: Unknown\nTitle: ""Gaza, Saudi, Iran, Ve..."
1,Generate meaningful and coherent text based on...,Author: Post Editorial Board\nTitle: Israel an...,Author: Post Editorial Board\nTitle: Israel an...
2,Generate meaningful and coherent text based on...,Author: Etim Etim\nTitle: Plateau killings: ma...,Author: Etim Etim\n\nTitle: Plateau killings: ...
3,Generate meaningful and coherent text based on...,Author: Clayton Edwards\nTitle: Fans Wonder Ho...,Author: Clayton Edwards\nTitle: Fans Wonder Ho...
4,Generate meaningful and coherent text based on...,Author: Rachael O'Connor\nTitle: ITV viewers ‘...,Summary: The author of the news is Rachael O'C...


Now, creating the finetuning dataset: (Instruction tuning)

In [125]:
import json
# Convert the selected columns to a JSON file
json_file_path = 'finetuning_dataset_all.json'
json_str = textgen_df.to_json(orient='records')
json_data = json.loads(json_str)

# Save the JSON array to a file with proper formatting
with open('finetuning_dataset_all.json', 'w') as json_file:
    json.dump(json_data, json_file, indent=4, separators=(',', ': '))
print(f"JSON file created at {json_file_path}")

JSON file created at finetuning_dataset_all.json


Now, at first I will take a subset of the ~800 examples by subsetting and shuffling the above json file.

In [126]:
import random

# Load the JSON data from the file
with open('finetuning_dataset_all.json', 'r') as file:
    json_data = json.load(file)

# Shuffle the list randomly
random.shuffle(json_data)

# Create a subset of the first 200 records
subset_data = json_data[:200]

# Optionally, save this subset to a new JSON file
with open('finetuning_dataset_200.json', 'w') as file:
    json.dump(subset_data, file, indent=4)

## Finetuning an Open-Source LLM on our dataset

Downloading all the required files!

In [1]:
!pip install -q accelerate peft bitsandbytes transformers trl

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.3/168.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.1/139.1 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.9/78.9 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [9]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [1]:
import os
import torch
import sentencepiece
from datasets import load_dataset
from transformers import (
    LlamaForCausalLM,
    LlamaTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [2]:
from datasets import load_dataset
dataset = load_dataset("json", data_files="/content/finetuning_dataset_200.json")

In [3]:
# Split the dataset into training and testing sets with an 80:20 ratio
train_test_split = dataset['train'].train_test_split(test_size=0.1)

# Extract the training and testing datasets
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Optional: Check the size of each split
print(f"Training Dataset Size: {len(train_dataset)}")
print(f"Testing Dataset Size: {len(test_dataset)}")

Training Dataset Size: 180
Testing Dataset Size: 20


Setting the model parameters

In [4]:
# The model that you want to train from the Hugging Face hub
model_name = "meta-llama/Llama-2-7b-chat-hf"

# Fine-tuned model name
new_model = "llama-2-7b-finetuned-textgen"
################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule (constant a bit better than cosine)
lr_scheduler_type = "constant"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 25

# Log every X updates steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

In [5]:
# https://www.philschmid.de/instruction-tune-llama-2
# Referenced from the above article, To instruct tune our model, we need to convert our structured examples into a collection of tasks described via instructions.
# We define a formatting_function that takes a sample and returns a string with our format instruction.
def format_instruction(sample):
  text = f"### Instruction: {sample['instruction']}\n### Input: {sample['input']}\n### Response: {sample['output']}"
  return text

In [7]:
# Looking at a random record to check if the appropriate format was obtained.
from random import randrange

print(format_instruction(train_dataset[randrange(len(train_dataset))]))

['#', '#', '#', ' ', 'I', 'n', 's', 't', 'r', 'u', 'c', 't', 'i', 'o', 'n', ':', ' ', 'G', 'e', 'n', 'e', 'r', 'a', 't', 'e', ' ', 'm', 'e', 'a', 'n', 'i', 'n', 'g', 'f', 'u', 'l', ' ', 'a', 'n', 'd', ' ', 'c', 'o', 'h', 'e', 'r', 'e', 'n', 't', ' ', 't', 'e', 'x', 't', ' ', 'b', 'a', 's', 'e', 'd', ' ', 'o', 'n', ' ', 't', 'h', 'e', ' ', 'a', 'v', 'a', 'i', 'l', 'a', 'b', 'l', 'e', ' ', 'd', 'a', 't', 'a', '.', ' ', 'T', 'h', 'e', ' ', 't', 'o', 'n', 'e', ',', ' ', 's', 'e', 'n', 't', 'i', 'm', 'e', 'n', 't', ' ', 'a', 'n', 'd', ' ', 'n', 'u', 'a', 'n', 'c', 'e', 's', ' ', 'o', 'f', ' ', 't', 'h', 'e', ' ', 'i', 'n', 'f', 'o', 'r', 'm', 'a', 't', 'i', 'o', 'n', ' ', 'm', 'u', 's', 't', ' ', 'b', 'e', ' ', 't', 'a', 'k', 'e', 'n', ' ', 'i', 'n', 't', 'o', ' ', 'c', 'o', 'n', 's', 'i', 'd', 'e', 'r', 'a', 't', 'i', 'o', 'n', '\n', '#', '#', '#', ' ', 'I', 'n', 'p', 'u', 't', ':', ' ', 'A', 'u', 't', 'h', 'o', 'r', ':', ' ', 'U', 'n', 'k', 'n', 'o', 'w', 'n', '\n', 'T', 'i', 't', 'l', 'e

We will start the finetuning:

In [5]:
# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = LlamaForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = LlamaTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    peft_config=peft_config,
    # formatting_func=format_instruction,
    dataset_text_field="input",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained(new_model)



Map:   0%|          | 0/180 [00:00<?, ? examples/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 172.00 MiB. GPU 0 has a total capacty of 14.75 GiB of which 109.06 MiB is free. Process 144992 has 14.64 GiB memory in use. Of the allocated memory 13.53 GiB is allocated by PyTorch, and 1009.82 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF