In [None]:
!pip install feedparser pandas

Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m217.2 kB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6047 sha256=d78b7db2bfd37475450e1be53f7b2a02a5df1cb92792c8077558928022ed5adf
  Stored in directory: /root/.cache/pip/wheels/f0/69/93/a47e9d621be168e9e33c7ce60524393c0b92ae83cf6c6e89c5
Successfully built sgmllib3k
Installing collected packages: sgmllib3k, feedparser
Successfully installed feedparser-6.0.11 sgmllib3k-1.0.0


In [None]:
!pip install transformers



In [None]:
import feedparser
import pandas as pd
from datetime import datetime


In [None]:
rss_feeds = [
    'https://www.theage.com.au/rss/national/victoria.xml',
    'https://www.9news.com.au/rss',
    'https://www.theguardian.com/rss',
    'https://7news.com.au/feed'
]


Parse the RSS Fees

In [None]:
#Define a function to parse each feed and extract relevant information
def parse_rss(feed_url):
    feed = feedparser.parse(feed_url)
    articles = []

    for entry in feed.entries:
        article = {
            'title': entry.get('title', ''),
            'link': entry.get('link', ''),
            'published': entry.get('published', ''),
            'description': entry.get('description', entry.get('summary', '')),  # Extract description if available, otherwise use summary
            'source': feed.feed.get('title', 'Unknown Source')
        }
        articles.append(article)

    return articles


In [None]:
#Loop through each RSS feed, parse it, and collect all articles
all_articles = []

for feed_url in rss_feeds:
    print(f"Parsing feed: {feed_url}")
    articles = parse_rss(feed_url)
    all_articles.extend(articles)

print(f"Total articles fetched: {len(all_articles)}")


Parsing feed: https://www.theage.com.au/rss/national/victoria.xml
Parsing feed: https://www.9news.com.au/rss
Parsing feed: https://www.theguardian.com/rss
Parsing feed: https://7news.com.au/feed
Total articles fetched: 258


In [None]:
#Organize the collected data into a structured format using pandas.

df = pd.DataFrame(all_articles)

# Optional: Convert published date to datetime object
df['published'] = pd.to_datetime(df['published'], errors='coerce')

# Display the first few rows
print(df.head())


                                               title  \
0  Two found dead inside home in Melbourne’s sout...   
1  Second major Victorian hospital set to slash j...   
2  Record number of children restrained, secluded...   
3  ‘Danger to women’: Merri Creek rapist loses bi...   
4  Scammed and stranded in the city – by predator...   

                                                link  \
0  https://www.theage.com.au/national/victoria/tw...   
1  https://www.theage.com.au/national/victoria/se...   
2  https://www.theage.com.au/national/victoria/re...   
3  https://www.theage.com.au/national/victoria/da...   
4  https://www.theage.com.au/national/victoria/sc...   

                   published  \
0  2024-11-28 16:30:09+11:00   
1  2024-11-28 16:11:49+11:00   
2  2024-11-28 14:06:50+11:00   
3  2024-11-28 12:57:15+11:00   
4  2024-11-28 11:30:00+11:00   

                                         description  \
0  A crime scene has been established at a home i...   
1  The Royal Melbourn

  df['published'] = pd.to_datetime(df['published'], errors='coerce')


In [None]:
# Save to CSV
df.to_csv('melbourne_news.csv', index=False)
print("Data saved to melbourne_news.csv")


Data saved to melbourne_news.csv


2. Text Preprocessing

In [None]:
#Loading and Inspecting the Data

import pandas as pd

# Load the saved CSV file
df = pd.read_csv('melbourne_news.csv')

# Inspect the first few rows
print(df.head())


                                               title  \
0  Two found dead inside home in Melbourne’s sout...   
1  Second major Victorian hospital set to slash j...   
2  Record number of children restrained, secluded...   
3  ‘Danger to women’: Merri Creek rapist loses bi...   
4  Scammed and stranded in the city – by predator...   

                                                link  \
0  https://www.theage.com.au/national/victoria/tw...   
1  https://www.theage.com.au/national/victoria/se...   
2  https://www.theage.com.au/national/victoria/re...   
3  https://www.theage.com.au/national/victoria/da...   
4  https://www.theage.com.au/national/victoria/sc...   

                   published  \
0  2024-11-28 16:30:09+11:00   
1  2024-11-28 16:11:49+11:00   
2  2024-11-28 14:06:50+11:00   
3  2024-11-28 12:57:15+11:00   
4  2024-11-28 11:30:00+11:00   

                                         description  \
0  A crime scene has been established at a home i...   
1  The Royal Melbourn

In [None]:
# Check for missing values
print(df.isnull().sum())

#fill missing values
df = df.fillna('Unknown')

title            0
link             0
published      218
description      0
source           0
dtype: int64


In [None]:
# Remove duplicate rows based on the 'title' or 'link'
df = df.drop_duplicates(subset='title', keep='first')

In [None]:
# Selecting only relevant columns
df = df[['title', 'description', 'link']]


In [None]:
import re

# Function to clean the text
def clean_text(text):
    # Remove HTML tags and URLs
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove non-alphanumeric characters and lower-case the text
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text.lower()

# Apply the cleaning function to the description
df['cleaned_description'] = df['description'].apply(clean_text)


In [None]:
df.to_csv('cleaned_news_data.csv', index=False)


Categorizing Data with LLM

In [None]:
from transformers import pipeline

# Load the zero-shot classification model
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define candidate labels for your task
candidate_labels = ['violent crime', 'accident', 'other']

# Classify an example text
example_text = "A robbery occurred in Melbourne last night involving multiple suspects."
result = classifier(example_text, candidate_labels)

# Print the results
print(result)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


{'sequence': 'A robbery occurred in Melbourne last night involving multiple suspects.', 'labels': ['violent crime', 'other', 'accident'], 'scores': [0.9749300479888916, 0.023876328021287918, 0.0011935902293771505]}


In [None]:
df = pd.read_csv('cleaned_news_data.csv')
df['category'] = df['cleaned_description'].apply(lambda x: classifier(x, candidate_labels)['labels'][0])


In [None]:
df.to_csv('categorized_news_data.csv', index=False)

Zero shot Classification using DeBERTa

In [None]:
from transformers import pipeline

# Load the DeBERTa-based classification model
classifier = pipeline("zero-shot-classification", model="microsoft/deberta-v3-large")

# Define candidate labels for your task
candidate_labels = ['violent crime', 'accident', 'other']

# Classify an example text
example_text = "A robbery occurred in Melbourne last night involving multiple suspects."
result = classifier(example_text, candidate_labels)

# Print the results
print(result)


config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'sequence': 'A robbery occurred in Melbourne last night involving multiple suspects.', 'labels': ['accident', 'violent crime', 'other'], 'scores': [0.33353114128112793, 0.3334712088108063, 0.33299770951271057]}


In [None]:
# Apply the classifier to your DataFrame
df = pd.read_csv('cleaned_news_data.csv')
df['category'] = df['cleaned_description'].apply(lambda x: classifier(x, candidate_labels)['labels'][0])

In [None]:
df.to_csv('categorized_news_data_deberta.csv', index=False)

Zero Shot classification using T5 model


In [None]:
from transformers import pipeline

# Load the T5-based classification model (ensure you use a fine-tuned version for classification tasks)
classifier = pipeline("zero-shot-classification", model="valhalla/t5-base-qa-qg-hl")

# Define candidate labels for your task
candidate_labels = ['violent crime', 'accident', 'other']

# Classify an example text
example_text = "A robbery occurred in Melbourne last night involving multiple suspects."
result = classifier(example_text, candidate_labels)

# Print the results
print(result)


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at valhalla/t5-base-qa-qg-hl and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


{'sequence': 'A robbery occurred in Melbourne last night involving multiple suspects.', 'labels': ['other', 'violent crime', 'accident'], 'scores': [0.3464841842651367, 0.3303178548812866, 0.32319799065589905]}


In [None]:
# Apply the classifier to your DataFrame
df = pd.read_csv('cleaned_news_data.csv')
df['category'] = df['cleaned_description'].apply(lambda x: classifier(x, candidate_labels)['labels'][0])

In [None]:
df.to_csv('categorized_news_data_t5.csv', index=False)

Extractiing Crime and Accident Location

In [None]:
!pip install beautifulsoup4 requests



In [None]:
#Scraping news articles from URL with crime or accident news

import requests
from bs4 import BeautifulSoup

# Load the CSV file with the categorized news data
df = pd.read_csv('categorized_news_data.csv')

# Filter rows where category is 'violent crime' or 'accident'
df = df[df['category'].isin(['violent crime', 'accident'])]

# Function to scrape full article from URL
def scrape_full_article(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the main article content (depends on the website structure)
        article_text = " ".join([p.get_text() for p in soup.find_all('p')])
        return article_text
    except Exception as e:
        return None

# Scrape only articles labeled 'crime' or 'accident'
for index, row in df.iterrows():
    if row['category'] in ['violent crime', 'accident']:
        full_article_text = scrape_full_article(row['link'])
        df.at[index, 'full_text'] = full_article_text


In [None]:
#Using NER to extract location of crime and accidents

from transformers import pipeline

# Load pre-trained NER model from Hugging Face
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", aggregation_strategy="simple")

# Function to extract locations using Hugging Face NER
def extract_location_with_bert(text):
    entities = ner_pipeline(text)
    locations = [entity['word'] for entity in entities if entity['entity_group'] == "LOC"]  # LOC represents locations
    return locations

# Create 'locations' column if it doesn't exist
if 'locations' not in df.columns:
    df['locations'] = None

# Apply the NER model to the full text (remove if condition, this condition has already been preapplied)
for index, row in df.iterrows():
    if row['category'] in ['crime', 'accident'] and pd.notna(row['full_text']):
        locations = extract_location_with_bert(row['full_text'])
        df.at[index, 'locations'] = locations



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly iden

In [None]:
df.to_csv('location_news_data.csv', index=False)

In [None]:
#RUNNING NER AGAIN USING SPACY THIS TIME
import pandas as pd

# Load the CSV file with the loacions news data
df = pd.read_csv('location_news_data.csv')

# Filter rows where category is 'violent crime' or 'accident'
filtered_df = df[df['category'].isin(['violent crime', 'accident'])]

# Save the result to a new CSV file (optional)
filtered_df.to_csv("filtered_data.csv", index=False)

filtered_df.head()

Unnamed: 0,title,description,link,cleaned_description,category,full_text,locations
2,"CBD crime investigation: The chief justice, th...",When the University of Melbourne Law School we...,https://www.theage.com.au/national/victoria/cb...,when the university of melbourne law school we...,accident,"We’re sorry, this feature is currently unavail...",[]
5,‘I’m broken’: Nicola Gobbo tells court how Law...,"A gangland-hit scare, police threats to remove...",https://www.theage.com.au/national/victoria/im...,a ganglandhit scare police threats to remove h...,violent crime,"We’re sorry, this feature is currently unavail...",
7,Nervous wait for third falcon chick at Melbour...,A Collins Street skyscraper has welcomed some ...,https://www.theage.com.au/national/victoria/jo...,a collins street skyscraper has welcomed some ...,accident,"We’re sorry, this feature is currently unavail...","['Collins Street', 'Collins Street', 'Melbourn..."
8,Man’s body found in car park of popular Melbou...,A crime scene was set up at a Westfield Southl...,https://www.theage.com.au/national/victoria/ma...,a crime scene was set up at a westfield southl...,violent crime,"We’re sorry, this feature is currently unavail...",
14,Missing 11-year-old boy found ‘safe and well’ ...,"Police say Toby, who has autism, disappeared a...",https://www.theage.com.au/national/victoria/se...,police say toby who has autism disappeared aro...,accident,"We’re sorry, this feature is currently unavail...","['Anglesea', '##nee', 'Great Ocean Road', 'Ang..."


In [None]:
import spacy
import pandas as pd

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Load the CSV file with the loacions news data
df = pd.read_csv('location_news_data.csv')

# Function to extract locations
def extract_locations(text):
    if pd.isna(text):  # Handle missing values
        return ""
    doc = nlp(text)
    locations = [ent.text for ent in doc.ents if ent.label_ in ("GPE", "LOC")]
    return ", ".join(locations)

# Apply the function to the "full_text" column
df["locations"] = df["full_text"].apply(extract_locations)

# Save the updated DataFrame
df.to_csv("updated_dataset.csv", index=False)


Open assistant continuously crashing

In [None]:
#define locations using OpenAssistant/oasst-sft-1-pythia-12b

from transformers import pipeline

# Load OpenAssistant model for text generation (dialogue/refinement)
oasst_model = pipeline("text-generation", model="OpenAssistant/oasst-sft-1-pythia-12b")

# Define a function to refine extracted locations
def refine_locations(ner_locations, full_text):
    if not ner_locations:
        return None

    # Prepare the input prompt for OpenAssistant
    prompt = f"In the following article, the locations {ner_locations} were mentioned. Which location is most likely the site of the crime or accident? Here is the article text:\n\n{full_text}"

    # Get refined location from the OpenAssistant model
    response = oasst_model(prompt, max_length=200, do_sample=False)[0]['generated_text']

    # Extract the location mentioned in the response (we assume it returns one or two relevant locations)
    refined_location = response.strip()
    return refined_location

# Apply the refinement to the DataFrame
df['refined_location'] = df.apply(lambda row: refine_locations(row['locations'], row['full_text']), axis=1)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Locating on Map

In [None]:
!pip install geopandas folium geopy



In [None]:
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import folium

# Load the dataset
df = pd.read_csv("updated_dataset.csv")

# Initialize geolocator
geolocator = Nominatim(user_agent="location_mapper")

# Define the bounding box for Melbourne and outer suburbs (lat_min, lat_max, lon_min, lon_max)
melbourne_bounds = {
    "lat_min": -38.4339,  # Southernmost point (e.g., Mornington Peninsula)
    "lat_max": -37.4000,  # Northernmost point (e.g., Craigieburn)
    "lon_min": 144.3331,  # Westernmost point (e.g., Werribee)
    "lon_max": 145.8788   # Easternmost point (e.g., Lilydale)
}

# Define a function to check if a point lies within the bounding box
def is_within_bounds(lat, lon, bounds):
    return (bounds["lat_min"] <= lat <= bounds["lat_max"]) and (bounds["lon_min"] <= lon <= bounds["lon_max"])

# Extend the get_coordinates function
def get_coordinates(location):
    try:
        if pd.isna(location):  # Handle missing values
            return None, None
        # Geocode the location, appending "Victoria, Australia" for broader search
        location = geolocator.geocode(location + ", Victoria, Australia")
        if location:
            lat, lon = location.latitude, location.longitude
            # Check if the coordinates are within Melbourne bounds
            if is_within_bounds(lat, lon, melbourne_bounds):
                return lat, lon
        return None, None
    except GeocoderTimedOut:
        return None, None

# Apply the function to the 'locations' column
df[['latitude', 'longitude']] = df['locations'].apply(lambda loc: pd.Series(get_coordinates(loc)))

# Filter rows with valid coordinates
df = df.dropna(subset=['latitude', 'longitude'])

# Create a map centered on Melbourne
melbourne_map = folium.Map(location=[-37.8136, 144.9631], zoom_start=10)

# Add points to the map
for _, row in df.iterrows():
    folium.Marker(location=[row['latitude'], row['longitude']],
                  popup=row['locations']).add_to(melbourne_map)

# Save the map as an HTML file
melbourne_map.save("melbourne_outer_suburbs_map.html")

# Display the map (optional, requires a Jupyter environment)
melbourne_map




In [None]:
df.head()

Unnamed: 0,title,description,link,cleaned_description,category,full_text,locations
0,Second major Victorian hospital set to slash j...,The Royal Melbourne Hospital has joined Alfred...,https://www.theage.com.au/national/victoria/se...,the royal melbourne hospital has joined alfred...,other,,
1,Homicide squad at home in Melbourne’s south-ea...,A crime scene has been established at a home i...,https://www.theage.com.au/national/victoria/tw...,a crime scene has been established at a home i...,violent crime,"We’re sorry, this feature is currently unavail...",
2,"Record number of children restrained, secluded...",Almost 2700 students were restrained or seclud...,https://www.theage.com.au/national/victoria/re...,almost 2700 students were restrained or seclud...,other,,
3,‘Danger to women’: Merri Creek rapist loses bi...,Joel Russo is serving a 20-year jail term for ...,https://www.theage.com.au/national/victoria/da...,joel russo is serving a 20year jail term for a...,violent crime,"We’re sorry, this feature is currently unavail...",
4,Scammed and stranded in the city – by predator...,A night out at a concert; time to catch a cab ...,https://www.theage.com.au/national/victoria/sc...,a night out at a concert time to catch a cab h...,other,,


Trying classification with clustering techniques

In [None]:
!pip install sentence_transformers

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Load the CSV file with the cleaned and normalized text
df = pd.read_csv('cleaned_news_data.csv')

# Initialize sentence-transformer for embedding text data
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Convert news descriptions into embeddings
embeddings = model.encode(df['normalized_description'].tolist())

# Perform K-Means clustering
num_clusters = 5  # You can change the number of clusters based on your preference
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
df['cluster'] = kmeans.fit_predict(embeddings)

# Save the results to a new CSV file
df.to_csv('clustered_news.csv', index=False)
print("News articles clustered and saved to 'clustered_news.csv'")

# Optional: Visualize the clusters using PCA (for dimensionality reduction)
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(embeddings)

plt.figure(figsize=(10, 7))
plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=df['cluster'], cmap='viridis')
plt.title('Clusters of News Articles')
plt.show()


Elbow Method for Finding Optimal Clusters:

In [None]:
from sklearn.metrics import silhouette_score

def find_optimal_clusters(embeddings, max_k):
    iters = range(2, max_k+1)
    sse = []
    silhouette_scores = []

    for k in iters:
        kmeans = KMeans(n_clusters=k, random_state=0)
        kmeans.fit(embeddings)
        sse.append(kmeans.inertia_)  # Sum of squared distances
        silhouette_scores.append(silhouette_score(embeddings, kmeans.labels_))

    plt.figure(figsize=(10, 7))
    plt.plot(iters, sse, marker='o')
    plt.title('Elbow Method for Optimal Clusters')
    plt.xlabel('Number of clusters')
    plt.ylabel('SSE')
    plt.show()

    plt.figure(figsize=(10, 7))
    plt.plot(iters, silhouette_scores, marker='o')
    plt.title('Silhouette Score for Optimal Clusters')
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette Score')
    plt.show()

# Run the Elbow Method to find the optimal number of clusters
find_optimal_clusters(embeddings, 10)


One Shot Classification Using BERT

In [None]:
!pip install openai

In [None]:
import openai
import pandas as pd

# Load your CSV file with news descriptions
df = pd.read_csv('cleaned_news_data.csv')

# Define your categories for classification
categories = ['Crime', 'Government', 'Business', 'violence', 'Science', 'Culture']

# Initialize OpenAI API
openai.api_key = ""

# Function to generate GPT-3 prompt for zero-shot classification
def classify_article_with_gpt3(article, categories):
    prompt = f"Classify the following news article into one of these categories: {', '.join(categories)}.\n\nArticle: {article}\n\nCategory:"

    response = openai.Completion.create(
        engine="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        prompt=prompt,
        max_tokens=10,
        n=1,
        stop=None,
        temperature=0  # Lower temperature for deterministic output
    )

    category = response.choices[0].text.strip()
    return category

# Apply classification on each article
df['gpt3_category'] = df['normalized_description'].apply(lambda x: classify_article_with_gpt3(x, categories))

# Save the results to CSV
df.to_csv('gpt3_classified_news.csv', index=False)
print("GPT-3 classification complete. Results saved to 'gpt3_classified_news.csv'")
