In [36]:
import pandas as pd
from transformers import pipeline
import numpy as np
import os

# Locate the Excel workbook that holds the articles to classify.
file_path = r"C:\Users\admin\Desktop\traini\article.xlsx"
if not os.path.exists(file_path):
    # Everything below needs the workbook; stop here with the real cause
    # instead of continuing and failing later with an unrelated error.
    raise FileNotFoundError(f"File not found: {file_path}")
print(f"File found: {file_path}")

# Load the dataset from Excel.
try:
    data = pd.read_excel(file_path)
except Exception as e:
    # Report the problem, then re-raise: without `data` the rest of the
    # script cannot run, so the error must not be swallowed.
    print(f"Error loading Excel file: {e}")
    raise
else:
    print("Excel file loaded successfully")

# Show the first few rows to confirm the data looks as expected.
print("DataFrame head:")
print(data.head())

# Show the column names so the 'text' / 'article' columns can be verified.
print("DataFrame columns:")
print(data.columns)

# Candidate labels handed to the zero-shot classifier, in a fixed order.
# Each string is passed to the model verbatim, so spelling and casing matter.
Categories = [
    "Equity Financing",
    "Debt Financing",
    "IPO",
    "Bankruptcy/financial distress",
    "financial results",
    "ESG announcement",
    "credit rating",
    "Joint venture",
    "New product launch",
    "Investment",
    "New project announcement",
    "Project update",
    "Contract Announcements",
    "R&D, Patent, copyrights, Trademark updates",
    "Price & Promotion updates",
    "Product recall",
    "Senior executive hiring",
    "Layoffs and downsizing",
    "Currency News",
    "International Trade Updates",
    "Stock market updates",
    "Corporate Legal Affairs",
    "Regulatory News",
    "Merger & Acquisition",
]

# Build the zero-shot classification pipeline once at module level so every
# call to classify_titles reuses the same loaded model.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

def classify_titles(titles):
    """Classify each title against ``Categories`` with the zero-shot model.

    Args:
        titles: Iterable of texts; each one is passed to the module-level
            ``classifier`` together with ``Categories``.

    Returns:
        A list with one ``(title, category, score)`` tuple per input, in
        input order. ``category`` and ``score`` are the first entries of the
        classifier result's ``'labels'`` and ``'scores'`` lists (presumably
        the top-ranked label; confirm against the pipeline documentation).
    """
    def _top_prediction(text):
        # One model call per text; keep only the first label/score pair.
        outcome = classifier(text, Categories)
        return text, outcome['labels'][0], outcome['scores'][0]

    return [_top_prediction(text) for text in titles]

# Build the text that is sent to the classifier for one DataFrame row.
def get_combined_text(row):
    """Return the row's ``'text'`` value formatted as a string.

    Only the ``'text'`` field is used; the article body is not included
    in the returned value.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a ``'text'`` key.

    Returns:
        The ``'text'`` value rendered with default formatting.
    """
    # format(value) with no spec is the same conversion an f-string applies.
    return format(row['text'])

# Run the classification only when both expected columns are present.
if 'text' in data.columns and 'article' in data.columns:
    # Build the classifier input row by row. A plain list (rather than
    # DataFrame.apply with axis=1) also works when the sheet has no rows,
    # where apply would hand back a DataFrame instead of a Series.
    data['combined_text'] = [get_combined_text(row) for _, row in data.iterrows()]

    # Texts to classify
    titles_to_classify = data['combined_text'].tolist()
    classified_titles = classify_titles(titles_to_classify)

    # classify_titles returns one tuple per input, in input order, so pair the
    # results with the titles positionally. Looking titles up by a running
    # counter via .loc is only correct for a default 0..n-1 index.
    for title, (_, category, confidence) in zip(data['text'], classified_titles):
        print(f"Title: '{title}' is classified as '{category}' with confidence score {confidence:.4f}")
else:
    print("Required columns 'text' and/or 'article' not found in the DataFrame.")


File found: C:\Users\admin\Desktop\traini\article.xlsx
Excel file loaded successfully
DataFrame head:
                                                text  \
0  Exclusive: Boeing deliveries to China delayed ...   
1  Not all Fed officials favored tapering of bala...   
2  Target, Walmart shoppers seek home goods, groc...   
3  Exclusive: Boeing deliveries to China delayed ...   
4  Putin approves closure of American Express Rus...   

                                             article  
0  May 22 (Reuters) - Boeing's  plane deliveries ...  
1  NEW YORK, May 22 (Reuters) - Meeting minutes f...  
2  NEW YORK, May 22 (Reuters) - U.S. online spend...  
3  May 22 (Reuters) - Boeing's  plane deliveries ...  
4  MOSCOW, May 22 (Reuters) - Russian President V...  
DataFrame columns:
Index(['text', 'article'], dtype='object')
Title: 'Exclusive: Boeing deliveries to China delayed by state regulator review, source says' is classified as 'Regulatory News' with confidence score 0.2811
Title: 'Not