In [2]:
import os
import sys
from pathlib import Path
from transformers import (
    AutoModelForSequenceClassification,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    pipeline
)
import torch

# Pick the compute device: the GPU when torch reports CUDA support, else the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Directories holding the locally stored models (relative to the working
# directory; adjust base_path as needed).
base_path = Path("hf_models")
sentiment_path = base_path.joinpath("sentiment")
summarization_path = base_path.joinpath("summarization")

# Report the resolved base directory. The two lines after it print a boolean
# (whether each model directory exists), not the directory path itself.
print(f"Looking for models in: {base_path.absolute()}")
print(f"Sentiment model path: {sentiment_path.exists()}")
print(f"Summarization model path: {summarization_path.exists()}")

Using device: cpu
Looking for models in: /workspace/hf_models
Sentiment model path: True
Summarization model path: True


In [8]:
# Load Sentiment Analysis Model
print("Loading sentiment analysis model...")

# Step 1: load the classifier weights and the matching tokenizer from the
# local model directory. local_files_only=True restricts loading to files
# already on disk.
sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_path, local_files_only=True)
sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_path, local_files_only=True)

# Step 2: wrap the objects loaded above in a pipeline for ease of use.
# The pipeline gets device index 0 when CUDA was selected and -1 otherwise.
sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    model=sentiment_model,
    tokenizer=sentiment_tokenizer,
    device=-1 if device != "cuda" else 0,
)

print("✅ Sentiment analysis model loaded successfully!")

Loading sentiment analysis model...
✅ Sentiment analysis model loaded successfully!


In [9]:
# Load Summarization Model
print("Loading summarization model...")

# Load the seq2seq weights and the matching tokenizer from the local model
# directory; local_files_only=True restricts loading to files already on disk.
summarization_model = AutoModelForSeq2SeqLM.from_pretrained(summarization_path, local_files_only=True)
summarization_tokenizer = AutoTokenizer.from_pretrained(summarization_path, local_files_only=True)

# Build the summarization pipeline around the objects loaded above.
# The pipeline gets device index 0 when CUDA was selected and -1 otherwise.
summarization_pipeline = pipeline(
    task="summarization",
    model=summarization_model,
    tokenizer=summarization_tokenizer,
    device=-1 if device != "cuda" else 0,
)

print("✅ Summarization model loaded successfully!")

Loading summarization model...
✅ Summarization model loaded successfully!


In [4]:
import pandas as pd

# Read the articles CSV into a DataFrame, then print its column names,
# non-null counts and dtypes.
csv_file = "channelnewsasia_com.csv"
df = pd.read_csv(filepath_or_buffer=csv_file)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1806 entries, 0 to 1805
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   url          1806 non-null   object
 1   title        1806 non-null   object
 2   language     1806 non-null   object
 3   domain       1806 non-null   object
 4   warc_file    1806 non-null   object
 5   scrape_date  1806 non-null   object
 6   content      1806 non-null   object
dtypes: object(7)
memory usage: 98.9+ KB


In [5]:
# Preview the first five rows of the loaded articles.
df.head(n=5)

Unnamed: 0,url,title,language,domain,warc_file,scrape_date,content
0,https://www.channelnewsasia.com/3-ktv-operator...,3 KTV operators under investigation for allege...,en,channelnewsasia_com,CC-MAIN-20210925052020-20210925082020-00073.wa...,2021-09-25T06:02:24Z,SINGAPORE: Three KTV operators are being inves...
1,https://www.channelnewsasia.com/access/ferrari...,Why Ferrari chose to equip the F8 Tributo with...,en,channelnewsasia_com,CC-MAIN-20210919095911-20210919125911-00038.wa...,2021-09-19T10:58:33Z,Ferrari’s most popular model line has been its...
2,https://www.channelnewsasia.com/advertorial/ko...,Kopar at Newton: A launchpad to prime spots in...,en,channelnewsasia_com,CC-MAIN-20210920191528-20210920221528-00717.wa...,2021-09-20T21:33:54Z,City living comes with a touch of glamour and ...
3,https://www.channelnewsasia.com/asia/13-chines...,13 Chinese tourists killed as bus plunges into...,en,channelnewsasia_com,CC-MAIN-20210923195546-20210923225546-00343.wa...,2021-09-23T20:53:42Z,BANGKOK: At least 13 Chinese tourists were kil...
4,https://www.channelnewsasia.com/asia/17-arrest...,17 arrested over violence at Hindu temple in M...,en,channelnewsasia_com,CC-MAIN-20210923013955-20210923043955-00655.wa...,2021-09-23T03:20:50Z,"SHAH ALAM: Seventeen men, all Malaysian, have ..."


In [6]:
from tqdm import tqdm

def process_text_batch(texts, batch_size=8):
    """Summarize texts chunk by chunk, then classify the sentiment of each summary.

    The inputs are split into consecutive chunks of ``batch_size`` items. Each
    chunk is passed to the module-level ``summarization_pipeline`` and the
    resulting summaries are passed to the module-level ``sentiment_pipeline``.

    Both returned lists always hold exactly one entry per input text, in input
    order, so they can be placed side by side in a DataFrame:

    * If summarization fails for a chunk, every text in that chunk gets the
      summary ``"Error generating summary"`` and the sentiment
      ``{"label": "UNKNOWN", "score": 0.0}``.
    * If summarization succeeds but sentiment analysis fails, the generated
      summaries are kept and only the sentiments are placeholders.

    Args:
        texts: Iterable of strings to process (list, tuple, Series, ...).
        batch_size: Number of texts handed to the pipelines per call; must be
            a positive integer.

    Returns:
        A ``(summaries, sentiments)`` tuple: a list of summary strings and a
        list of sentiment results (the placeholder entries are dicts with
        ``label`` and ``score`` keys).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently return nothing;
    # zero already raised ValueError inside range(). Reject both up front.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Materialize once so any iterable can be measured and sliced positionally.
    texts = list(texts)

    summaries = []
    sentiments = []

    # NOTE(review): no batch_size argument is forwarded to the pipelines, so
    # whether they batch internally depends on their own defaults - confirm.
    for start in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
        batch_texts = texts[start:start + batch_size]
        batch_number = start // batch_size + 1

        # Start from placeholders; each stage overwrites its own result only
        # when it completes, so a failure can never leave the lists misaligned.
        batch_summary_texts = ["Error generating summary"] * len(batch_texts)
        batch_sentiments = [{"label": "UNKNOWN", "score": 0.0} for _ in batch_texts]

        try:
            batch_summaries = summarization_pipeline(
                batch_texts,
                max_length=150,
                min_length=50,
                do_sample=False,
                truncation=True
            )
            batch_summary_texts = [s['summary_text'] for s in batch_summaries]
        except Exception as e:
            print(f"Error processing batch {batch_number}: {e}")
        else:
            # Sentiment is only computed for real summaries, never for the
            # error placeholder text.
            try:
                batch_sentiments = sentiment_pipeline(batch_summary_texts)
            except Exception as e:
                print(f"Error processing batch {batch_number}: {e}")

        # Extend both outputs exactly once per chunk.
        summaries.extend(batch_summary_texts)
        sentiments.extend(batch_sentiments)

    return summaries, sentiments

In [10]:
# Column of df that holds the article body.
text_column = 'content'
print(f"Processing {len(df)} articles...")

# Replace missing bodies with empty strings and convert to a plain list.
article_bodies = df[text_column].fillna("")
texts = article_bodies.tolist()

# Only a small leading sample (at most 10 articles) is run here as a test.
test_size = min(10, len(texts))
print(f"Processing first {test_size} articles for testing...")

sample_texts = texts[:test_size]
summaries, sentiments = process_text_batch(sample_texts)

# Split each sentiment result into its label and score columns.
sentiment_labels = []
sentiment_scores = []
for sentiment in sentiments:
    sentiment_labels.append(sentiment['label'])
    sentiment_scores.append(sentiment['score'])

# Create new DataFrame with one row per processed article.
results_df = pd.DataFrame({
    'original_text': sample_texts,
    'summary': summaries,
    'sentiment_label': sentiment_labels,
    'sentiment_score': sentiment_scores,
})

print("✅ Processing complete!")
print(f"Results DataFrame shape: {results_df.shape}")
print("\nFirst few results:")
print(results_df.head())

Processing 1806 articles...
Processing first 10 articles for testing...


Processing batches:   0%|          | 0/2 [00:00<?, ?it/s]Your max_length is set to 150, but your input_length is only 116. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
Processing batches: 100%|██████████| 2/2 [01:04<00:00, 32.31s/it]

✅ Processing complete!
Results DataFrame shape: (10, 4)

First few results:
                                       original_text  \
0  SINGAPORE: Three KTV operators are being inves...   
1  Ferrari’s most popular model line has been its...   
2  City living comes with a touch of glamour and ...   
3  BANGKOK: At least 13 Chinese tourists were kil...   
4  SHAH ALAM: Seventeen men, all Malaysian, have ...   

                                             summary sentiment_label  \
0  Three KTV operators are being investigated aft...        NEGATIVE   
1  Ferrari's most popular model line has been its...        POSITIVE   
2  City living comes with a touch of glamour and ...        POSITIVE   
3  At least 13 Chinese tourists were killed and d...        NEGATIVE   
4  Seventeen men, all Malaysian, arrested in conn...        NEGATIVE   

   sentiment_score  
0         0.997454  
1         0.998887  
2         0.999711  
3         0.996809  
4         0.987689  



