In [1]:
import pandas as pd
import numpy as np
import copy
import warnings
import plotly
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn import metrics
from sklearn.model_selection import train_test_split
from PIL import Image 
import random
warnings.filterwarnings("ignore")
from tqdm import tqdm
import string

import torch
from torch import nn
from transformers import AutoTokenizer
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import AutoModel, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding

In [2]:
news = pd.read_csv('/kaggle/input/mind-news-dataset/news.tsv/news.tsv',header=None,sep='\t')

In [3]:
news.columns=['News ID',
"Category",
"SubCategory",
"Title",
"Abstract",
"URL",
"Title Entities",
"Abstract Entities"]

In [4]:
news = news.iloc[:,:5]

In [5]:
print('the number of articles before processing :',len(news))
news.drop_duplicates(subset=['Title'],inplace=True)
print('The number of articles after processing :',len(news))

the number of articles before processing : 51282
The number of articles after processing : 50434


In [6]:
print('the number of articles before processing :',len(news))
news = news.dropna(subset=['Abstract'])
print('The number of articles after processing :',len(news))

the number of articles before processing : 50434
The number of articles after processing : 47788


In [7]:
bert_uncased_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [8]:
class BertBaseDataset(Dataset):
    def __init__(self, tokenizer, df, text_col, max_len):
        self.tokenizer = tokenizer
        self.texts = df[text_col].values.tolist()
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        res = self.tokenizer(
            self.texts[idx],
            return_tensors="pt",
            padding=False,
            truncation=True,
            # return_special_tokens_mask=True,
            max_length=self.max_len,
        )
        res = {k:v[0] for k,v in res.items()}
        return res
    
def dynamic_pad_data_loader(tokenizer, dataset, batch_size, shuffle):
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=data_collator
    )

In [9]:
backbone = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(backbone)

ds = BertBaseDataset(tokenizer, news, "Abstract", None)
dl = dynamic_pad_data_loader(tokenizer, ds, batch_size=32, shuffle=False)

In [10]:
model = AutoModelForSequenceClassification.from_pretrained(backbone)
model.eval()

preds = []

with torch.no_grad(), tqdm(total=len(dl)) as pbar:
    for batch in dl:
        output = model(**batch)
        preds.append(output["logits"])
        pbar.update(1)

# Concatenate the logits from all batches
preds = torch.cat(preds, dim=0)

print("Shape of all logits:", preds.shape)

  0%|          | 0/1494 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 1494/1494 [2:31:45<00:00,  6.09s/it]  

Shape of all logits: torch.Size([47788, 5])





In [11]:
preds = preds.softmax(1).numpy()
preds_df = pd.DataFrame(preds, columns=[f"{i+1} stars" for i in range(5)])
preds_df["predicted stars"] = preds.argmax(axis=1) + 1
preds_df.insert(0, "Title", news['Title'])
preds_df.insert(0, "News ID", news['News ID'])
preds_df.head()

Unnamed: 0,News ID,Title,1 stars,2 stars,3 stars,4 stars,5 stars,predicted stars
0,N55528,"The Brands Queen Elizabeth, Prince Charles, an...",0.016579,0.015324,0.043104,0.248582,0.676411,5
1,N19639,50 Worst Habits For Belly Fat,0.234216,0.149065,0.099885,0.168455,0.348379,5
2,N61837,The Cost of Trump's Aid Freeze in the Trenches...,0.360419,0.288905,0.184041,0.113998,0.052637,1
3,N53526,I Was An NBA Wife. Here's How It Affected My M...,0.505623,0.407874,0.077338,0.006484,0.002682,1
4,N38324,"How to Get Rid of Skin Tags, According to a De...",0.051393,0.126471,0.300813,0.408346,0.112976,4


In [12]:
csv_path = "/kaggle/working/preds_abstract.csv"

# Save the DataFrame to a CSV file
preds_df.to_csv(csv_path, index=False)

In [14]:
merged = pd.merge(preds_df, news[["News ID", "Category", "SubCategory", "Abstract"]], on="News ID", how="inner")

csv_path = "/kaggle/working/merged_abstract.csv"
merged.to_csv(csv_path, index=False)

merged.head()

Unnamed: 0,News ID,Title,1 stars,2 stars,3 stars,4 stars,5 stars,predicted stars,Category,SubCategory,Abstract
0,N55528,"The Brands Queen Elizabeth, Prince Charles, an...",0.016579,0.015324,0.043104,0.248582,0.676411,5,lifestyle,lifestyleroyals,"Shop the notebooks, jackets, and more that the..."
1,N19639,50 Worst Habits For Belly Fat,0.234216,0.149065,0.099885,0.168455,0.348379,5,health,weightloss,These seemingly harmless habits are holding yo...
2,N61837,The Cost of Trump's Aid Freeze in the Trenches...,0.360419,0.288905,0.184041,0.113998,0.052637,1,news,newsworld,Lt. Ivan Molchanets peeked over a parapet of s...
3,N53526,I Was An NBA Wife. Here's How It Affected My M...,0.505623,0.407874,0.077338,0.006484,0.002682,1,health,voices,"I felt like I was a fraud, and being an NBA wi..."
4,N38324,"How to Get Rid of Skin Tags, According to a De...",0.051393,0.126471,0.300813,0.408346,0.112976,4,health,medical,"They seem harmless, but there's a very good re..."


In [15]:
for stars in range(1, 6):
    total_count = len(merged[merged['predicted stars'] == stars])
    top_categories = merged[merged['predicted stars'] == stars]['Category'].value_counts().nlargest(10)
    top_categories_percentage = (top_categories / total_count) * 100
    print(f'Top 10 categories for {stars} stars (percentage of total count):')
    print(top_categories_percentage)
    print('\n')

Top 10 categories for 1 stars (percentage of total count):
Category
news            32.352279
sports          25.835273
finance          6.486973
foodanddrink     5.075456
travel           4.542383
lifestyle        4.504843
video            4.144455
weather          3.904197
health           3.874165
autos            3.341092
Name: count, dtype: float64


Top 10 categories for 2 stars (percentage of total count):
Category
news            30.693690
sports          26.201421
finance          6.748851
foodanddrink     5.516089
lifestyle        5.181780
travel           4.617635
video            4.596740
weather          3.740075
autos            3.447555
health           3.426661
Name: count, dtype: float64


Top 10 categories for 3 stars (percentage of total count):
Category
news            30.040552
sports          26.747770
finance          6.715328
foodanddrink     5.806975
lifestyle        4.817518
travel           4.412003
health           4.120032
video            4.087591
weather 

In [16]:
news_df = merged[merged['Category'] == 'news']

# Print top 10 SubCategory values as a percentage for each predicted stars value
for stars in range(1, 6):
    total_count = len(news_df[news_df['predicted stars'] == stars])
    top_subcategories = news_df[news_df['predicted stars'] == stars]['SubCategory'].value_counts().nlargest(10)
    top_subcategories_percentage = (top_subcategories / total_count) * 100
    print(f'Top 10 SubCategories for {stars} stars (percentage of total count):')
    print(top_subcategories_percentage)
    print('\n')

Top 10 SubCategories for 1 stars (percentage of total count):
SubCategory
newsus                      41.610583
newspolitics                17.475052
newscrime                   14.504525
newsworld                   10.860989
newsscienceandtechnology     7.774426
newsoffbeat                  2.274310
newsopinion                  2.135066
newsgoodnews                 1.346020
elections-2020-us            0.696217
newsbusiness                 0.348109
Name: count, dtype: float64


Top 10 SubCategories for 2 stars (percentage of total count):
SubCategory
newsus                      38.733833
newspolitics                18.924438
newscrime                   13.955071
newsworld                   12.593601
newsscienceandtechnology     8.304969
newsoffbeat                  2.995235
newsopinion                  1.633764
newsgoodnews                 1.361470
elections-2020-us            0.612662
factcheck                    0.340368
Name: count, dtype: float64


Top 10 SubCategories for 3 stars