# Multinominal NB News Article Classifier with NLP

Training an ML model to categorize News Articles in entertainment, business, and tech.

In [68]:
# import modules
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import nltk

Dataset: Preproccessed Text from BBC news ready for ML.
https://www.kaggle.com/datasets/dheemanthbhat/bbc-full-text-preprocessed?select=docs_stage_3_preprocessed.csv

1. Reading data into a df.

In [69]:
df = pd.read_csv("news.csv")
display(df[:5])

Unnamed: 0,DocId,DocTextlen,DocText,ADJ,ADP,ADV,AUX,CCONJ,DET,NOUN,...,PUNCT,SCONJ,SYM,VERB,X,INTJ,DocType,FileSize,FilePath,DocCat
0,B_001,2553,ad sale boost time_warner profit quarterly pro...,31,61,15.0,15.0,13.0,28,114,...,55,3.0,9.0,53,0.0,0.0,Business,2560,../input/bbc-full-text-document-classification...,0
1,B_002,2248,dollar gain greenspan speech dollar hit high l...,33,54,15.0,21.0,9.0,44,99,...,43,5.0,2.0,43,0.0,0.0,Business,2252,../input/bbc-full-text-document-classification...,0
2,B_003,1547,yukos unit buyer face loan claim owner embattl...,11,32,3.0,15.0,4.0,25,71,...,26,3.0,4.0,42,0.0,0.0,Business,1552,../input/bbc-full-text-document-classification...,0
3,B_004,2395,high fuel price hit ba profit british_airways ...,36,53,16.0,17.0,8.0,26,114,...,62,8.0,10.0,45,0.0,0.0,Business,2412,../input/bbc-full-text-document-classification...,0
4,B_005,1565,pernod takeover talk lift domecq share uk drin...,15,32,5.0,13.0,8.0,14,68,...,35,5.0,3.0,26,0.0,0.0,Business,1570,../input/bbc-full-text-document-classification...,0


2. Data cleaning

In [70]:
print(df.columns.values)

['DocId' 'DocTextlen' 'DocText' 'ADJ' 'ADP' 'ADV' 'AUX' 'CCONJ' 'DET'
 'NOUN' 'NUM' 'PART' 'PRON' 'PROPN' 'PUNCT' 'SCONJ' 'SYM' 'VERB' 'X'
 'INTJ' 'DocType' 'FileSize' 'FilePath' 'DocCat']


The columns to keep are:
- DocText: contains the news articles
- DocType: categories of the news articles, as strings
- DocCat: categories of the news articles, as numbers

In [71]:
df.drop(columns=['DocId', 'DocTextlen', 'ADJ','ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X', 'INTJ', 'FileSize', 'FilePath'], inplace=True),'\n'
display(df[:5])

Unnamed: 0,DocText,DocType,DocCat
0,ad sale boost time_warner profit quarterly pro...,Business,0
1,dollar gain greenspan speech dollar hit high l...,Business,0
2,yukos unit buyer face loan claim owner embattl...,Business,0
3,high fuel price hit ba profit british_airways ...,Business,0
4,pernod takeover talk lift domecq share uk drin...,Business,0


In [72]:
df = df.rename(columns={"DocText":"text", "DocType":"type", "DocCat":"cat"})
display(df[:5])

Unnamed: 0,text,type,cat
0,ad sale boost time_warner profit quarterly pro...,Business,0
1,dollar gain greenspan speech dollar hit high l...,Business,0
2,yukos unit buyer face loan claim owner embattl...,Business,0
3,high fuel price hit ba profit british_airways ...,Business,0
4,pernod takeover talk lift domecq share uk drin...,Business,0


2e. Remove any NaN

In [73]:
df=df.dropna()
display(df[:5])

Unnamed: 0,text,type,cat
0,ad sale boost time_warner profit quarterly pro...,Business,0
1,dollar gain greenspan speech dollar hit high l...,Business,0
2,yukos unit buyer face loan claim owner embattl...,Business,0
3,high fuel price hit ba profit british_airways ...,Business,0
4,pernod takeover talk lift domecq share uk drin...,Business,0


Show count and type of each DocType.

In [74]:
print(df.groupby(['type']).size())
print(df.groupby(['cat']).size())

type
Business         510
Entertainment    381
Politics         413
Sport            506
Tech             395
dtype: int64
cat
0    510
1    381
2    413
3    506
4    395
dtype: int64


Check DocType and DocCat Correspond.

In [75]:
print(((df['type'] == 'Business') & (df['cat'] == 0)).any())
print(((df['type'] == 'Entertainment') & (df['cat'] == 1)).any())
print(((df['type'] == 'Politics') & (df['cat'] == 2)).any())
print(((df['type'] == 'Sport') & (df['cat'] == 3)).any())
print(((df['type'] == 'Tech') & (df['cat'] == 4)).any())

True
True
True
True
True


Create a lookup table using cat and type.

In [76]:
d = dict(zip(df['cat'],df["type"]))
print(d)

{0: 'Business', 1: 'Entertainment', 2: 'Politics', 3: 'Sport', 4: 'Tech'}


4. Preparing data for ML


In [77]:
df.drop(columns=['type'], inplace=True),'\n'
display(df[:5])

Unnamed: 0,text,cat
0,ad sale boost time_warner profit quarterly pro...,0
1,dollar gain greenspan speech dollar hit high l...,0
2,yukos unit buyer face loan claim owner embattl...,0
3,high fuel price hit ba profit british_airways ...,0
4,pernod takeover talk lift domecq share uk drin...,0


4b. __Create the X and y datasets__

In [78]:
X=df.text
y=df.cat
print(X.shape)
print(y.shape)

(2205,)
(2205,)


Inspect the dataset for any further preprocessing.

In [79]:
print(df.text[0])

ad sale boost time_warner profit quarterly profit media giant timewarner jump 76 $ 1.13bn £ 600 m month december $ 639 m year early firm big investor google benefit sale high speed internet connection high advert sale timewarner say fourth quarter sale rise 2 $ 11.1bn $ 10.9bn profit buoy gain offset profit dip warner_bros user aol time_warner say friday own 8 search engine google internet business aol mixed fortune lose 464,000 subscriber fourth quarter profit low precede quarter company say aol underlie profit exceptional item rise 8 strong internet advertising revenue hope increase subscriber offer online service free timewarner internet customer try sign aol exist customer high speed broadband timewarner restate 2000 2003 result follow probe the_us_securities_exchange_commission sec close conclude time_warner's fourth quarter profit slightly well analyst expectation film division see profit slump 27 $ 284 m help box office flop alexander catwoman sharp contrast year early final fil

4e. Convert the preprocessed data to numbers so it's ready for the ML model.

In [80]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
vect.fit(X)
X = vect.transform(X)
X.shape

(2205, 28975)

Create X and Y Training and Testing Datasets.

In [81]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1653, 28975) (552, 28975) (1653,) (552,)


5b. Train and test the ML model and  print  accuracy measurements.

In [82]:
classifier = MultinomialNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
# accuracy
print(metrics.accuracy_score(y_test, y_pred))
metrics.confusion_matrix(y_test, y_pred, labels=[0,1,2,3,4])

0.9855072463768116


array([[122,   0,   3,   0,   1],
       [  2,  84,   1,   0,   0],
       [  0,   0, 105,   0,   0],
       [  1,   0,   0, 119,   0],
       [  0,   0,   0,   0, 114]])

In [83]:
print(d)

{0: 'Business', 1: 'Entertainment', 2: 'Politics', 3: 'Sport', 4: 'Tech'}


From the [BBC News](https://www.bbc.com/news) website, select 3 different articles w/ entertainment, business, and tech, copy/paste a few body paragraphs into each variable for testing data.

In [84]:
entertainment = "The film's production company Paramount Pictures said the injuries were non life-threatening and happened while shooting a planned stunt sequence. The crew members were all in stable condition and continue to receive treatment, the statement said. Earlier this week, the Sun reported there had been an explosion and six people went to hospital. It was terrifying - a huge ball of fire flew up and caught several crew members in its path. In years of filming I've never seen an accident so scary, a source told the newspaper. Everyone involved, from the lowliest runners to the star names, has been shaken up by this, they added. In a statement, a Paramount Pictures spokesperson said: The safety and full medical services teams on-site were able to act quickly so that those who were impacted immediately received necessary care. They said it has strict health and safety procedures in place on all our productions and would take all necessary precautions as we resume production. According to Variety, no cast members were injured but six people received treatment for burn injuries and four remain in hospital. Sir Ridley Scott, who directed the original 2000 historical drama film, is returning to direct the second instalment, which is scheduled to be released in November 2024. No title has yet been announced for the sequel, which stars Normal People actor Paul Mescal, Denzel Washington and Connie Nielson. The original film won five Oscars, including best actor for Russell Crowe, who played Roman general Maximus Decimus Meridius alongside Joaquin Phoenix as Emperor Commodus. The movie, set during the height of the Roman Empire, sees Maximus start out as a war hero before before being forced to become a gladiator. Gladiator made $457m (£355m) at the box office and revived the historical epic drama genre, which had been out of fashion for decades."

In [85]:
business="Laura Lane, UPS's chief corporate affairs and sustainability officer, enters the BBC's Executive Lounge to discuss how the massive logistics company isn't waiting for the perfect electric lorry. While your online purchases are on the road, Laura Lane is keeping on an eye on the impact of the journey. As UPS's chief corporate affairs and sustainability officer, Lane is working to trim the emissions of the logistics firm, which is responsible for delivering 22 million packages every day across more than 200 countries and territories. Most of these deliveries are driven on a fleet of 125,000 package cars, vans, lorries and motorcycles, or flown on a fleet of around 500 leased, owned and chartered aircraft. The combined emissions from all these vehicles add up. According to UPS data seen by the BBC, its air and ground operations produced a total of 14 million tonnes of CO2 or equivalent emissions in 2023. Lane isn't content to simply wait for the perfect solution, be that an emissions-free aircraft fuel or a fully battery-electric lorry. She's figuring out the answer as she goes – and the vast amount of data collected by UPS is critical to reaching the company's sustainability targets. UPS is an engineering company and a technology company at its foundation, Lane tells the BBC, and so we're always looking for efficiencies. And efficiencies equal sustainability. So far, UPS is finding success. In 2023, it logged an 8.1% decrease in Scope 1 emissions (pollution UPS produces directly), Scope 2 emissions (pollution from sources like electricity UPS uses to power its facilities) and Scope 3 emissions (pollution associated with the company's suppliers and customers use of UPS' services). That's an improvement from 6.9% the previous year.  Lane, who joined UPS as its president of global public affairs in 2011 – and previously served as a US trade negotiator, a US foreign service diplomat and an executive at both Time Warner and Citigroup – talked to the BBC about how the logistics company is setting itself up to meet its alternative-fuel goals early The BBC's series features interviews with executive leaders making innovative, data-driven decisions helping shape the future of business – and paving the path for other leaders to thrive. Read more conversations here."

In [86]:
tech = "A trial under way at Aberdeen Royal Infirmary is exploring whether artificial intelligence (AI) can assist radiologists in reviewing thousands of mammograms a year. The pilot helped spot early-stage breast cancer for June - a healthcare assistant and participant in the trial - and she is now set to undergo surgery as a result. Mammograms are low level X-rays used in breast cancer screenings to monitor and detect changes too small to see or feel. According to the NHS, they help save about 1,300 lives each year in the UK. And while the number of women who attended a routine breast screening, after an invitation, increased in Scotland in the three-year period to 2022, the number of radiologists to review results is shrinking. What is AI? AI - technology which sees computers perform specific tasks that would typically require human intelligence - is already widely used across a range of industries. While high-profile experts' fears that AI could lead to the extinction of humanity have recently been making headlines, the tech's more practical realities are already being shown in healthcare. Its potential to speed up the process of drug and disease discovery means many scientists and doctors see AI as a powerful tool to work with, rather than replace, practitioners."

6c. __Create a DataFrame from the 3 Python strings__.<br>
Then __print the DataFrame__.

_An example DataFrame is shown below, from news articles on 6/3. Your text will be different._

In [87]:
d2 = pd.DataFrame(columns = ["text"],data = [[entertainment], [business], [tech]])
display(d2)

Unnamed: 0,text
0,The film's production company Paramount Pictur...
1,"Laura Lane, UPS's chief corporate affairs and ..."
2,A trial under way at Aberdeen Royal Infirmary ...


6d. Test the ML model with new data.
1. Preprocess the new data
2. Convert the new data to numbers
3. Test the model with the data
4. Print the categories of news  that the model predicted. 

In [88]:
#preprocess new data
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
#nltk.download('stopwords')
#stop_words=set(stopwords.words("english"))
stemmer = PorterStemmer()
tokenizer = RegexpTokenizer('[a-z]+')   # use tokenizer for letters only

def preprocess(s) :
    w = tokenizer.tokenize(s.lower())         # separate into words and lowercase each word
 #   w = [word for word in w if word not in stop_words]    # remove stop words
    w = [stemmer.stem(word) for word in w]    # find stem of each word
    return ' '.join(w)         # join back into a string

# with each row, preprocess the text string, and store all rows in the X_processed DataFrame
X = pd.DataFrame([preprocess(d2.loc[i,'text']) for i in range(len(d2))])
X.columns = ["text"]
X.head() 


Unnamed: 0,text
0,the film s product compani paramount pictur sa...
1,laura lane up s chief corpor affair and sustai...
2,a trial under way at aberdeen royal infirmari ...


In [89]:
#convert new data to numbers
X_new=X["text"]
X_new = vect.transform(X_new)
display(X_new[0])

<1x28975 sparse matrix of type '<class 'numpy.int64'>'
	with 92 stored elements in Compressed Sparse Row format>

In [90]:
#test model with data
y_pred_new=classifier.predict(X_new)
print(y_pred_new)

[1 4 4]


In [91]:

# Print the categories of news that the model predicted
for i, pred in enumerate(y_pred_new):
    print(f"Article {i+1}: {d[pred]}")

Article 1: Entertainment
Article 2: Tech
Article 3: Tech
