In [None]:
from abc import ABC, abstractmethod
from bs4 import BeautifulSoup
from selenium import webdriver
import requests
import time
import nltk
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder, TrigramCollocationFinder
from nltk.tokenize import word_tokenize, MWETokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Default HTTP headers sent with every scraping request: a desktop Safari
# User-Agent string is supplied instead of the HTTP library's default one.
headers = {
  'User-Agent': (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) '
    'AppleWebKit/601.3.9 (KHTML, like Gecko) '
    'Version/9.0.2 Safari/601.3.9'
  ),
}


class ScraperAlgorithm(ABC):
  """Abstract strategy interface for site-specific article scrapers.

  Concrete scrapers subclass this and implement `scrape`; a subclass that
  does not implement it cannot be instantiated.
  """

  class Scraper(ABC):
    # Kept only for backward compatibility: the original interface was
    # declared on this nested class, which nothing in the visible code
    # inherits from. New code should implement ScraperAlgorithm.scrape.
    def scrape(self, url):
      pass

  @abstractmethod
  def scrape(self, url):
    """Fetch the page at `url` and return its extracted article content.

    Args:
      url: Address of the article page to scrape.

    Returns:
      Whatever the concrete scraper extracts from the page.
    """

class WebsiteClient:
  """Thin front end that delegates article retrieval to a pluggable scraper."""

  def __init__(self, scraper):
    """Remember the scraper object whose `scrape` method will be used.

    Args:
      scraper: Any object exposing a `scrape(url)` method.
    """
    self.scraper = scraper

  def main_article(self, url):
    """Return whatever the configured scraper produces for `url`."""
    result = self.scraper.scrape(url)
    return result

class NTWebsite(ScraperAlgorithm):
  """Scraper that extracts paragraph text from a New York Times article page."""

  # CSS class of the <div> wrappers searched for article paragraphs.
  BODY_DIV_CLASS = "StoryBodyCompanionColumn"
  # Seconds to wait for the server before giving up, so a stalled connection
  # cannot hang the caller indefinitely.
  REQUEST_TIMEOUT = 10

  # Class-level default kept for backward compatibility. It is never mutated:
  # scrape() rebinds a fresh list on the instance, so results no longer
  # accumulate across calls or leak between instances.
  article_content = []

  def scrape(self, url):
    """Download `url` and return the text of its article paragraphs.

    Args:
      url: Address of the article page.

    Returns:
      A new list of strings, one per matching <div>, holding the text of the
      first <p> inside it. The same list is stored on `self.article_content`.

    Raises:
      requests.exceptions.RequestException: If the HTTP request fails or
        times out.
    """
    res = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
    soup = BeautifulSoup(res.content, 'html.parser')

    paragraphs = []
    for block in soup.find_all('div', attrs={"class": self.BODY_DIV_CLASS}):
      # NOTE(review): only the first <p> of each wrapper is read, as in the
      # original code; confirm whether all paragraphs were intended.
      first_paragraph = block.find('p')
      if first_paragraph is None:
        # A wrapper without any <p> used to raise AttributeError; skip it.
        continue
      paragraphs.append(first_paragraph.get_text())

    self.article_content = paragraphs
    return paragraphs
    

class GuardianWebsite(ScraperAlgorithm):
  """Scraper that extracts paragraph text from a Guardian article page."""

  # CSS class carried by the <p> elements that are collected.
  PARAGRAPH_CLASS = "dcr-n6w1lc"
  # Seconds to wait for the server before giving up, so a stalled connection
  # cannot hang the caller indefinitely.
  REQUEST_TIMEOUT = 10

  # Class-level default kept for backward compatibility. It is never mutated:
  # scrape() rebinds a fresh list on the instance, so results no longer
  # accumulate across calls or leak between instances.
  article_content = []

  def scrape(self, url):
    """Download `url` and return the text of its article paragraphs.

    Args:
      url: Address of the article page.

    Returns:
      A new list with the text of every <p> element carrying
      `PARAGRAPH_CLASS`, in document order. The same list is stored on
      `self.article_content`.

    Raises:
      requests.exceptions.RequestException: If the HTTP request fails or
        times out.
    """
    res = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
    soup = BeautifulSoup(res.content, 'html.parser')

    paragraphs = [
      node.get_text()
      for node in soup.find_all('p', attrs={"class": self.PARAGRAPH_CLASS})
    ]

    self.article_content = paragraphs
    return paragraphs
  
# allowed_website=["theguardian", "The New York Times"]
# https://www.nytimes.com/2023/05/19/sports/superstar-billy-graham-dead.html
# Scrape one New York Times article through the generic client.
NT = NTWebsite()

client = WebsiteClient(NT)
contents = client.main_article("https://www.nytimes.com/2023/05/19/business/amazon-union-choke-points.html")

# Join paragraphs with a space: joining on "" glued the last word of one
# paragraph to the first word of the next (e.g. "times.amazon"), producing
# tokens the tokenizer cannot split.
text = " ".join(contents).lower()


#######################################################################################################
# Load the English stop-word list as a set for O(1) membership tests.
# The corpus reader is re-imported under a private alias because this line
# rebinds the name `stopwords` to the set; without the alias, running the
# cell a second time fails (a set has no `.words` method).
from nltk.corpus import stopwords as _stopwords_corpus
stopwords = set(_stopwords_corpus.words("english"))

def clean_text(text):
  """Tokenize `text`, drop short/stop/punctuation tokens, and lemmatize.

  A token is kept only when it is longer than four characters, is not in the
  module-level `stopwords` collection, and is not a substring of the
  punctuation string below (`in` on a string is a substring test).

  Args:
    text: Raw text to clean.

  Returns:
    The lemmatized surviving tokens joined by single spaces.
  """
  raw_tokens = word_tokenize(text)
  punctuations = r".,\"-\\/#!?$%\^&\*;:{}=\-_'~()"

  survivors = []
  for candidate in raw_tokens:
    if len(candidate) <= 4:
      continue
    if candidate in stopwords or candidate in punctuations:
      continue
    survivors.append(candidate)

  lemmatizer = WordNetLemmatizer()
  return " ".join(map(lemmatizer.lemmatize, survivors))

# search_for_bigram = BigramCollocationFinder.from_documents(articles_tokenize);
# search_for_bigram.apply_freq_filter(min_freq=3)
# bigram = list(search_for_bigram.ngram_fd.items());

# search_for_trigram = TrigramCollocationFinder.from_documents(articles_tokenize);
# search_for_trigram.apply_freq_filter(min_freq=3);
# trigram = list(search_for_trigram.ngram_fd.items());


# bigrams = [bigram for bigram, freq in search_for_bigram.ngram_fd.items()]
# trigrams = [trigram for trigram, freq in search_for_trigram.ngram_fd.items()]


# mwe_tokenizer = MWETokenizer(bigrams + trigrams, separator='_')
# articles_tokenize = [mwe_tokenizer.tokenize(article) for article in articles_tokenize]


# `model` holds the cleaned text of the scraped article (a plain string, not
# a trained model); print it for inspection.
model = clean_text(text)
print(model)



familiar corporate dominance amazon delivery employee loose costly concession amazon worker threaten choke point within delivery network appear concession multiple times.amazon recent growth helped create choke point worker sought exploit first decade company stayed delivery business simply handed razor blade like fedex postal service.amazon began transporting package holiday season surge order backed carrier later pandemic amazon significantly increased transportation footprint handle order seeking drive delivery time hence vans.and amazon chief executive jassy seek drive shipping time disruptive potential organizing growing.on evening local leader fledgling united automobile worker flashed light outside union office across massive general motor plant flint mich. summoning plant steward plant condition deteriorated least grueling speed-up required worker thousand motion union decided strike recognition steward returned plant employee stopped working refused leave.amazon moved differen

In [None]:
# Load the labelled BBC news dataset and drop the identifier column, which
# is not used as a feature.
df = pd.read_csv("/content/BBC News Train.csv")
df = df.drop(columns="ArticleId")

# Apply the same cleaning used for the scraped article.
df["Text"] = df["Text"].apply(clean_text)
y = df["Category"]

# Split the raw texts BEFORE vectorizing. Fitting the vectorizer on the full
# dataset lets test-set vocabulary leak into the feature space, which changes
# the smoothed Naive Bayes likelihoods and biases the reported metrics.
texts_train, texts_test, y_train, y_test = train_test_split(
  df["Text"], y, test_size=0.2, random_state=42
)

vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(texts_train)  # vocabulary learned from training data only
X_test = vectorizer.transform(texts_test)        # unseen words are ignored

nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
y_pred = nb_model.predict(X_test)

# Weighted averages account for class-size imbalance across categories.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", (accuracy*100),'%')
print("Precision:", (precision*100),'%')
print("Recall:", (recall*100),'%')
print('F1-score:', (f1*100),'%')

# Classify the scraped article (`model` is its cleaned text).
article_vectorized = vectorizer.transform([model])
category = nb_model.predict(article_vectorized)[0]
print("Predicted category:", category)

Accuracy: 96.97986577181209 %
Precision: 97.00798886516169 %
Recall: 96.97986577181209 %
F1-score: 96.98081492191 %
Predicted category: business
