In [4]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

# URLs
Sports_URLs = ['https://en.wikipedia.org/wiki/Football',
               'https://en.wikipedia.org/wiki/Cricket',
               'https://en.wikipedia.org/wiki/Badminton',
               'https://en.wikipedia.org/wiki/Basketball',
               'https://en.wikipedia.org/wiki/Hockey']

Education_URLs = ['https://en.wikipedia.org/wiki/School',
                  'https://en.wikipedia.org/wiki/College',
                  'https://en.wikipedia.org/wiki/University',
                  'https://en.wikipedia.org/wiki/Professor',
                  'https://en.wikipedia.org/wiki/Teacher']

# Function to get text from URL
def get_text(URL):
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, "html.parser")
    paragraphs = soup.find_all("p")
    text = " ".join([p.get_text() for p in paragraphs])
    return text

# Get text for sports and education URLs
Sports_text = [get_text(URL) for URL in Sports_URLs]
Education_text = [get_text(URL) for URL in Education_URLs]

# Create dataframe
df = pd.DataFrame({
    "text": Sports_text + Education_text,
    "category": ["Sports"]*len(Sports_text) + ["Education"]*len(Education_text)
})

# Text preprocessing
stop_words = set(stopwords.words("english"))

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    text = " ".join(tokens)
    return text

df['text'] = df['text'].apply(clean_text)

# Feature extraction
count_vectorizer = CountVectorizer()
X_counts = count_vectorizer.fit_transform(df['text'])

tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, df["category"], test_size=0.2, random_state=42)

# Train Multinomial Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Unigram table
unigram_table = pd.DataFrame(X_counts.toarray(), columns=count_vectorizer.get_feature_names_out())
print("\nUnigram Table:")
print(unigram_table.head())

# Bigram table
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
X_bigram_counts = bigram_vectorizer.fit_transform(df['text'])
bigram_table = pd.DataFrame(X_bigram_counts.toarray(), columns=bigram_vectorizer.get_feature_names_out())
print("\nBigram Table:")
print(bigram_table.head())

# TF-IDF table
tfidf_table = pd.DataFrame(X_tfidf.toarray(), columns=count_vectorizer.get_feature_names_out())
print("\nTF-IDF Table:")
print(tfidf_table.head())


Accuracy: 1.0

Unigram Table:
   03  062  072mm  073  076  07mm  10  100  1000  100000  ...  κέρας11  \
0   0    0      0    0    0     0   1    0     0       0  ...        0   
1   0    0      0    0    0     0   3    2     0       0  ...        0   
2   0    1      1    1    1     1   1    0     0       0  ...        0   
3   0    0      0    0    0     0   9    1     0       0  ...        0   
4   0    0      0    0    0     0   0    0     1       0  ...        1   

   κερητίζειν  κολλέγιο  μεταλυκειακής  σχολή  φαινίνδα  מכללה  מכללות  \
0           0         0              0      0         1      0       0   
1           0         0              0      0         0      0       0   
2           0         0              0      0         0      0       0   
3           0         0              0      0         0      0       0   
4           1         0              0      0         0      0       0   

   ἐπίσκυρος  蹴鞠  
0          1   2  
1          0   0  
2          0   0  
3   