In [1]:
Sports_URLs = ['https://en.wikipedia.org/wiki/Football',
        'https://en.wikipedia.org/wiki/Cricket',
        'https://en.wikipedia.org/wiki/Badminton',
        'https://en.wikipedia.org/wiki/Basketball',
        'https://en.wikipedia.org/wiki/Hockey']

Education_URLs = ['https://en.wikipedia.org/wiki/School',
        'https://en.wikipedia.org/wiki/College',
        'https://en.wikipedia.org/wiki/University',
        'https://en.wikipedia.org/wiki/Professor',
        'https://en.wikipedia.org/wiki/Teacher']

In [2]:
import requests
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    tokens = word_tokenize(text)
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    text = " ".join(tokens)
    return text

def get_text(URL):
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, "html.parser")
    Ps = soup.find_all("p")
    text = ""
    for p in Ps:
        text += p.text.strip()
    return clean_text(text)

Sports_text = [get_text(URL) for URL in Sports_URLs]
Education_text = [get_text(URL) for URL in Education_URLs]

In [3]:
Sports_text

['footbal famili team sport involv vari degre kick ball score goal unqualifi word footbal normal mean form footbal popular word use sport commonli call footbal includ associ footbal known soccer australia canada south africa unit state sometim ireland new zealand australian rule footbal gaelic footbal gridiron footbal specif american footbal arena footbal canadian footbal intern rule footbal rugbi leagu footbal rugbi union football1 variou form footbal share vari degre common origin known footbal codesther number refer tradit ancient prehistor ball game play mani differ part world234 contemporari code footbal trace back codif game english public school 19th centuri outgrowth mediev football56 expans cultur power british empir allow rule footbal spread area british influenc outsid directli control empire7 end 19th centuri distinct region code alreadi develop gaelic footbal exampl deliber incorpor rule local tradit footbal game order maintain heritage8 1888 footbal leagu found england be

In [4]:
Education_text

['school educ institut build design provid learn space learn environ teach student direct teacher countri system formal educ sometim compulsory2 system student progress seri school built oper govern privat organ name school vari countri discuss region term section gener includ primari school young children secondari school teenag complet primari educ institut higher educ taught commonli call univers colleg universityin addit core school student given countri may also attend school primari elementari us secondari middl school us education3 kindergarten preschool provid school young children typic age 35 univers vocat school colleg seminari may avail secondari school school may dedic one particular field school econom danc altern school may provid nontradit curriculum methodsnongovern school also known privat schools4 may requir govern suppli adequ specif educ need privat school also religi christian school gurukula hindu school madrasa arab school hawza shii muslim school yeshiva jewish

In [5]:
# Prepare a dataframe with the text and the category as labels
import pandas as pd

df = pd.DataFrame({
    "text": Sports_text + Education_text,
    "category": ["Sports"]*len(Sports_text) + ["Education"]*len(Education_text)
})

df

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0,text,category
0,footbal famili team sport involv vari degre ki...,Sports
1,firstclass cricketon day internationallimit ov...,Sports
2,badminton racquet sport play use racquet hit s...,Sports
3,basketbal team sport two team commonli five pl...,Sports
4,hockey term use denot famili variou type summe...,Sports
5,school educ institut build design provid learn...,Education
6,colleg latin collegium educ institut constitu ...,Education
7,univers latin universita whole institut higher...,Education
8,professor commonli abbrevi prof1 academ rank u...,Education
9,teacher also call schoolteach formal educ pers...,Education


In [6]:
from collections import Counter
import numpy as np

def get_unigram_counts(texts):
    unigram_counts = Counter()
    for text in texts:
        unigram_counts.update(text.split())
    return unigram_counts

unigram_counts = get_unigram_counts(df["text"])
unigram_counts

def get_unigram_count_matrix(texts, unigram_counts):
    matrix = np.zeros((len(texts), len(unigram_counts)))
    for i, text in enumerate(texts):
        counts = Counter(text.split())
        for j, word in enumerate(unigram_counts):
            matrix[i, j] = counts[word]
    return matrix

unigram_count_matrix = get_unigram_count_matrix(df["text"], unigram_counts)
unigram_count_matrix

unigram_count_df = pd.DataFrame(unigram_count_matrix, columns=unigram_counts.keys())
unigram_count_df

Unnamed: 0,footbal,famili,team,sport,involv,vari,degre,kick,ball,score,...,processknow,nay,nonautocrat,guardian,cherish,ancestor,succeed,never,realli,nurtur
0,211.0,1.0,23.0,30.0,9.0,4.0,2.0,30.0,92.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6.0,0.0,61.0,18.0,4.0,0.0,0.0,0.0,64.0,31.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,10.0,19.0,3.0,0.0,0.0,0.0,8.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5.0,0.0,88.0,22.0,4.0,6.0,1.0,2.0,104.0,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.0,1.0,11.0,30.0,3.0,2.0,0.0,0.0,17.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,2.0,0.0,0.0,0.0,2.0,3.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,1.0,1.0,5.0,50.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,1.0,0.0,0.0,2.0,3.0,11.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,1.0,0.0,0.0,6.0,5.0,12.0,0.0,0.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
unigram_count_df.shape

(10, 6440)

In [8]:
def get_bigram_counts(texts):
    bigram_counts = Counter()
    for text in texts:
        words = text.split()
        bigrams = [(words[i], words[i+1]) for i in range(len(words)-1)]
        bigram_counts.update(bigrams)
    return bigram_counts

bigram_counts = get_bigram_counts(df["text"])
bigram_counts

def get_bigram_prob_matrix(texts, bigram_counts, unigram_counts):
    matrix = np.zeros((len(texts), len(bigram_counts)))
    for i, text in enumerate(texts):
        words = text.split()
        bigrams = [(words[i], words[i+1]) for i in range(len(words)-1)]
        for j, bigram in enumerate(bigram_counts):
            matrix[i, j] = bigram_counts[bigram]/unigram_counts[bigram[0]]
    return matrix

bigram_prob_matrix = get_bigram_prob_matrix(df["text"], bigram_counts, unigram_counts)
bigram_prob_matrix

bigram_prob_df = pd.DataFrame(bigram_prob_matrix, columns=bigram_counts.keys())
bigram_prob_df

Unnamed: 0_level_0,footbal,famili,team,sport,involv,vari,degre,kick,ball,score,...,form,year,mind,spirit,nurtur,school,environ,embodi,basic,american
Unnamed: 0_level_1,famili,team,sport,involv,vari,degre,kick,ball,score,goal,...,year,mind,spirit,nurtur,school,environ,embodi,basic,american,truth
0,0.004444,0.166667,0.046392,0.025,0.03125,0.107143,0.012346,0.25,0.003509,0.03125,...,0.010526,0.012658,0.5,0.090909,1.0,0.00216,0.083333,0.5,0.055556,0.017544
1,0.004444,0.166667,0.046392,0.025,0.03125,0.107143,0.012346,0.25,0.003509,0.03125,...,0.010526,0.012658,0.5,0.090909,1.0,0.00216,0.083333,0.5,0.055556,0.017544
2,0.004444,0.166667,0.046392,0.025,0.03125,0.107143,0.012346,0.25,0.003509,0.03125,...,0.010526,0.012658,0.5,0.090909,1.0,0.00216,0.083333,0.5,0.055556,0.017544
3,0.004444,0.166667,0.046392,0.025,0.03125,0.107143,0.012346,0.25,0.003509,0.03125,...,0.010526,0.012658,0.5,0.090909,1.0,0.00216,0.083333,0.5,0.055556,0.017544
4,0.004444,0.166667,0.046392,0.025,0.03125,0.107143,0.012346,0.25,0.003509,0.03125,...,0.010526,0.012658,0.5,0.090909,1.0,0.00216,0.083333,0.5,0.055556,0.017544
5,0.004444,0.166667,0.046392,0.025,0.03125,0.107143,0.012346,0.25,0.003509,0.03125,...,0.010526,0.012658,0.5,0.090909,1.0,0.00216,0.083333,0.5,0.055556,0.017544
6,0.004444,0.166667,0.046392,0.025,0.03125,0.107143,0.012346,0.25,0.003509,0.03125,...,0.010526,0.012658,0.5,0.090909,1.0,0.00216,0.083333,0.5,0.055556,0.017544
7,0.004444,0.166667,0.046392,0.025,0.03125,0.107143,0.012346,0.25,0.003509,0.03125,...,0.010526,0.012658,0.5,0.090909,1.0,0.00216,0.083333,0.5,0.055556,0.017544
8,0.004444,0.166667,0.046392,0.025,0.03125,0.107143,0.012346,0.25,0.003509,0.03125,...,0.010526,0.012658,0.5,0.090909,1.0,0.00216,0.083333,0.5,0.055556,0.017544
9,0.004444,0.166667,0.046392,0.025,0.03125,0.107143,0.012346,0.25,0.003509,0.03125,...,0.010526,0.012658,0.5,0.090909,1.0,0.00216,0.083333,0.5,0.055556,0.017544


In [9]:
bigram_prob_df.shape

(10, 28150)

In [10]:
def get_tf_matrix(texts, unigram_counts):
    matrix = np.zeros((len(texts), len(unigram_counts)))
    for i, text in enumerate(texts):
        counts = Counter(text.split())
        for j, word in enumerate(unigram_counts):
            matrix[i, j] = counts[word]
    return matrix

tf_matrix = get_tf_matrix(df["text"], unigram_counts)
tf_matrix

def get_idf_vector(texts, unigram_counts):
    idf_vector = np.zeros(len(unigram_counts))
    for j, word in enumerate(unigram_counts):
        idf_vector[j] = np.log(len(texts)/sum([1 for text in texts if word in text]))
    return idf_vector

idf_vector = get_idf_vector(df["text"], unigram_counts)
idf_vector

def get_tfidf_matrix(tf_matrix, idf_vector):
    return tf_matrix*idf_vector

tfidf_matrix = get_tfidf_matrix(tf_matrix, idf_vector)
tfidf_matrix

tfidf_df = pd.DataFrame(tfidf_matrix, columns=unigram_counts.keys())
tfidf_df

Unnamed: 0,footbal,famili,team,sport,involv,vari,degre,kick,ball,score,...,processknow,nay,nonautocrat,guardian,cherish,ancestor,succeed,never,realli,nurtur
0,146.254055,0.693147,11.748989,10.700248,2.008292,0.0,0.71335,48.283137,46.995957,2.14005,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.158883,0.0,31.160363,6.420149,0.892574,0.0,0.0,0.0,32.69284,11.056923,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.693147,0.0,5.108256,6.776824,0.669431,0.0,0.0,0.0,4.086605,3.210074,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.465736,0.0,44.952655,7.846849,0.892574,0.0,0.356675,3.218876,53.125865,5.350124,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.386294,0.693147,5.619082,10.700248,0.669431,0.0,0.0,0.0,8.684036,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,1.386294,0.0,0.0,0.0,0.0,1.070025,0.0,0.0,0.356675,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.356675,0.223144,0.0,17.833747,0.0,0.0,0.356675,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.693147,0.0,0.0,0.446287,0.0,3.923424,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.510826,0.0,0.0,0.0,0.71335,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.693147,0.0,0.0,1.338861,0.0,4.280099,0.0,0.0,0.356675,...,2.302585,2.302585,2.302585,2.302585,2.302585,2.302585,2.302585,0.916291,2.302585,2.302585


In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(unigram_count_matrix, df["category"], test_size=0.2, random_state=42)

model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

1.0

In [14]:
X_train, X_test, y_train, y_test = train_test_split(bigram_prob_matrix, df["category"], test_size=0.2, random_state=42)

model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.5

In [15]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, df["category"], test_size=0.2, random_state=42)

from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

1.0