In [40]:
import wikipedia
import nltk
from nltk import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import PorterStemmer as stemmer
from nltk import FreqDist
from nltk.classify import apply_features
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import re

In [41]:
def preprocessing_text(text):
    remove_text = re.sub(r'[^a-zA-Z\s]', '', text)
    low_case_text = remove_text.lower()
    token_word = word_tokenize(low_case_text)
    stop_text = set(stopwords.words('english'))
    cleaned_text = ' '.join([word for word in token_word if word.lower() not in stop_text])
    soup = BeautifulSoup(cleaned_text, 'html.parser')
    remove_html_text = soup.get_text()
    lemmatizer = WordNetLemmatizer()
    token_text = nltk.word_tokenize(remove_html_text)
    lemmatized_text = ' '.join(lemmatizer.lemmatize(word) for word in token_text)
    stemmed = SnowballStemmer('english')
    token_text = nltk.word_tokenize(lemmatized_text)
    filtered_text = ' '.join(stemmed.stem(word) for word in token_text)
    return filtered_text



In [42]:
import requests

def get_wikipedia_articles_by_category(category, language = 'en', num_results=10000):
  base_url = 'https://en.wikipedia.org/w/api.php'.format(language)
  parameters = {
        'action': 'query',
        'format': 'json',
        'list': 'categorymembers',
        'cmtitle': 'Category:' + category,
        'cmlimit': num_results
  }
  response = requests.get(base_url, parameters)
  data = response.json()

  if 'query' in data and 'categorymembers' in data['query']:
    articles = [entry['title'] for entry in data['query']['categorymembers']]
    return articles
  else:
    return None

In [43]:
geographic = ['Topography','Tectonics','Cartography','Geomorphology','Climate','Hydrology']
non_geographic = ['Resilience','Integrity','Empathy','Tenacity','Ingenuity', "Fortitude"]
geographic_articles = []
non_geographic_articles = []
for i in range(len(geographic)):
  geographic_category = get_wikipedia_articles_by_category(geographic[i],)
  geographic_articles.append(geographic_category)
geographic_articles = [article for articles in geographic_articles for article in articles]
print("Geographic articles: ", geographic_articles)
for j in range(len(non_geographic)):
  non_geographic_category = get_wikipedia_articles_by_category(non_geographic[j],)
  non_geographic_articles.append(non_geographic_category)

non_geographic_articles = [article for articles in non_geographic_articles for article in articles]

print('\nNon Geographic Articles: ', non_geographic_articles)

labeled_data = [
    {'text': article, 'label': 1} for article in geographic_articles
] + [
    {'text': article, 'label': 0} for article in non_geographic_articles
]
import pandas as pd
article_df = pd.DataFrame(labeled_data)


Non Geographic Articles:  ['Empathy', 'Against Empathy', 'Artificial empathy', 'Clinical empathy', 'Creativity and mental health', 'Digital empathy', 'Double empathy problem', 'Ecological empathy', 'Empath', 'Empathic accuracy', 'The Empathic Civilization', 'Empathic concern', 'Empathic design', 'Empathising–systemising theory', 'Empathy gap', 'Empathy in literature', 'Empathy in media research', 'Empathy in online communities', 'Empathy quotient', 'Empathy-altruism', 'Ethnocultural empathy', 'Gender empathy gap', 'Hot-cold empathy gap', 'Light triad', 'Mimpathy', 'Schadenfreude', 'Self-other control', 'Simulation theory of empathy']


In [44]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
filtered_text = [preprocessing_text(doc) for doc in article_df['text']]

vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(filtered_text)

print("Vocabulary:", vectorizer.get_feature_names_out())
print("BoW Matrix:\n", bow_matrix.toarray())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emila\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\emila\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emila\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Vocabulary: ['abras' 'absorpt' 'acclimat' ... 'zenith' 'zerocurtain' 'zone']
BoW Matrix:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [45]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

labels = article_df['label']

X_train, X_test, y_train, y_test = train_test_split(bow_matrix, labels, test_size = 0.2, random_state = 37)

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

nb_predict = nb_classifier.predict(X_test)

lr_classifier = LogisticRegression()
lr_classifier.fit(X_train, y_train)

lr_predict = lr_classifier.predict(X_test)

print('Naive Bayes Accuracy: ', accuracy_score(y_test, nb_predict))
print('Naive Bayes Classification:\n', classification_report(y_test, nb_predict))

print('\nLogistic Regression Accuracy:', accuracy_score(y_test, lr_predict))
print('Logistion Regression Classification:\n', classification_report(y_test, lr_predict))

Naive Bayes Accuracy:  0.9906542056074766
Naive Bayes Classification:
               precision    recall  f1-score   support

           0       0.50      1.00      0.67         2
           1       1.00      0.99      1.00       212

    accuracy                           0.99       214
   macro avg       0.75      1.00      0.83       214
weighted avg       1.00      0.99      0.99       214


Logistic Regression Accuracy: 1.0
Logistion Regression Classification:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00       212

    accuracy                           1.00       214
   macro avg       1.00      1.00      1.00       214
weighted avg       1.00      1.00      1.00       214

