In [45]:
# from get_links import links_on_page
import numpy as np
import pandas as pd
import requests
import re
import networkx as nx
import matplotlib.pyplot as plt
import nltk
import sklearn
from bs4 import BeautifulSoup
from tqdm import tqdm
tqdm.pandas()
from animal_list import names_from_table
from netwulf import visualize

In [47]:
animal_names = names_from_table()
# Read the list of article URLs from the text file (no header row, single column).
animal_df = pd.read_csv('animal_links.txt', header=None)
animal_df.columns = ['page-name']

# Strip the site prefix so only the site-relative path (e.g. /wiki/Ictiobus) remains.
# regex=False makes the prefix a literal match: the pattern contains dots, which
# would act as wildcards on pandas versions where str.replace defaults to regex=True.
animal_df['page-name'] = animal_df['page-name'].str.replace(
    'https://en.wikipedia.org', '', regex=False
)
# The article name is the last segment of the path.
animal_df["name"] = animal_df["page-name"].str.split("/").str[-1]


In [50]:
# Display the frame to check the parsed page paths and names
# (displaying `animal_names` instead is left commented out).
animal_df

Unnamed: 0,page-name,name
0,/wiki/Scaly-crowned_babbler,Scaly-crowned_babbler
1,/wiki/Velvet-fronted_nuthatch,Velvet-fronted_nuthatch
2,/wiki/Mangrove_whistler,Mangrove_whistler
3,/wiki/Mees%27s_white-eye,Mees%27s_white-eye
4,/wiki/Ictiobus,Ictiobus
...,...,...
31749,/wiki/Eutropis_multicarinata,Eutropis_multicarinata
31750,/wiki/Gehyra_vorax,Gehyra_vorax
31751,/wiki/Minervarya_andamanensis,Minervarya_andamanensis
31752,/wiki/Minervarya_greenii,Minervarya_greenii


In [51]:
# create function that will run on each row in the dataframe, which will take the page name and return all the readable text on the page

def get_text(page_name, timeout=30):
    """Download a Wikipedia page and return the text of its paragraphs.

    Parameters
    ----------
    page_name : str
        Site-relative path of the article, e.g. ``'/wiki/Aardvark'``; it is
        appended to ``https://en.wikipedia.org``.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up, so a
        stalled connection cannot hang a long scraping run indefinitely.

    Returns
    -------
    str
        The text of every non-empty ``<p>`` element inside ``<body>``, with
        newlines removed and surrounding whitespace stripped, joined by single
        spaces. An empty string if the document has no ``<body>`` or no
        paragraph contains text.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.
    """
    url = 'https://en.wikipedia.org' + page_name
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # html.parser leaves soup.body as None when the document has no <body>.
    if soup.body is None:
        return ''

    # Newlines are dropped (not replaced) inside each paragraph, as before.
    cleaned = (p.text.replace('\n', '').strip() for p in soup.body.find_all('p'))

    # Skip empty paragraphs instead of slicing off the first character of the
    # joined string: the slice removed a real letter whenever the first
    # paragraph was not empty.
    return ' '.join(paragraph for paragraph in cleaned if paragraph)
        

In [52]:
# Try the scraper on a single article and inspect the returned text.
get_text('/wiki/Aardvark')

'See text The aardvark (/ˈɑːrdvɑːrk/ ARD-vark; Orycteropus afer) is a medium-sized, burrowing, nocturnal mammal native to Africa.[2][3] It is the only living species of the order Tubulidentata,[4][5] although other prehistoric species and genera of Tubulidentata are known. Unlike most other insectivores, it has a long snout, similar to that of a pig, which is used to sniff out food. The aardvark is found over much of the southern two-thirds of the African continent, avoiding areas that are mainly rocky. A nocturnal feeder, it subsists on ants and termites, which it will dig out of their hills using its sharp claws and powerful legs. It also digs to create burrows in which to live and rear its young. The animal is listed as "least concern" by the IUCN, although its numbers are decreasing. Aardvarks are afrotheres, a clade which also includes elephants, manatees, and hyraxes. The aardvark is sometimes colloquially called the "African ant bear",[6] "anteater" (not to be confused with the 

In [53]:
# Create a new 'text' column holding the paragraph text scraped from each page.
# NOTE: get_text performs one HTTP request per row, so this cell is slow on the
# full frame; progress_apply (enabled by tqdm.pandas()) shows a progress bar.
animal_df['text'] = animal_df['page-name'].progress_apply(get_text)

  0%|          | 98/31754 [00:41<4:05:06,  2.15it/s]

In [9]:
def tokenize(text):
    """Turn raw text into a list of lowercase word tokens without stopwords.

    The text is lowercased, then punctuation, runs starting with ``http`` and
    digit runs are deleted (in that order). The result is split with NLTK's
    word tokenizer and filtered against NLTK's English stopword list.
    """
    cleaned = text.lower()

    # Deletions are applied sequentially: punctuation, URL-like runs, digits.
    for pattern in (r'[^\w\s]', r'http\S+', r'\d+'):
        cleaned = re.sub(pattern, '', cleaned)

    words = nltk.word_tokenize(cleaned)

    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

In [10]:
# Create a new 'tokens' column holding the token list that tokenize() returns
# for each page's text.
animal_df['tokens'] = animal_df['text'].apply(tokenize)

In [11]:
# Display the frame to inspect the new 'text' and 'tokens' columns.
animal_df

Unnamed: 0,page-name,animal,text,tokens
0,/wiki/Aardvark,Aardvark,See text The aardvark (/ˈɑːrdvɑːrk/ ARD-vark; ...,"[see, text, aardvark, ˈɑːrdvɑːrk, ardvark, ory..."
1,/wiki/Albatross,Albatross,Captured at Midway Atoll DiomedeaThalassarche...,"[captured, midway, atoll, diomedeathalassarche..."
2,/wiki/Alligator,Alligator,"An alligator, or just gator, is a large reptil...","[alligator, gator, large, reptile, crocodilia,..."
3,/wiki/Alpaca,Alpaca,"Camelus pacos Linnaeus, 1758Vicugna pacos (Li...","[camelus, pacos, linnaeus, vicugna, pacos, lin..."
4,/wiki/Ant,Ant,Martialinae Leptanillinae Amblyoponinae Parapo...,"[martialinae, leptanillinae, amblyoponinae, pa..."
...,...,...,...,...
223,/wiki/Whale,Whale,Whales are a widely distributed and diverse gr...,"[whales, widely, distributed, diverse, group, ..."
224,/wiki/Wolf,Wolf,See Subspecies of Canis lupus The wolf (Canis ...,"[see, subspecies, canis, lupus, wolf, canis, l..."
225,/wiki/Wolverine,Wolverine,American wolverine (G. g. luscus) Eurasian wol...,"[american, wolverine, g, g, luscus, eurasian, ..."
226,/wiki/Wombat,Wombat,"Wombats are short-legged, muscular quadrupedal...","[wombats, shortlegged, muscular, quadrupedal, ..."


In [12]:
# TF-IDF: fit a TfidfVectorizer with default settings on the raw page text.
# NOTE(review): this import sits mid-notebook; the later vectorizer cell relies
# on this cell having been run first.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(animal_df['text'])
# Show the matrix dimensions (documents x vocabulary terms).
tfidf_matrix.shape



(228, 40888)

In [None]:
def preprocess_text(text):
    """Return ``text`` lowercased with every run of digits deleted."""
    return re.sub(r'\d+', '', text.lower())

In [24]:
# Second TF-IDF pass: drop English stop words and use preprocess_text
# (lowercasing + digit removal) as the preprocessor.
vectorizer = TfidfVectorizer(stop_words='english', preprocessor=preprocess_text)
# vectorizer = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
vectors = vectorizer.fit_transform(animal_df['text'])
feature_names = vectorizer.get_feature_names_out()
# Densify the result and wrap it in a DataFrame with one column per term.
# NOTE(review): todense() -> tolist() materialises the whole matrix as a Python
# list of lists before pandas copies it again; consider building the frame from
# vectors.toarray() directly if memory becomes a problem.
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)

In [14]:
def tf_idf(corpus):
    tf_dist = [nltk.FreqDist(text) for tokens in corpus]
