In [47]:
# from get_links import links_on_page
import numpy as np
import pandas as pd
import requests
import re
import networkx as nx
import matplotlib.pyplot as plt
import nltk
import sklearn
from bs4 import BeautifulSoup
from tqdm import tqdm
tqdm.pandas()
from animal_list import names_from_table
from netwulf import visualize
from wordcloud import WordCloud
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

In [113]:
# Known animal names, lower-cased so they can be compared with lower-cased tokens
animal_names = names_from_table()
animal_names_set = {animal.lower() for animal in animal_names.values()}

# The text file has no header row; its single column holds the page links
animal_df = pd.read_csv('data/animal_links.txt', header=None)
animal_df.columns = ['page-name']

# Drop the host so that only the "/wiki/<Title>" path is left ...
animal_df['page-name'] = animal_df['page-name'].str.replace('https://en.wikipedia.org', '', regex=False)
# ... and keep the last path segment as the animal's name
animal_df["name"] = animal_df["page-name"].str.rsplit("/", n=1).str[-1]


In [60]:
# Show the link table together with its derived `name` column
animal_df

Unnamed: 0,page-name,name
0,/wiki/Scaly-crowned_babbler,Scaly-crowned_babbler
1,/wiki/Velvet-fronted_nuthatch,Velvet-fronted_nuthatch
2,/wiki/Mangrove_whistler,Mangrove_whistler
3,/wiki/Mees%27s_white-eye,Mees%27s_white-eye
4,/wiki/Ictiobus,Ictiobus
...,...,...
31749,/wiki/Eutropis_multicarinata,Eutropis_multicarinata
31750,/wiki/Gehyra_vorax,Gehyra_vorax
31751,/wiki/Minervarya_andamanensis,Minervarya_andamanensis
31752,/wiki/Minervarya_greenii,Minervarya_greenii


## Helper functions

In [57]:
# create function that will run on each row in the dataframe, which will take the page name and return all the readable text on the page
def get_text(page_name, timeout=30):
    """Download a Wikipedia page and return the text of its paragraphs.

    Parameters
    ----------
    page_name : str
        Path of the page relative to the host, e.g. ``/wiki/Gehyra_vorax``.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.
        The default is 30.

    Returns
    -------
    str
        The text of every ``<p>`` element in the page body, with newlines
        removed and each paragraph stripped, joined by single spaces.
        An empty string when the document has no ``<body>``.
    """
    url = 'https://en.wikipedia.org' + page_name
    # Without a timeout one stalled connection blocks the whole scrape
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    if soup.body is None:
        return ''

    paragraphs = [p.text.replace('\n', '').strip() for p in soup.body.find_all('p')]
    # Empty leading paragraphs leave leading spaces after the join. Strip
    # those instead of slicing off the first character, which would eat a
    # real letter whenever the first paragraph has text.
    return ' '.join(paragraphs).lstrip()

# function that can be passed to the tf-idf vectorizer that will preprocess the text
def preprocess_text(text):
    """Lower-case ``text`` and delete every run of digits.

    Written to be handed to the tf-idf vectorizer as its ``preprocessor``.
    """
    lowered = text.lower()
    return re.sub(r'\d+', '', lowered)

# function we will apply to the tf-idf matrix to get the top n words
def get_top_words(row, n=300):
    """Return the ``n`` highest-valued entries of ``row`` as ``{label: value}``.

    ``row`` is a pandas Series (here: one row of the tf-idf frame). The
    returned dict is ordered from the highest to the lowest value.
    """
    ranked = row.sort_values(ascending=False)
    strongest = ranked.head(n)
    return strongest.to_dict()

# generate wordcloud
def generate_wordcloud(tf_idf:dict, save_path='wordcloud.png'):
    """Draw a word cloud from word weights, optionally save it, and show it.

    Parameters
    ----------
    tf_idf : dict
        Mapping of word -> weight, passed to
        ``WordCloud.generate_from_frequencies``.
    save_path : str or None, optional
        File the figure is written to. The default is 'wordcloud.png', so
        existing callers keep overwriting that file. Pass a different path
        to keep several clouds, or None to skip saving.
    """
    wordcloud = WordCloud(width=800, height=400, background_color='white')
    wordcloud.generate_from_frequencies(tf_idf)
    plt.figure(figsize=(12,6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    if save_path is not None:
        # save before show(): the figure may be released once it is displayed
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()

In [86]:
# we will define a function to do all the steps above - tf-idf, get top words and return a dictionary with this information

def _read_source_frame(filepath: str) -> pd.DataFrame:
    """Build the page table (without page text) for ``filepath`` from its extension."""
    if filepath.endswith('.parquet'):
        return pd.read_parquet(filepath)
    if filepath.endswith('.csv'):
        return pd.read_csv(filepath)
    if filepath.endswith('.json'):
        return pd.read_json(filepath)
    if filepath.endswith('reptile.pickle'):
        # '*long_reptile.pickle' keys are matched against `name`, every other
        # '*reptile.pickle' against `page-name`.
        # NOTE(review): this asymmetry mirrors the earlier code; confirm that
        # the two pickles really use different key formats.
        match_column = 'name' if filepath.endswith('long_reptile.pickle') else 'page-name'
        # SECURITY: pickle.load can execute arbitrary code; only open trusted files.
        with open(filepath, 'rb') as handle:
            pages = pickle.load(handle)
        # the keys are sequences; only their first element identifies the page
        wanted = {key[0] for key in pages}
        return animal_df[animal_df[match_column].isin(wanted)].reset_index(drop=True)
    raise ValueError(
        f"Unsupported file type: {filepath!r} "
        "(expected .parquet, .csv, .json or *reptile.pickle)"
    )


def get_information_dict(filepath:str, load=True, save=False, preprocess=True) -> dict:
    """
    Compute tf-idf scores and the top words per page for one data file.

    The page text is either loaded from the parquet cache that sits next to
    ``filepath`` (same path, ``.parquet`` extension) or scraped with
    ``get_text``. Pickle inputs select their pages from the notebook-level
    ``animal_df``.

    Parameters
    ----------
    filepath : str
        The file to read: ``.parquet``, ``.csv``, ``.json`` or a
        ``*reptile.pickle`` file
    load : bool, optional
        Whether to load the cached parquet file instead of scraping. The
        default is True. Requires an earlier run with ``save=True``. When
        False, the source file is read and every page is downloaded
    save : bool, optional
        Whether to write the parquet cache and ``<path>_top_words.json``.
        The default is False
    preprocess : bool, optional
        Whether to pass ``preprocess_text`` to the vectorizer as its
        preprocessor. The default is True

    Returns
    -------
    dict or None
        ``{'df': ..., 'top_words': ..., 'tfidf_df': ...}``: the page table
        with a ``text`` column, a Series of top-word dicts indexed by page
        name, and the dense tf-idf frame (one row per page, one column per
        term). None when ``load`` is True and the parquet cache is missing.

    Raises
    ------
    ValueError
        If ``load`` is False and ``filepath`` has an unsupported extension.
    """
    import os  # local import so this cell does not rely on the import cell being edited

    print(f"Reading {filepath}")
    # splitext removes only the final extension, so dots elsewhere in the
    # path (e.g. '../data/x.pickle') no longer truncate it
    filepath_without_extension = os.path.splitext(filepath)[0]
    cache_path = filepath_without_extension + '.parquet'

    if load:
        # The cache already holds the page table and its text, so the source
        # file (and its unpickling) is not needed on this path.
        try:
            df = pd.read_parquet(cache_path)
        except FileNotFoundError:
            print(f"File {cache_path} not found")
            return None
    else:
        df = _read_source_frame(filepath)
        # get the text from the page
        print("Getting text")
        df['text'] = df['page-name'].progress_apply(get_text)

    if save:
        # save as parquet file
        df.to_parquet(cache_path)

    vectorizer_kwargs = {'stop_words': 'english', 'dtype': np.float32}
    if preprocess:
        vectorizer_kwargs['preprocessor'] = preprocess_text
    vectorizer = TfidfVectorizer(**vectorizer_kwargs)

    tfidf_matrix = vectorizer.fit_transform(df['text'])
    feature_names = vectorizer.get_feature_names_out()
    # toarray() yields a plain ndarray (todense() returns np.matrix)
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

    print("Getting top words")
    top_words = tfidf_df.progress_apply(get_top_words, axis=1)
    # tfidf_df keeps its positional index; only top_words is keyed by page name
    top_words.index = df['name']

    if save:
        # save as a json file
        top_words.to_json(filepath_without_extension + '_top_words.json')

    return {'df': df, 'top_words': top_words, 'tfidf_df': tfidf_df}
    
    
        
            
        

In [87]:
# load=True takes the page text from the cached .parquet next to each pickle instead of scraping again;
# save=True writes that .parquet back and stores the top words as <name>_top_words.json
data_dict_reptile_long = get_information_dict('data/data_plain_long_reptile.pickle', load=True, save=True, preprocess=True)
data_dict_reptile = get_information_dict('data/data_plain_reptile.pickle', load=True, save=True, preprocess=True)

Reading data/data_plain_long_reptile.pickle
Getting top words


100%|██████████| 764/764 [00:02<00:00, 357.72it/s]


Reading data/data_plain_reptile.pickle
Getting top words


100%|██████████| 4344/4344 [00:19<00:00, 226.31it/s]


In [133]:
# We want to check the score of the words in the top words for each animal
# We will do this by getting the tf-idf score for each word in the top words for each animal


for name, top_words in data_dict_reptile['top_words'].items():
    # get_top_words returns up to n entries even when their score is 0,
    # so keep only the terms with a positive tf-idf score
    tw_keys = {term for term, score in top_words.items() if score > 0}

    # animal names that show up among this page's top terms
    links = list(animal_names_set.intersection(tw_keys))
    if not links:
        continue
    print(links)

    # TODO: look up the tf-idf score of each link. Note that
    # data_dict_reptile['tfidf_df'] is indexed by row position, not by
    # page name, so `.loc[name, ...]` will not work as-is.







['skink']
['turtle', 'alligator']
['skink']
['lizard']
['crocodile', 'alligator']
['salmon', 'deer', 'kangaroo', 'human']
['lizard']
['frog', 'turtle', 'buffalo', 'crocodile']
['skink', 'lizard']
['gecko', 'lizard']
['crocodile', 'alligator', 'human']
['lizard']
['gecko']
['skink', 'lizard']
['turtle']
['turtle']
['turtle']
['crocodile', 'lizard', 'deer']
['gecko', 'human']
['turtle']
['turtle']
['lizard']
['gecko', 'lizard']
['lizard', 'human']
['turtle']
['turtle', 'crow', 'human']
['lizard']
['crocodile', 'lizard']
['iguana']
['turtle']
['turtle']
['turtle']
['salmon', 'turtle']
['lizard']
['lizard']
['skink']
['lizard']
['gecko', 'toad', 'lemur', 'mosquito']
['lizard', 'human']
['turtle']
['gecko']
['skink']
['lizard']
['viper']
['turtle']
['lizard', 'iguana']
['mongoose', 'lizard']
['skink', 'lizard']
['cobra', 'viper', 'spoonbill']
['turtle']
['viper']
['turtle']
['viper']
['viper', 'parrot']
['lizard']
['viper']
['human']
['gecko', 'lizard']
['turtle']
['skink', 'lizard']
['croc

In [101]:
# Page table returned by get_information_dict, including the `text` column
data_dict_reptile["df"]

Unnamed: 0,page-name,name,text
0,/wiki/Chironius_laurenti,Chironius_laurenti,Chironius laurenti is a species of nonvenomous...
1,/wiki/Dasia_olivacea,Dasia_olivacea,"Dasia olivacea, the olive dasia or olive tree ..."
2,/wiki/Alabama_map_turtle,Alabama_map_turtle,The Alabama map turtle (Graptemys pulchra) is ...
3,/wiki/Geoscincus,Geoscincus,Geoscincus is a monotypic genus of skinks: the...
4,/wiki/Gonyosoma_oxycephalum,Gonyosoma_oxycephalum,"Elaphe oxycephala (Boie, 1827) Gonyosoma oxyce..."
...,...,...,...
4339,/wiki/Eugongylus_albofasciolatus,Eugongylus_albofasciolatus,The white-striped cape skink or barred shark s...
4340,/wiki/Eumecia_johnstoni,Eumecia_johnstoni,Eumecia johnstoni is a species of skink found ...
4341,/wiki/Giant_plated_lizard,Giant_plated_lizard,The giant plated lizard (Matobosaurus validus)...
4342,/wiki/Eutropis_multicarinata,Eutropis_multicarinata,Eutropis multicarinata is a species of skink f...


In [130]:
# Top tf-idf terms of a single page, highest score first
data_dict_reptile["top_words"]["Dasia_olivacea"]

{'olivacea': 0.5874013304710388,
 'dasia': 0.5518575310707092,
 'olive': 0.1317024677991867,
 'green': 0.11882834136486053,
 'locality': 0.11172519624233246,
 'island': 0.10261748731136322,
 'southeast': 0.09537190943956375,
 'sakaerat': 0.09381362050771713,
 'thailand': 0.08957552909851074,
 'annals': 0.08943317830562592,
 'viet': 0.08632520586252213,
 'trees': 0.08577132970094681,
 'ratchasima': 0.08391447365283966,
 'ecologically': 0.08391447365283966,
 'asia': 0.08220656216144562,
 'nam': 0.08194476366043091,
 'northernmost': 0.0802794024348259,
 'descending': 0.0802794024348259,
 'penang': 0.07883679121732712,
 'nakhon': 0.07642605900764465,
 'prince': 0.07642605900764465,
 'publication': 0.07539638131856918,
 'islands': 0.0724032074213028,
 'type': 0.06832505762577057,
 'flexible': 0.06790796667337418,
 'station': 0.06742773205041885,
 'indonesian': 0.0669679343700409,
 'bronze': 0.06530257314443588,
 'andaman': 0.06492346525192261,
 'ocelli': 0.06420296430587769,
 'eggs': 0.0635