# Text analysis

In [None]:
# from get_links import links_on_page
import numpy as np
import pandas as pd
import requests
import re
import networkx as nx
import matplotlib.pyplot as plt
import nltk
import sklearn
from bs4 import BeautifulSoup
from tqdm import tqdm
tqdm.pandas()
from animal_list import names_from_table
from netwulf import visualize
from wordcloud import WordCloud
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

## Load in datasets from data directory

In [None]:
# animal_names is the big dataset with all the names; keep a lower-cased set of them
animal_names = names_from_table()
animal_names_set = {animal.lower() for animal in set(animal_names.values())}

# the links file holds one Wikipedia URL per line
animal_df = pd.read_csv('data/animal_links.txt', header=None)
animal_df.columns = ['page-name']

# drop the site prefix so only the page path (e.g. /wiki/...) is left
page_paths = animal_df['page-name'].str.replace('https://en.wikipedia.org', '', regex=False)

# the animal name is the last path segment; both columns are stored lower-cased
animal_df["name"] = page_paths.str.split("/").str[-1].str.lower()
animal_df["page-name"] = page_paths.str.lower()

animal_df.shape


In [None]:
# Load the pickled reptile datasets (used here for testing purposes).
# NOTE: pickle.load can execute arbitrary code - only use it on trusted files.
with open('data/data_plain_reptile.pickle', 'rb') as short_file:
    pa = pickle.load(short_file)
with open('data/data_plain_long_reptile.pickle', 'rb') as long_file:
    pb = pickle.load(long_file)

# Attributes per reptile (which family, order, etc. they belong to)
with open("data/Reptile_attributes.pickle", "rb") as attributes_file:
    reptile_attributes = pickle.load(attributes_file)

# Lower-case every key so lookups can use the lower-cased names
reptile_attributes = {
    title.lower(): attributes for title, attributes in reptile_attributes.items()
}


In [None]:
# reptile_attributes["Argentine_snake-necked_turtle"]
reptile_attributes

## Helper functions

In [None]:
# create function that will run on each row in the dataframe, which will take the page name and return all the readable text on the page
def get_text(page_name, timeout=30):
    """
    Download a Wikipedia page and return the readable paragraph text.

    Parameters
    ----------
    page_name : str
        Page path that is appended to ``https://en.wikipedia.org``
        (e.g. ``/wiki/geoscincus``).
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises. The
        default is 30. Without a timeout a stalled connection would block
        the whole scraping run.

    Returns
    -------
    str
        The text of every non-empty ``<p>`` element in the page body, with
        newlines removed, each paragraph stripped and the paragraphs joined
        by a single space. An empty string if the document has no body.
    """
    url = 'https://en.wikipedia.org' + page_name
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # html.parser yields no <body> for empty/odd responses
    if soup.body is None:
        return ''

    # Extract the text from the paragraphs but remove \n
    paragraphs = (p.text.replace('\n', '').strip() for p in soup.body.find_all('p'))

    # Skip empty paragraphs instead of slicing off the first character:
    # the result then never starts with a separator space, and a real
    # leading letter is never lost when the first paragraph has content.
    return ' '.join(text for text in paragraphs if text)

def get_links(name):
    """
    Return the lower-cased names of all animals that share an edge with *name*.

    The edges are the keys of the module-level ``pb`` mapping (the long
    reptile dataset); the first two items of each key are treated as the
    two endpoints of an edge.

    Parameters
    ----------
    name : str
        Animal name to look up. Matching is case-insensitive, so names with
        capitals after the first letter are found as well.

    Returns
    -------
    list
        Lower-cased name of the opposite endpoint for every edge that
        contains *name*. An edge from *name* to itself contributes one entry.
    """
    target = name.lower()
    links = []
    for key in pb:
        first = str(key[0]).lower()
        second = str(key[1]).lower()
        if first == target:
            links.append(second)
        elif second == target:
            links.append(first)
    return links

def get_family(name):
    """
    Look up the family of *name* in ``reptile_attributes``.

    Returns the value stored under the ``"Family:"`` key, lower-cased.
    When either lookup raises ``KeyError`` the name is printed and
    ``"unknown"`` is returned instead.
    """
    try:
        family_name = reptile_attributes[name]["Family:"]
    except KeyError:
        print(f"error at {name}")
        return "Unknown".lower()
    return family_name.lower()
    

# function that can be passed to the tf-idf vectorizer that will preprocess the text
def preprocess_text(text):
    """Lower-case *text* and delete every run of digits from it."""
    lowered = text.lower()
    return re.sub(r'\d+', '', lowered)

# function we will apply to the tf-idf matrix to get the top n words
def get_top_words(row, n=300):
    """Return the *n* highest-valued entries of *row* as a ``{label: value}`` dict."""
    ranked = row.sort_values(ascending=False)
    best = ranked.head(n)
    return best.to_dict()

# generate wordcloud
def generate_wordcloud(tf_idf:dict, title:str = "Default text", save_path:str = 'wordcloud.png'):
    """
    Draw, save and show a word cloud built from word -> score frequencies.

    Parameters
    ----------
    tf_idf : dict
        Mapping of word to score, passed to
        ``WordCloud.generate_from_frequencies``.
    title : str, optional
        Text shown after ``"Node: "`` in the figure title.
    save_path : str, optional
        File the figure is written to. The default is ``'wordcloud.png'``.
        Pass a different path per call to keep several clouds instead of
        overwriting the same file, or an empty string / ``None`` to skip
        saving.
    """
    wordcloud = WordCloud(width=800, height=400, background_color='white')
    wordcloud.generate_from_frequencies(tf_idf)
    plt.figure(figsize=(12,6))
    plt.title(f"Node: {title}", fontsize=20)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    # save before show(): the figure may be empty once show() has returned
    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()

In [None]:
# get_links("balkan_pond_turtle")
# get_links("East_african_black_mud_turtle")
# get_family("east_african_black_mud_turtle")

# pb

In [94]:
# we will define a function to do all the steps above - tf-idf, get top words and return a dictionary with this information

def _read_source_frame(filepath):
    """Build the base dataframe for *filepath*, chosen by its file-name suffix."""
    if filepath.endswith('.parquet'):
        return pd.read_parquet(filepath)
    if filepath.endswith('.csv'):
        return pd.read_csv(filepath)
    if filepath.endswith('.json'):
        return pd.read_json(filepath)
    # NOTE: pickle.load can execute arbitrary code - only use trusted files.
    if filepath.endswith('long_reptile.pickle'):
        # keys hold two animal names; keep the rows whose name is either one
        with open(filepath, 'rb') as handle:
            edges = pickle.load(handle)
        names = {str(key[0]).lower() for key in edges}
        names |= {str(key[1]).lower() for key in edges}
        return animal_df[animal_df['name'].isin(names)].reset_index(drop=True)
    if filepath.endswith('reptile.pickle'):
        # the first item of each key is matched against the page-name column
        with open(filepath, 'rb') as handle:
            pages = pickle.load(handle)
        page_names = {str(key[0]).lower() for key in pages}
        return animal_df[animal_df['page-name'].isin(page_names)].reset_index(drop=True)
    raise ValueError(f"Unsupported file type: {filepath}")


def get_information_dict(filepath:str, load=True, save=False, preprocess=True, links=True, family=True) -> dict:
    """
    Function that will read a parquet file, or csv file, or json file, or pickle file, and return a dictionary with the dataframe, top words and tf-idf matrix

    Parameters
    ----------
    filepath : str
        The name of the file to read. The cache files are written next to it:
        the extension is replaced by ``.parquet`` and ``_top_words.json``
    load : bool, optional
        Whether to load the cached parquet file instead of building the dataframe
        and downloading the text. The default is True. Can only be used if the
        file is already made, have to have run with save as True before
    save : bool, optional
        Whether to save the dataframe (parquet) and the top words (json) or not. The default is False
    preprocess : bool, optional
        Whether to preprocess the text or not. The default is True
    links : bool, optional
        Whether to get the links or not. The default is True
    family : bool, optional
        Whether to get the family or not. The default is True

    Returns
    -------
    dict
        A dictionary with the dataframe (``'df'``), top words (``'top_words'``)
        and tf-idf matrix (``'tfidf_df'``). ``None`` is returned when ``load``
        is True and the cached parquet file does not exist

    Raises
    ------
    ValueError
        If ``load`` is False and the suffix of ``filepath`` is not supported
    """
    import os  # local import: only needed to split off the file extension

    print(f"Reading {filepath}")
    # splitext removes only the final extension, so dots elsewhere in the
    # path (e.g. './data/file.pickle') do not truncate the cache file name
    base_path = os.path.splitext(filepath)[0]
    parquet_path = base_path + '.parquet'

    if load:
        # the cached frame already holds the text, so the source file is not parsed
        try:
            df = pd.read_parquet(parquet_path)
        except FileNotFoundError:
            print(f"File {parquet_path} not found - run with load=False, save=True first")
            return None
    else:
        df = _read_source_frame(filepath)
        # get the text from the page
        print("Getting text")
        df['text'] = df['page-name'].progress_apply(get_text)

    if links:
        # Links column that shows which animals the given animal links to
        print("Getting links")
        df['links'] = df['name'].progress_apply(get_links)
    if family:
        # Family column that shows which family the given animal belongs to
        print("Getting family")
        df['family'] = df['name'].progress_apply(get_family)

    if save:
        # save as parquet file
        df.to_parquet(parquet_path)

    # preprocessor=None keeps the vectorizer's default preprocessing
    vectorizer = TfidfVectorizer(
        stop_words='english',
        dtype=np.float32,
        preprocessor=preprocess_text if preprocess else None,
    )

    tfidf_matrix = vectorizer.fit_transform(df['text'])
    feature_names = vectorizer.get_feature_names_out()
    # toarray() gives a plain ndarray (todense() returns an np.matrix)
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

    print("Getting top words")
    top_words = tfidf_df.progress_apply(get_top_words, axis=1)
    top_words.index = df['name']

    if save:
        # save as a json file
        top_words.to_json(base_path + '_top_words.json')

    return {'df': df, 'top_words': top_words, 'tfidf_df': tfidf_df}
    
    
        
            
        

In [97]:
# Build both information dicts from the cached parquet files (load=True) and write the
# dataframe and top words back to disk (save=True).
# Only the long dataset gets the links and family columns computed.
data_dict_reptile_long = get_information_dict('data/data_plain_long_reptile.pickle', load=True, save=True, preprocess=True)
data_dict_reptile = get_information_dict('data/data_plain_reptile.pickle', load=True, save=True, preprocess=True, links=False, family=False)



Reading data/data_plain_long_reptile.pickle
Getting links


100%|██████████| 1063/1063 [00:05<00:00, 199.46it/s]


Getting family


100%|██████████| 1063/1063 [00:00<00:00, 164868.73it/s]


error at lankascincus_deraniyagalae
error at vipera_pontica
error at cnemaspis_indraneildasii
error at cloete%27s_girdled_lizard
error at macrovipera_lebetinus_schweizeri
error at ptychoglossus_festae
error at brown%27s_scaly-toed_gecko
error at lerista_talpina
error at cnemaspis_anaikattiensis
error at brookesia_ambreensis
error at cyrtodactylus_subsolanus
error at dwarf_karoo_girdled_lizard
error at lerista_maculosa
error at cyrtodactylus_ramboda
error at naultinus_elegans
Getting top words


100%|██████████| 1063/1063 [00:01<00:00, 551.36it/s]


Reading data/data_plain_reptile.pickle
Getting top words


100%|██████████| 4344/4344 [00:08<00:00, 530.99it/s]


In [96]:
data_dict_reptile["df"]
# data_dict_reptile_long["df"]

Unnamed: 0,page-name,name,text,links,family
0,/wiki/chironius_laurenti,chironius_laurenti,Chironius laurenti is a species of nonvenomous...,[],colubridae
1,/wiki/dasia_olivacea,dasia_olivacea,"Dasia olivacea, the olive dasia or olive tree ...",[],scincidae
2,/wiki/alabama_map_turtle,alabama_map_turtle,The Alabama map turtle (Graptemys pulchra) is ...,"[d%27orbigny%27s_slider, jamaican_slider, yell...",emydidae
3,/wiki/geoscincus,geoscincus,Geoscincus is a monotypic genus of skinks: the...,[geoscincus],scincidae
4,/wiki/gonyosoma_oxycephalum,gonyosoma_oxycephalum,"Elaphe oxycephala (Boie, 1827) Gonyosoma oxyce...",[],colubridae
...,...,...,...,...,...
4339,/wiki/eugongylus_albofasciolatus,eugongylus_albofasciolatus,The white-striped cape skink or barred shark s...,[],scincidae
4340,/wiki/eumecia_johnstoni,eumecia_johnstoni,Eumecia johnstoni is a species of skink found ...,[],scincidae
4341,/wiki/giant_plated_lizard,giant_plated_lizard,The giant plated lizard (Matobosaurus validus)...,[],gerrhosauridae
4342,/wiki/eutropis_multicarinata,eutropis_multicarinata,Eutropis multicarinata is a species of skink f...,[],scincidae


In [98]:
# We want to check the score of the words in the top words for each animal
# We will do this by getting the tf-idf score for each word in the top words for each animal


# For every page, store which animal names occur among its top words and
# with which tf-idf score, in the 'links' column of the dataframe.
reptile_df = data_dict_reptile['df']
for name, top_words in data_dict_reptile['top_words'].items():
    # animal names among the top words that have a score > 0
    link_scores = {
        word: score
        for word, score in top_words.items()
        if score > 0 and word in animal_names_set
    }
    if not link_scores:
        continue

    # Keep every matching animal, highest score first. Building the dict
    # with the page name as the only key would retain a single match.
    ranked_links = dict(sorted(link_scores.items(), key=lambda item: item[1], reverse=True))

    # add to the df (stored as a string, one value per matching row)
    reptile_df.loc[reptile_df['name'] == name, 'links'] = str(ranked_links)

In [99]:
data_dict_reptile["df"]

Unnamed: 0,page-name,name,text,links,family
0,/wiki/chironius_laurenti,chironius_laurenti,Chironius laurenti is a species of nonvenomous...,[],colubridae
1,/wiki/dasia_olivacea,dasia_olivacea,"Dasia olivacea, the olive dasia or olive tree ...",{'skink': 0.05847395211458206},scincidae
2,/wiki/alabama_map_turtle,alabama_map_turtle,The Alabama map turtle (Graptemys pulchra) is ...,{'turtle': 0.27575668692588806},emydidae
3,/wiki/geoscincus,geoscincus,Geoscincus is a monotypic genus of skinks: the...,{'skink': 0.11515277624130249},scincidae
4,/wiki/gonyosoma_oxycephalum,gonyosoma_oxycephalum,"Elaphe oxycephala (Boie, 1827) Gonyosoma oxyce...",[],colubridae
...,...,...,...,...,...
4339,/wiki/eugongylus_albofasciolatus,eugongylus_albofasciolatus,The white-striped cape skink or barred shark s...,{'skink': 0.3205573856830597},scincidae
4340,/wiki/eumecia_johnstoni,eumecia_johnstoni,Eumecia johnstoni is a species of skink found ...,{'skink': 0.22512587904930115},scincidae
4341,/wiki/giant_plated_lizard,giant_plated_lizard,The giant plated lizard (Matobosaurus validus)...,{'lizard': 0.22714518010616302},gerrhosauridae
4342,/wiki/eutropis_multicarinata,eutropis_multicarinata,Eutropis multicarinata is a species of skink f...,{'skink': 0.2291412651538849},scincidae


In [101]:
# find entry by name
data_dict_reptile_long["df"][data_dict_reptile_long["df"]["name"] == "argentine_snake-necked_turtle"]
# data_dict_reptile_long["df"]

Unnamed: 0,page-name,name,links,text,family
472,/wiki/argentine_snake-necked_turtle,argentine_snake-necked_turtle,"[irwin%27s_turtle, gulf_snapping_turtle, phryn...",The Argentine snake-necked turtle (Hydromedusa...,chelidae


![alternative text](figures/network.png)

The figure above shows the network graph for the file data_plain_reptile, which tracks how many times each page refers back to the 224-entry list of animals (the main animals). We see clear clustering around the nodes Lizard, Skink, Turtle, Snake and Gecko, to name the largest. Let us investigate whether our text analysis shows the same link for some of the nodes in these clusters.


We will look at the animal laudakia_dayana, which can faintly be seen in the lizard cluster.
The result is shown below: "lizard" comes in at 20th place according to tf-idf score and is also visible in the wordcloud.

In [None]:
# Word cloud and tf-idf scores of the top words for laudakia_dayana
sample_name = 'laudakia_dayana'
sample_scores = data_dict_reptile["top_words"][sample_name]
generate_wordcloud(sample_scores, sample_name)
sample_scores

Let us investigate another one. This time we look at geophis_fulvoguttatus, which can be seen in the snake cluster.
Our hypothesis is that "snake" can also be seen in the wordcloud and in the tf-idf scores.

The wordcloud and the top words below indicate that this sample has quite a strong connection to "snake", which is the 3rd highest ranked word. But we also see that there are not many words on the page at all, only about 15 in total.

In [None]:
# Word cloud and tf-idf scores of the top words for geophis_fulvoguttatus
sample_name = 'geophis_fulvoguttatus'
sample_scores = data_dict_reptile["top_words"][sample_name]
generate_wordcloud(sample_scores, sample_name)
sample_scores

In [None]:
# Word cloud and tf-idf scores of the top words, taken from the long reptile dataset
sample_name = "northern_yellow-faced_turtle"
sample_scores = data_dict_reptile_long["top_words"][sample_name]
generate_wordcloud(sample_scores, sample_name)
sample_scores

In [None]:
# Word cloud and tf-idf scores of the top words, taken from the long reptile dataset
sample_name = "argentine_snake-necked_turtle"
sample_scores = data_dict_reptile_long["top_words"][sample_name]
generate_wordcloud(sample_scores, sample_name)
sample_scores

In [85]:
# We want to investigate whether there is a correlation between the top words and the links
# We will do this by checking whether the links are among the top 20 words

for name, top_words in data_dict_reptile_long["top_words"].items():
    
    

SyntaxError: expected ':' (2434911645.py, line 4)