# Day 20 Exercise
### Exercise Number 1: Ten most frequent words

In [None]:
import requests
from bs4 import BeautifulSoup
from collections import Counter
import re

def find_most_frequent_words_url(url, limit, timeout=30):
    """Return the ``limit`` most frequent words in the document at ``url``.

    The response body is run through BeautifulSoup's ``html.parser`` to drop
    any markup, lower-cased, and split into runs of word characters
    (letters, digits and underscores). Punctuation therefore splits tokens:
    "don't" is counted as "don" and "t".

    Args:
        url: Address of the HTML or plain-text document to download.
        limit: Number of (word, count) pairs to return; ``None`` returns all.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of counting the words of an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    # The pattern never spans whitespace, so the text needs no separate
    # whitespace-normalisation pass before tokenising.
    words = re.findall(r'\b\w+\b', soup.get_text().lower())
    return Counter(words).most_common(limit)


# Show the ten most common words of the Project Gutenberg text at this URL.
top_ten_words = find_most_frequent_words_url(
    'http://www.gutenberg.org/files/1112/1112.txt',
    10,
)
print(top_ten_words)


### Number 2: Reading the cats API

In [None]:

import requests
import numpy as np

def _first_number(text):
    """Return the first whitespace-separated token of ``text`` as a float.

    Returns ``None`` when ``text`` is not a string, is blank, or does not
    start with a number, so callers can skip unusable entries.
    """
    if not isinstance(text, str):
        return None
    tokens = text.split()
    if not tokens:
        return None
    try:
        return float(tokens[0])
    except ValueError:
        return None


def _describe(values):
    """Return min/max/mean/median/std_dev of ``values``, or ``{}`` if empty."""
    if not values:
        return {}
    samples = np.array(values, dtype=float)
    return {
        'min': np.min(samples),
        'max': np.max(samples),
        'mean': np.mean(samples),
        'median': np.median(samples),
        'std_dev': np.std(samples),
    }


def analyze_cats(api_url, timeout=30):
    """Summarise the metric weights and life spans served by a cat-breeds API.

    Each breed is expected to carry ``weight['metric']`` and ``life_span``
    strings. Both are reduced to their first number, so a range such as
    "3 - 5" contributes 3.0 (the lower bound). Entries that are missing or
    do not start with a number are skipped.

    Args:
        api_url: URL returning a JSON list of breed objects.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(weight_stats, lifespan_stats)`` tuple. Each element is a dict
        with ``min``, ``max``, ``mean``, ``median`` and ``std_dev`` keys, or
        an empty dict when no usable values were found.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(api_url, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    weights = []
    lifespans = []
    for cat in data:
        weight_info = cat.get('weight')
        metric = weight_info.get('metric') if isinstance(weight_info, dict) else None

        # Slicing a fixed number of characters off the end left a trailing
        # space on range strings ("3 - 5" -> "3 "), which made every weight
        # fail the digit check; taking the first token matches how life
        # spans are read.
        weight = _first_number(metric)
        if weight is not None:
            weights.append(weight)

        lifespan = _first_number(cat.get('life_span'))
        if lifespan is not None:
            lifespans.append(lifespan)

    return _describe(weights), _describe(lifespans)


# Print the (weight_stats, lifespan_stats) pair for the breeds endpoint.
cat_statistics = analyze_cats('https://api.thecatapi.com/v1/breeds')
print(cat_statistics)
    

### Number 3: Reading Countries API

In [None]:
import requests
from collections import Counter

def analyze_countries_api(api_url, timeout=30):
    """Report the largest countries and most common languages from an API.

    Args:
        api_url: URL returning a JSON list of country objects. Each object
            may carry a numeric ``area`` and a ``languages`` list of dicts
            that each have a ``name`` key.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(largest_countries, common_languages, total_languages)`` tuple:
        the ten country objects with the greatest ``area``, the ten most
        frequent ``(language, count)`` pairs, and the number of distinct
        language names. ``(None, None, None)`` is returned, after printing
        a message, when the response status is not 200.
    """
    response = requests.get(api_url, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to fetch data. Status code: {response.status_code}")
        return None, None, None

    countries_data = response.json()

    # `or 0` covers both a missing key and an explicit null area; a null
    # would otherwise make the sort raise TypeError when compared to numbers.
    largest_countries = sorted(
        countries_data,
        key=lambda country: country.get('area') or 0,
        reverse=True,
    )[:10]

    languages = [
        lang['name']
        for country in countries_data
        for lang in (country.get('languages') or [])
    ]
    common_languages = Counter(languages).most_common(10)
    total_languages = len(set(languages))

    return largest_countries, common_languages, total_languages

# Fetch the country data once, then print each of the three results.
largest, spoken_languages, total_langs = analyze_countries_api(
    'https://restcountries.com/v2/all'
)

for result in (largest, spoken_languages):
    print(result)
print(f'The total languages are: {total_langs}')


### Number 4: Reading the content of UCI

In [None]:
import requests
from bs4 import BeautifulSoup

def explore_uci_datasets(url, timeout=30):
    """Return the ``href`` of every link on the page that mentions "datasets".

    Args:
        url: Address of the HTML page to scan.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``href`` strings, in document order, whose value contains
        the substring ``datasets``. Values are returned as written in the
        page, so relative links stay relative and duplicates are kept.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of scanning the links of an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    # href=True skips anchors that have no href attribute at all.
    return [
        anchor['href']
        for anchor in soup.find_all('a', href=True)
        if 'datasets' in anchor['href']
    ]

# Example run: collect the dataset links from the UCI archive page and show them.
# NOTE(review): the URL ends in a bare '.', possibly a truncated '.php' — confirm.
uci_page_url = 'https://archive.ics.uci.edu/ml/datasets.'
uci_datasets = explore_uci_datasets(uci_page_url)
print(uci_datasets)
