### Python Package Manager Exercises

1. Read this url and find the 10 most frequent words. romeo_and_juliet = 'http://www.gutenberg.org/files/1112/1112.txt'


In [17]:
import requests
import re
url = 'http://www.gutenberg.org/files/1112/1112.txt'
response = requests.get(url)
print(response)
header = response.headers
print(f'The header in the text: {header}')
check_status = response.status_code
if check_status == 200:
    print('Success!')
else:
    print('An error has occurred in getting response from the server.')
print(check_status)
text = response.text
print(text)
# remove not white spaces and alphanumerics from the text
text = re.sub(r'[^\w\s]','',text) 
split_txt = text.split()
word_count = {}
for word in split_txt:
    word_count[word] = word_count.get(word, 0) + 1

sorted_word_count = sorted(word_count.items(),key=lambda x:x[1],reverse=True)
output = [(word[1],word[0]) for word in sorted_word_count]
print(output[:10])

#The status code shows that there is an error getting the response from the URL.
#The error is due to the fact that the URL is not accessible. 

<Response [404]>
The header in the text: {'date': 'Mon, 20 Jan 2025 14:12:14 GMT', 'server': 'Apache', 'last-modified': 'Mon, 08 Jul 2024 20:23:51 GMT', 'accept-ranges': 'bytes', 'content-length': '6414', 'x-backend': 'gutenweb1', 'content-type': 'text/html'}
An error has occurred in getting response from the server.
404
<!DOCTYPE html>
<html class="client-nojs" lang="en" dir="ltr">
<head>
 <meta charset="UTF-8"/>

<title>404 | Project Gutenberg</title>
 <link rel="stylesheet" href="/gutenberg/style.css?v=1.1">
 <link rel="stylesheet" href="/gutenberg/collapsible.css?1.1">
 <link rel="stylesheet" href="/gutenberg/new_nav.css?v=1.321231">
<link rel="stylesheet" href="/gutenberg/pg-desktop-one.css">
 <meta name="viewport" content="width=device-width, initial-scale=1">
 <meta name="keywords" content="books, ebooks, free, kindle, android, iphone, ipad"/>
 <meta name="google-site-verification" content="wucOEvSnj5kP3Ts_36OfP64laakK-1mVTg-ptrGC9io"/>
 <meta name="alexaVerifyID" content="4WNaC

2. Read the cats API and cats_api = 'https://api.thecatapi.com/v1/breeds' and find :
- the min, max, mean, median, standard deviation of cats' weight in metric units.
- the min, max, mean, median, standard deviation of cats' lifespan in years.
- Create a frequency table of country and breed of cats

In [19]:
import requests
from statistics import mean, median, stdev

# Step 1: Fetch data from the API
cats_api = 'https://api.thecatapi.com/v1/breeds'
response = requests.get(cats_api)
cats_data = response.json()

# Step 2: Helper function to get average from a range like "3 - 5"
def get_average_range(range_str):
    if range_str:
        parts = range_str.split(" - ")
        return (float(parts[0]) + float(parts[1])) / 2  # Average of min and max
    return None

# Step 3: Process the data
weights = []  # To store average weights
lifespans = []  # To store average lifespans
country_breed_count = {}  # To count breeds by country

for cat in cats_data:
    # Get average weight
    weight_range = cat.get("weight", {}).get("metric", "")
    avg_weight = get_average_range(weight_range)
    if avg_weight:
        weights.append(avg_weight)
    
    # Get average lifespan
    lifespan_range = cat.get("life_span", "")
    avg_lifespan = get_average_range(lifespan_range)
    if avg_lifespan:
        lifespans.append(avg_lifespan)
    
    # Count breeds by country
    country = cat.get("origin", "Unknown")
    if country in country_breed_count:
        country_breed_count[country] += 1
    else:
        country_breed_count[country] = 1

# Step 4: Calculate statistics
weight_stats = {
    "min": min(weights),
    "max": max(weights),
    "mean": mean(weights),
    "median": median(weights),
    "stdev": stdev(weights) if len(weights) > 1 else 0,
}

lifespan_stats = {
    "min": min(lifespans),
    "max": max(lifespans),
    "mean": mean(lifespans),
    "median": median(lifespans),
    "stdev": stdev(lifespans) if len(lifespans) > 1 else 0,
}

# Step 5: Print results
print("Weight Statistics (in metric units):", weight_stats)
print("Lifespan Statistics (in years):", lifespan_stats)
print("Frequency Table (Country and Number of Breeds):", country_breed_count)


Weight Statistics (in metric units): {'min': 3.0, 'max': 7.5, 'mean': 4.708955223880597, 'median': 4.5, 'stdev': 1.066533799956462}
Lifespan Statistics (in years): {'min': 10.5, 'max': 19.0, 'mean': 13.746268656716419, 'median': 13.5, 'stdev': 1.5844249849048053}
Frequency Table (Country and Number of Breeds): {'Egypt': 3, 'Greece': 1, 'United States': 28, 'United Arab Emirates': 1, 'Australia': 1, 'France': 2, 'United Kingdom': 8, 'Burma': 2, 'Canada': 3, 'Cyprus': 1, 'Russia': 4, 'China': 1, 'Japan': 1, 'Thailand': 4, 'Isle of Man': 1, 'Norway': 1, 'Iran (Persia)': 1, 'Singapore': 1, 'Somalia': 1, 'Turkey': 2}


3. Read the countries API and find
+ the 10 largest countries
+ the 10 most spoken languages
+ the total number of languages in the countries API

In [None]:
# The website displays this error message: 404 Not Found.

4. UCI is one of the most common places to get data sets for data science and machine learning. Read the content of UCL (https://archive.ics.uci.edu/ml/datasets.php). Without additional libraries it will be difficult, so you may try it with BeautifulSoup4

In [22]:
import requests
url = 'https://archive.ics.uci.edu/ml/datasets.php'
response = requests.get(url)
print(response)
header = response.headers
print(f'The header in the text: {header}')
check_status = response.status_code
if check_status == 200:
    print('Success!')
else:
    print('An error has occurred in getting response from the server.')
print(check_status)

<Response [404]>
The header in the text: {'Server': 'nginx/1.27.3', 'Date': 'Mon, 20 Jan 2025 15:12:24 GMT', 'Content-Type': 'text/html', 'Content-Length': '55692', 'Connection': 'keep-alive', 'etag': '"m9ww13"', 'x-sveltekit-page': 'true'}
An error has occurred in getting response from the server.
404
