# Method 1: Here we use Pandas & SQLAlchemy to ingest fake data into the Postgres database.

In [3]:
# Method 1: insert fake rows with SQLAlchemy, then read the table back through the
# same connection and turn the result into a pandas DataFrame.

import os

# create_engine builds the engine that sends our SQL statements to the database,
# text wraps a raw SQL string, and inspect exposes schema metadata.
from sqlalchemy import create_engine, text, inspect
from faker import Faker
import pandas as pd

# Create the engine.
# The connection URL can be overridden with the DATABASE_URL environment variable so
# that real credentials never have to be written into the notebook; the fallback is
# the URL this notebook originally hard-coded.
engine = create_engine(
    os.environ.get('DATABASE_URL', 'postgresql://myuser:mypassword@postgres/mydatabase')
)

# Checking the table names within the database
insp = inspect(engine)
print(insp.get_table_names(schema="schema_test")) # Postgres folds unquoted identifiers to lower case

# Connect to the database, insert the fake rows, then read the table back
with engine.connect() as conn:

    faker = Faker('en_US')

    # Build all fake rows up front.
    # created_at is not supplied here (per the original note it is defined in init.sql).
    rows = [
        {
            "test_id": faker.random_int(min=1, max=200),
            "amount": faker.random_int(min=100, max=10000),
        }
        for _ in range(10)
    ]

    # Bound parameters (:test_id, :amount) keep the values out of the SQL string.
    # Passing a list of dicts sends all rows in one executemany call instead of
    # issuing one execute per row.
    insert_query = text("INSERT INTO SCHEMA_TEST.TABLE_TEST (test_id, amount) VALUES (:test_id, :amount)")
    conn.execute(insert_query, rows)

    # Commit the transaction
    conn.commit() # committing finalizes the changes made within the transaction

    # Fetch the table after inserting the data
    select_query = text("SELECT * FROM SCHEMA_TEST.TABLE_TEST")
    result = conn.execute(select_query) # result object holding the selected rows
    # Pass the column names to the constructor so that an empty result still yields
    # a correctly labelled (empty) DataFrame instead of a length-mismatch error.
    df = pd.DataFrame(result.fetchall(), columns=list(result.keys()))

# Show the first rows of the table after inserting the data
df.head()


['table_test']


Unnamed: 0,test_id,amount,created_at
0,75,2016,2024-04-15 14:39:27.564142
1,28,2016,2024-04-15 14:39:27.564142
2,171,2842,2024-04-15 14:39:27.564142
3,178,9162,2024-04-15 14:39:27.564142
4,117,8347,2024-04-15 14:39:27.564142


# Method 2: Here we use Pandas & SQLAlchemy to ingest fake data into the Postgres database, but quicker.

In [7]:
# Method 2: insert fake rows with SQLAlchemy, then let pandas run the SELECT and
# build the DataFrame in a single call.

import os

# create_engine builds the engine that sends our SQL statements to the database,
# text wraps a raw SQL string, and inspect exposes schema metadata.
from sqlalchemy import create_engine, text, inspect
from faker import Faker
import pandas as pd

# Create the engine.
# The connection URL can be overridden with the DATABASE_URL environment variable so
# that real credentials never have to be written into the notebook; the fallback is
# the URL this notebook originally hard-coded.
engine = create_engine(
    os.environ.get('DATABASE_URL', 'postgresql://myuser:mypassword@postgres/mydatabase')
)

# Checking the table names within the database
insp = inspect(engine)
print(insp.get_table_names(schema="schema_test")) # Postgres folds unquoted identifiers to lower case

# Connect to the database and insert the fake rows
with engine.connect() as conn:

    faker = Faker('en_US')

    # Build all fake rows up front.
    # created_at is not supplied here (per the original note it is defined in init.sql).
    rows = [
        {
            "test_id": faker.random_int(min=1, max=200),
            "amount": faker.random_int(min=100, max=10000),
        }
        for _ in range(10)
    ]

    # Bound parameters (:test_id, :amount) keep the values out of the SQL string.
    # Passing a list of dicts sends all rows in one executemany call instead of
    # issuing one execute per row.
    insert_query = text("INSERT INTO SCHEMA_TEST.TABLE_TEST (test_id, amount) VALUES (:test_id, :amount)")
    conn.execute(insert_query, rows)

    # Commit the transaction
    conn.commit() # committing finalizes the changes made within the transaction

# pandas runs the query through the engine and labels the columns itself
df = pd.read_sql_query("SELECT * FROM SCHEMA_TEST.TABLE_TEST", engine)
df.head()

['table_test']


Unnamed: 0,test_id,amount,created_at
0,200,1909,2024-04-15 14:50:40.478758
1,198,4832,2024-04-15 14:50:40.478758
2,173,1485,2024-04-15 14:50:40.478758
3,174,929,2024-04-15 14:50:40.478758
4,91,1693,2024-04-15 14:50:40.478758


# Method 3: Here we use Pandas & urllib to ingest CSV data from a URL

In [8]:
# urlretrieve downloads a URL to a local file; pandas parses the saved file
from urllib.request import urlretrieve
import pandas as pd

# Remote location of the CSV file: url
url = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1606/datasets/winequality-red.csv'

# Download the file into the working directory; urlretrieve hands back the
# local filename it wrote to (plus the response headers, which we ignore)
saved_path, _ = urlretrieve(url, 'winequality-red.csv')

# Load the semicolon-separated file into a DataFrame and print its first rows
df = pd.read_csv(saved_path, sep=';')
print(df.head())

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8        5 

# Method 4: Here we ingest data from a URL with HTTP requests

In [10]:
# Import package
import requests

# Specify the url: url
url = "http://www.datacamp.com/teach/documentation"

# Package the request, send it and catch the response: r
# The timeout keeps the cell from hanging indefinitely if the server never answers.
r = requests.get(url, timeout=10)

# Make a non-success status visible instead of silently printing an error or
# bot-challenge page as if it were the requested document.
if not r.ok:
    print(f"Warning: {url} answered with HTTP {r.status_code}")

# Extract the response body as a string: html_text
# (named html_text rather than text so it does not shadow sqlalchemy's text(),
# which other cells of this notebook import)
html_text = r.text

# Print the html
print(html_text)

<!DOCTYPE html><html lang="en-US"><head><title>Just a moment...</title><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><meta http-equiv="X-UA-Compatible" content="IE=Edge"><meta name="robots" content="noindex,nofollow"><meta name="viewport" content="width=device-width,initial-scale=1"><style>*{box-sizing:border-box;margin:0;padding:0}html{line-height:1.15;-webkit-text-size-adjust:100%;color:#313131}button,html{font-family:system-ui,-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,Helvetica Neue,Arial,Noto Sans,sans-serif,Apple Color Emoji,Segoe UI Emoji,Segoe UI Symbol,Noto Color Emoji}@media (prefers-color-scheme:dark){body{background-color:#222;color:#d9d9d9}body a{color:#fff}body a:hover{color:#ee730a;text-decoration:underline}body .lds-ring div{border-color:#999 transparent transparent}body .font-red{color:#b20f03}body .big-button,body .pow-button{background-color:#4693ff;color:#1d1d1d}body #challenge-success-text{background-image:url(data:image/svg+xml;base64,PH

# Method 5: Here we scrape the web using BeautifulSoup and HTTP requests

In [12]:
# Import packages
import requests
from bs4 import BeautifulSoup

# Specify url
url = 'https://www.python.org/~guido/'

# Package the request, send it and catch the response: r
# The timeout keeps the cell from hanging indefinitely if the server never answers.
r = requests.get(url, timeout=10)

# Stop here on an HTTP error rather than scraping links out of an error page
r.raise_for_status()

# Extract the response as html: html_doc
html_doc = r.text

# Create a BeautifulSoup object from the HTML: soup
# Naming the parser explicitly makes the result independent of which optional
# parsers happen to be installed and avoids BeautifulSoup's "no parser was
# explicitly specified" warning.
soup = BeautifulSoup(html_doc, 'html.parser')

# Print the title of Guido's webpage
print(soup.title)

# Find all 'a' tags (which define hyperlinks): a_tags
a_tags = soup.find_all('a')

# Print the URLs to the shell
for link in a_tags:
    print(link.get('href'))

<title>Guido's Personal Home Page</title>
pics.html
pics.html
http://www.washingtonpost.com/wp-srv/business/longterm/microsoft/stories/1998/raymond120398.htm
images/df20000406.jpg
http://neopythonic.blogspot.com/2016/04/kings-day-speech.html
http://www.python.org
Resume.html
Publications.html
bio.html
http://legacy.python.org/doc/essays/
http://legacy.python.org/doc/essays/ppt/
interviews.html
pics.html
http://neopythonic.blogspot.com
http://www.artima.com/weblogs/index.jsp?blogger=12088
https://twitter.com/gvanrossum
Resume.html
https://docs.python.org
https://github.com/python/cpython/issues
https://discuss.python.org
guido.au
http://legacy.python.org/doc/essays/
images/license.jpg
http://www.cnpbagwell.com/audio-faq
http://sox.sourceforge.net/
images/internetdog.gif


# Method 6: Here we ingest data from APIs and JSON

In [13]:
# Import packages
import os

import requests

# Base URL of the API: url
url = 'http://www.omdbapi.com/'

# Query parameters. The API key can be supplied through the OMDB_API_KEY
# environment variable so it does not have to live in the notebook; the fallback
# is the key this notebook originally hard-coded. requests URL-encodes the values
# (the space in the title becomes '+').
params = {
    'apikey': os.environ.get('OMDB_API_KEY', '72bc447a'),
    't': 'social network',
}

# Package the request, send it and catch the response: r
# The timeout keeps the cell from hanging indefinitely if the server never answers.
r = requests.get(url, params=params, timeout=10)

# Fail with a clear HTTP error instead of trying to decode an error response
r.raise_for_status()

# Decode the JSON data into a dictionary: json_data
json_data = r.json()

# Print each key-value pair in json_data
for key, value in json_data.items():
    print(key + ': ', value)

Title:  The Social Network
Year:  2010
Rated:  PG-13
Released:  01 Oct 2010
Runtime:  120 min
Genre:  Biography, Drama
Director:  David Fincher
Writer:  Aaron Sorkin, Ben Mezrich
Actors:  Jesse Eisenberg, Andrew Garfield, Justin Timberlake
Plot:  As Harvard student Mark Zuckerberg creates the social networking site that would become known as Facebook, he is sued by the twins who claimed he stole their idea and by the co-founder who was later squeezed out of the business.
Language:  English, French
Country:  United States
Awards:  Won 3 Oscars. 173 wins & 187 nominations total
Poster:  https://m.media-amazon.com/images/M/MV5BOGUyZDUxZjEtMmIzMC00MzlmLTg4MGItZWJmMzBhZjE0Mjc1XkEyXkFqcGdeQXVyMTMxODk2OTU@._V1_SX300.jpg
Ratings:  [{'Source': 'Internet Movie Database', 'Value': '7.8/10'}, {'Source': 'Rotten Tomatoes', 'Value': '96%'}, {'Source': 'Metacritic', 'Value': '95/100'}]
Metascore:  95
imdbRating:  7.8
imdbVotes:  754,796
imdbID:  tt1285016
Type:  movie
DVD:  05 Jun 2012
BoxOffice:  $9

# Method 7: Here we ingest data from APIs and nested JSON

In [14]:
# Import package
import requests

# Assign URL to variable: url
url = 'https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&exintro=&titles=pizza'

# Package the request, send it and catch the response: r
# The timeout keeps the cell from hanging indefinitely if the server never answers.
r = requests.get(url, timeout=10)

# Fail with a clear HTTP error instead of a confusing JSON decode error
r.raise_for_status()

# Decode the JSON data into a dictionary: json_data
json_data = r.json()

# Print the Wikipedia page extract.
# The pages mapping is keyed by a page id; only one title was requested, so take
# the first (only) page instead of hard-coding its id, which would break if the
# id or the requested title ever changed.
pages = json_data['query']['pages']
pizza_extract = next(iter(pages.values()))['extract']
print(pizza_extract)


<link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1033289096">
<p class="mw-empty-elt">

</p>
<p><b>Pizza</b> (<span></span> <i title="English pronunciation respelling"><span>PEET</span>-sə</i>, <span>Italian:</span> <span lang="it-Latn-fonipa">[ˈpittsa]</span>; <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1177148991"><span>Neapolitan:</span> <span lang="nap-Latn-fonipa">[ˈpittsə]</span>) is a dish of Italian origin consisting of a usually round, flat base of leavened wheat-based dough topped with tomato, cheese, and often various other ingredients (such as anchovies, mushrooms, onions, olives, vegetables, meat, etc.), which is then baked at a high temperature, traditionally in a wood-fired oven.</p><p>The term <i>pizza</i> was first recorded in the year 997 AD, in a Latin manuscript from the southern Italian town of Gaeta, in Lazio, on the border with Campania. Raffaele Esposito is often credited for creating modern pizza in Naples. In 20