## Libraries and settings

In [31]:
# Libraries
import os
import re
import pytz
import json
import folium
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt

from datetime import date
from datetime import time

from zipfile import ZipFile
from bs4 import BeautifulSoup

from PyPDF2 import PdfReader

from reportlab.lib.units import inch
from reportlab.lib.colors import blue
from reportlab.lib.pagesizes import LETTER
from reportlab.pdfgen.canvas import Canvas

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Show current working directory
print(os.getcwd())

/workspaces/data_ingestion/01_Input_Output_and_Formatting_Python


### Datei Struktur eines db-Files überprüfen

In [32]:
# Create e new db
conn = sqlite3.connect('example_sqlite.db')

# Close connection to db
conn.close()

In [33]:
import sqlite3

# Verbindung zur SQLite-Datenbank herstellen
conn = sqlite3.connect("example_sqlite.db")

# 1. Anzeigen aller Tabellen in der Datenbank
cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()
print("Tabellen in der Datenbank:")
for table in tables:
    print(" -", table[0])
    
# 2. Struktur jeder Tabelle anzeigen
for table in tables:
    print(f"\nStruktur der Tabelle '{table[0]}':")
    cursor = conn.execute(f"PRAGMA table_info({table[0]})")
    columns = cursor.fetchall()
    for column in columns:
        print(f"Spalte: {column[1]}, Typ: {column[2]}")

# Verbindung schließen
conn.close()

Tabellen in der Datenbank:
 - COMPANY

Struktur der Tabelle 'COMPANY':
Spalte: index, Typ: INTEGER
Spalte: NAME, Typ: TEXT
Spalte: AGE, Typ: INTEGER
Spalte: CITY, Typ: TEXT
Spalte: SALARY, Typ: INTEGER


### CSV (comma separated value)

- A comma-separated values (CSV) file is a delimited text file.
- Each line of the file is a data record.
- Each record consists of one or more fields, separated by a separator (default = comma).
- The use of the comma as a field separator is the source of the name for this file format.
- The seperator can also be user-defined, e.g. you can also use a semicolon instead of a comma.
- A CSV file typically stores tabular data (numbers and text).

In [34]:
import pandas as pd

# Nur die ersten Zeilen einlesen, um die Struktur anzusehen
data = pd.read_csv("example_write.csv", sep=";", nrows=5)
print(data.columns)  # Spaltennamen anzeigen
print(data.head())   # Erste Zeilen der Datei anzeigen


Index(['Unnamed: 0', 'chicken_id', 'weight', 'breed', 'eggs_per_year'], dtype='object')
   Unnamed: 0  chicken_id       weight              breed  eggs_per_year
0           0           1  2728.854920  New Hampshire Red            158
1           1           2  2323.761365         Australorp            273
2           2           3  2635.062034  New Hampshire Red            153
3           3           4  2603.985152      Plymouth Rock            268
4           4           5  3079.394487         Australorp            360


In [35]:
# Read data from .csv-file using pandas
data = pd.read_csv("example.csv", sep=";")

# Print the header info of data (first five rows)
print(data.head(5))

# Write data to csv
data.to_csv("example_write.csv", sep=";")

   chicken_id       weight              breed  eggs_per_year
0           1  2728.854920  New Hampshire Red            158
1           2  2323.761365         Australorp            273
2           3  2635.062034  New Hampshire Red            153
3           4  2603.985152      Plymouth Rock            268
4           5  3079.394487         Australorp            360


### TXT (plain text)

- In Plain Text file format, everything is written in plain text
- Usually, this text is in unstructured form and there is no meta-data associated with it
- The TXT file format can easily be read by any program

In [36]:
# Ersten 10 Zeilen einer Textdatei anzeigen
with open("example.txt", "r", encoding="utf-8") as file:
    for i in range(10):
        print(file.readline())


Dorothy lived in the midst of the great Kansas prairies, with Uncle Henry, who was a farmer, and Aunt Em, who was the farmer’s wife. Their house was small, for the lumber to build it had to be carried by wagon many miles. There were four walls, a floor and a roof, which made one room; and this room contained a rusty looking cookstove, a cupboard for the dishes, a table, three or four chairs, and the beds. Uncle Henry and Aunt Em had a big bed in one corner, and Dorothy a little bed in another corner. There was no garret at all, and no cellar—except a small hole dug in the ground, called a cyclone cellar, where the family could go in case one of those great whirlwinds arose, mighty enough to crush any building in its path. It was reached by a trap door in the middle of the floor, from which a ladder led down into the small, dark hole.











In [37]:
# Erste Zeile einlesen und Trennzeichen prüfen
with open("example.txt", "r", encoding="utf-8") as file:
    first_line = file.readline()
    print("Erste Zeile:", first_line)

    # Überprüfen auf gängige Trennzeichen
    if "," in first_line:
        print("Trennzeichen: Komma")
    elif "\t" in first_line:
        print("Trennzeichen: Tab")
    elif ";" in first_line:
        print("Trennzeichen: Semikolon")
    else:
        print("Keine gängigen Trennzeichen gefunden")


Erste Zeile: Dorothy lived in the midst of the great Kansas prairies, with Uncle Henry, who was a farmer, and Aunt Em, who was the farmer’s wife. Their house was small, for the lumber to build it had to be carried by wagon many miles. There were four walls, a floor and a roof, which made one room; and this room contained a rusty looking cookstove, a cupboard for the dishes, a table, three or four chairs, and the beds. Uncle Henry and Aunt Em had a big bed in one corner, and Dorothy a little bed in another corner. There was no garret at all, and no cellar—except a small hole dug in the ground, called a cyclone cellar, where the family could go in case one of those great whirlwinds arose, mighty enough to crush any building in its path. It was reached by a trap door in the middle of the floor, from which a ladder led down into the small, dark hole.
Trennzeichen: Komma


In [38]:
# Open a connection to the text-file (r = read)
text_file = open("example.txt", 
                 "r", 
                 encoding='utf-8')

# Read data from .txt file
lines = text_file.read()

# Show type
print(type(lines))

# Print the data
print(lines)

<class 'str'>
Dorothy lived in the midst of the great Kansas prairies, with Uncle Henry, who was a farmer, and Aunt Em, who was the farmer’s wife. Their house was small, for the lumber to build it had to be carried by wagon many miles. There were four walls, a floor and a roof, which made one room; and this room contained a rusty looking cookstove, a cupboard for the dishes, a table, three or four chairs, and the beds. Uncle Henry and Aunt Em had a big bed in one corner, and Dorothy a little bed in another corner. There was no garret at all, and no cellar—except a small hole dug in the ground, called a cyclone cellar, where the family could go in case one of those great whirlwinds arose, mighty enough to crush any building in its path. It was reached by a trap door in the middle of the floor, from which a ladder led down into the small, dark hole.


In [39]:
# Write data to .txt
lines = ['Dorothy lived in the midst of the great Kansas prairies', 
         'with Uncle Henry, who was a farmer ...']

with open('example_write.txt', 'w') as f:
    f.writelines(lines)
    
# Check whether file exists
files = [f for f in os.listdir('.') if re.match('example_write.txt', f)]
print(files)

['example_write.txt']


### JSON (JavaScript Object Notation)

- JSON is a syntax for storing and exchanging data
- JSON is text, written with JavaScript object notation

In [40]:
import json

# Ersten Teil der JSON-Datei einlesen, um die Struktur zu sehen
with open("example.json", "r") as f:
    data = json.load(f)
    print(data.keys()) if isinstance(data, dict) else print(data[0].keys())


dict_keys(['firstName', 'lastName', 'age', 'address'])


In [41]:
# Read data
with open('example.json', 'r') as f:
    data = json.load(f)
print(data)

# Read data to a data frame using the pandas library
data = pd.read_json("example.json")

# Print the data
print(data)

{'firstName': 'John', 'lastName': 'Smith', 'age': 27, 'address': {'streetAddress': '21 2nd Street', 'city': 'New York', 'state': 'NY', 'postalCode': '10021-3100'}}
              firstName lastName  age        address
streetAddress      John    Smith   27  21 2nd Street
city               John    Smith   27       New York
state              John    Smith   27             NY
postalCode         John    Smith   27     10021-3100


In [42]:
# Write data to .json 
data.to_json('example_write.json')

# Check whether the file exists
files = [f for f in os.listdir('.') if re.match('example_write.json', f)]
print(files)

['example_write.json']


### XML (extensible markup language)

- XML stands for extensible Markup Language
- XML is a markup language much like HTML
- XML was designed to store and transport data
- XML was designed to be self-descriptive
- XML is a W3C Recommendation

In [43]:
from bs4 import BeautifulSoup

# Struktur der XML-Datei ohne komplettes Parsen prüfen
with open("example.xml", "r") as file:
    soup = BeautifulSoup(file, "xml")
    print(soup.prettify()[:500])  # Ersten Teil anzeigen


<?xml version="1.0" encoding="utf-8"?>
<breakfast_menu>
 <food>
  <name>
   Belgian Waffles
  </name>
  <price>
   $5.95
  </price>
  <description>
   Two of our famous Belgian Waffles with plenty of real maple syrup
  </description>
  <calories>
   650
  </calories>
 </food>
</breakfast_menu>



In [44]:
# First option: reading the xml file with BeautifulSoup
bs = BeautifulSoup(open('example.xml'), 'html.parser')
print(bs.prettify())

# Second option: using pandas and convert thr xml file to a data frame
data = pd.read_xml("example.xml")
print("------------------------")
print(data[["name","price"]])

<?xml version="1.0" encoding="UTF-8"?>
<breakfast_menu>
 <food>
  <name>
   Belgian Waffles
  </name>
  <price>
   $5.95
  </price>
  <description>
   Two of our famous Belgian Waffles with plenty of real maple syrup
  </description>
  <calories>
   650
  </calories>
 </food>
</breakfast_menu>

------------------------
              name  price
0  Belgian Waffles  $5.95


In [45]:
# Second option: reading xml using .read_xml() from pandas
data = pd.read_xml("example.xml")
print(data)

# Write data to .xml
data.to_xml('example_write.xml')

# Check whether file exists
files = [f for f in os.listdir('.') if re.match('example_write.xml', f)]
print(files)

              name  price                                        description  \
0  Belgian Waffles  $5.95  \n   Two of our famous Belgian Waffles with pl...   

   calories  
0       650  
['example_write.xml']


### HTML (hyper text markup language)

- HTML stands for Hyper Text Markup Language
- HTML is the standard markup language for creating Web pages
- HTML describes the structure of a Web page
- HTML consists of a series of elements
- HTML elements tell the browser how to display the content
- HTML elements label pieces of content such as "this is a heading", "this is a paragraph", "this is a link", etc.

In [46]:
# Read data from .html
filename = 'example.html'
html = open(filename, "r").read()
print(html)

<!DOCTYPE html>
<html>
<head>
<title>Page Title</title>
</head>
<body><h1>My First Heading</h1>
<p>My first paragraph.</p></body>
</html>


In [47]:
# Write data to .html (taking the html-file from above)
with open('example_write.html', 'w') as f:
    f.writelines(html)
    
# Check whether file exists
files = [f for f in os.listdir('.') if re.match('example_write.html', f)]
print(files)

['example_write.html']


### ZIP (archive file format)

- ZIP is an archive file format that supports lossless data compression
- A ZIP file may contain one or more files or directories that may have been compressed
- The ZIP file format permits a number of compression algorithms, though DEFLATE is the most common
- The name "zip" (meaning "move at high speed") was suggested by R. Mahoney
- They wanted to imply that their product would be faster than ARC and other compression formats of the time

In [48]:
from zipfile import ZipFile

# ZIP-Datei öffnen und Struktur anzeigen
with ZipFile("example_write.zip", "r") as zip_obj:
    # Liste der Dateien im ZIP-Archiv
    zip_info = zip_obj.infolist()
    for file_info in zip_info:
        print(f"Dateiname: {file_info.filename}, Größe: {file_info.file_size} bytes")


Dateiname: example.csv, Größe: 837 bytes
Dateiname: example.html, Größe: 137 bytes
Dateiname: example.json, Größe: 193 bytes


In [49]:
# Pandas supports zip file reads
data = pd.read_csv("archive.zip", sep=";")
data.head(5)

Unnamed: 0,chicken_id,weight,breed,eggs_per_year
0,1,2728.85492,New Hampshire Red,158
1,2,2323.761365,Australorp,273
2,3,2635.062034,New Hampshire Red,153
3,4,2603.985152,Plymouth Rock,268
4,5,3079.394487,Australorp,360


In [50]:
# Create an empty Zip-archive
zipObj = ZipFile('example_write.zip', 'w')

# Add selected files to the zip archive
zipObj.write('example.csv')
zipObj.write('example.html')
zipObj.write('example.json')

# Close the Zip-archive
zipObj.close()

# Check whether zip-file exists
files = [f for f in os.listdir('.') if re.match('example_write.zip', f)]
print(files)

['example_write.zip']


### XLSX (Microsoft Excel Open XML file format)

- It is an XML-based file format created by Microsoft Excel 
- The XLSX format was introduced with Microsoft Office 2007
- In XLSX data is organized under the cells and columns in a sheet
- Each XLSX file may contain one or more sheets
- A single workbook can contain multiple sheets

In [51]:
# Read data from an example .xlsx-file
data = pd.read_excel("example.xlsx", sheet_name = "sheet1")

# Print the data 
data.head(5)

Unnamed: 0,chicken_id,weight,breed,eggs_per_year
0,1,2728.85492,New Hampshire Red,158
1,2,2323.761365,Australorp,273
2,3,2635.062034,New Hampshire Red,153
3,4,2603.985152,Plymouth Rock,268
4,5,3079.394487,Australorp,360


In [52]:
import pandas as pd

# Excel-Datei laden, aber nur die Struktur prüfen
excel_file = pd.ExcelFile("example.xlsx")
print("Sheets:", excel_file.sheet_names)  # Namen der Blätter anzeigen

# Erste Zeilen eines Blatts anzeigen, z. B. 'Sheet1'
sheet_data = pd.read_excel("example.xlsx",  nrows=5)
print(sheet_data.columns)  # Spaltennamen anzeigen
print(sheet_data.head())   # Erste Zeilen anzeigen


Sheets: ['sheet1']
Index(['chicken_id', 'weight', 'breed', 'eggs_per_year'], dtype='object')
   chicken_id       weight              breed  eggs_per_year
0           1  2728.854920  New Hampshire Red            158
1           2  2323.761365         Australorp            273
2           3  2635.062034  New Hampshire Red            153
3           4  2603.985152      Plymouth Rock            268
4           5  3079.394487         Australorp            360


In [53]:
# Write data to xlsx
data.to_excel('example_write.xlsx', sheet_name = "sheet1")

# Check whether file exists
files = [f for f in os.listdir('.') if re.match('example_write.xlsx', f)]
print(files)

['example_write.xlsx']


### PDF (portable document format)

- PDF is a file format developed by Adobe in the 1990s to present documents, including text formatting and images, in a manner independent of application software, hardware, and operating systems
- Based on the PostScript language, each PDF file encapsulates a complete description of a fixed-layout flat document, including the text, fonts, vector graphics, raster images and other information needed to display it

In [54]:
from PyPDF2 import PdfReader

# PDF-Datei laden und Metadaten überprüfen
reader = PdfReader("example.pdf")
print("Anzahl Seiten:", len(reader.pages))
print("Metadaten:", reader.metadata)


Anzahl Seiten: 1
Metadaten: {'/Author': 'Gellrich Mario (gell)', '/Company': '', '/CreationDate': "D:20220306135939+01'00'", '/Creator': 'Acrobat PDFMaker 21 für Excel', '/MSIP_Label_10d9bad3-6dac-4e9a-89a3-89f3b8d247b2_ActionId': '6c412dd4-ac43-489a-8e51-585029c84fdd', '/MSIP_Label_10d9bad3-6dac-4e9a-89a3-89f3b8d247b2_ContentBits': '0', '/MSIP_Label_10d9bad3-6dac-4e9a-89a3-89f3b8d247b2_Enabled': 'true', '/MSIP_Label_10d9bad3-6dac-4e9a-89a3-89f3b8d247b2_Method': 'Standard', '/MSIP_Label_10d9bad3-6dac-4e9a-89a3-89f3b8d247b2_Name': '10d9bad3-6dac-4e9a-89a3-89f3b8d247b2', '/MSIP_Label_10d9bad3-6dac-4e9a-89a3-89f3b8d247b2_SetDate': '2022-03-06T12:52:42Z', '/MSIP_Label_10d9bad3-6dac-4e9a-89a3-89f3b8d247b2_SiteId': '5d1a9f9d-201f-4a10-b983-451cf65cbc1e', '/ModDate': "D:20220306135940+01'00'", '/Producer': 'Adobe PDF Library 21.11.71'}


In [55]:
# Reading metadata
reader = PdfReader("example.pdf")
meta = reader.metadata
print(len(reader.pages))

# All of the following could be None!
print(meta.author)
print(meta.creator)
print(meta.producer)
print(meta.subject)

# Extract text
page = reader.pages[0]
print('\n')
print(page.extract_text())

# Number of pages
print('\n')
print(f'Number of pages in PDF: {len(reader.pages)}')


1
Gellrich Mario (gell)
Acrobat PDFMaker 21 für Excel
Adobe PDF Library 21.11.71
None


This is a PDF-File
Table 1:
Pet Color
Dog brown
Cat yellow
Bird green
Dog black
Cat gray
Bird green
Dog red
Cat blue


Number of pages in PDF: 1


In [56]:
# Create a canvas
canvas = Canvas("example_write.pdf", pagesize = LETTER)

# Set font to Times New Roman with 36-point size
canvas.setFont("Times-Roman", 36)

# Draw blue text one inch from the left and ten inches from the bottom
canvas.setFillColor(blue)
canvas.drawString(1 * inch, 10 * inch, "This is a PDF file ...")

# Save the PDF file
canvas.save()

# Check whether file exists
files = [f for f in os.listdir('.') if re.match('example_write.pdf', f)]
print(files)

['example_write.pdf']


### Jupyter notebook --footer info-- (please always provide this at the end of each notebook)

In [57]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

print('-----------------------------------')
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')

-----------------------------------
POSIX
Linux | 6.5.0-1025-azure
Datetime: 2024-11-09 20:21:12
Python Version: 3.11.10
-----------------------------------
