# Before we start analysing data, we need to access that data! (of course!!!) #
## So, let us begin ... ##



### 🎓 Learning Objectives
1.  **Master File Formats:** CSV, JSON, Parquet, PDF.
2.  **Defensive Coding:** Error handling patterns.
3.  **Validation:** Data quality checks.
---


In [None]:
def validate_import(df, name="Dataset"):
    """Print a short quality report for a freshly imported DataFrame.

    The report shows the frame's shape and the total number of missing
    cells across all columns, followed by a 30-character separator line.

    Args:
        df: The pandas DataFrame to summarise.
        name: Label shown in the report header.

    Returns:
        None. The report is written to standard output.
    """
    print(f"\n🔍 REPORT: {name}")
    dimensions = df.shape
    print(f"   Shape: {dimensions}")
    # First sum counts missing cells per column, second sum totals the columns.
    missing_per_column = df.isnull().sum()
    missing_total = missing_per_column.sum()
    print(f"   Missing: {missing_total}")
    print("-" * 30)



# Open just one csv file

In [None]:
# importing the widely-used pandas module; documentation: https://pandas.pydata.org/pandas-docs/stable/index.html
import pandas as pd # giving it a name pd so that we can type 'pd' later rather than typing 'pandas' in all commands; 


### By the way, anything written after a hash symbol (#) on a line is treated as a comment, not as code ###

In [None]:
import os # os is another module we are importing to deal with directories/folder paths
os.getcwd() # this is to print or get cURRENT wORKING dIRECTORY

In [None]:
df = pd.read_csv('/home/jovyan/DataFile/SameCol1.csv') # calling the dataframe df is standard practice; but you can call it df_1, df2 or anything

In [None]:
# 🛡️ Defensive Coding Pattern
# Wrap a read that may fail in try/except and catch only the specific error expected.
try:
    # Intentionally failing import: 'missing_file.csv' is not expected to exist
    pd.read_csv('missing_file.csv')
except FileNotFoundError:
    # Only a missing file is handled here; any other exception still propagates
    print("✅ Caught expected error: File not found")



In [None]:
df # taking a look at the whole df

In [None]:
df.info()

In [None]:
df.describe() # for summary statistics of numerical value columns only

# Open one text file

In [None]:
path = '/home/jovyan/DataFile' # path to the online files - 'jovyan' seems to be a default folder in this online environment (see above)

In [None]:
path

In [None]:
### reading one text file
# The `with` block closes the file automatically when it ends, so no explicit
# close() call is needed. The encoding is stated explicitly so the text is
# decoded the same way regardless of the platform default.
with open(path + '/Test2.txt', "r", encoding="utf8") as file:
    text_test = file.read()

In [None]:
text_test

# To access multiple text files

In [None]:
import glob
import os

In [None]:
path = '/home/jovyan/DataFile' #path to the file

In [None]:
# glob returns matches in arbitrary order; sorting makes the list (and everything
# built from it) come out in the same order on every run.
file_list = sorted(glob.glob(path + "/*.txt"))

In [None]:
file_list

In [None]:
corpus = []


In [None]:
# Start from an empty list every time so that re-running this cell does not
# append the same documents twice and push `corpus` out of step with `file_list`.
corpus = []
for file_path in file_list:
    # utf8 is stated explicitly so the text is decoded the same way on every platform
    with open(file_path, encoding="utf8") as f_input:
        corpus.append(f_input.read())



In [None]:
corpus

In [None]:
import pandas as pd


In [None]:
df = pd.DataFrame(file_list, columns=['Filenames'])

In [None]:
df

In [None]:
# Assign the list directly: one entry per row, in the same order as `file_list`.
# A plain list must match the number of rows exactly, so a mismatch raises a
# ValueError instead of being silently aligned on the index.
df["Content"] = corpus

In [None]:
df.head(3)

In [None]:
# save the dataframe as a csv file
df.to_csv('/home/jovyan/Text_combined.csv', index=False) # check that a file named Text_combined.csv has been saved (see the left pane)

# Open multiple csv files 

In [None]:
import pandas as pd
import glob

In [None]:
path = '/home/jovyan/DataFile' #path to the file

In [None]:
# glob returns matches in arbitrary order; sorting fixes the order in which the
# files are later combined, so the combined table is reproducible between runs.
file_list = sorted(glob.glob(path + "/*.csv"))

In [None]:
file_list

In [None]:
# Read every csv in file_list and stack them into one dataframe.
# ignore_index=True gives the combined table a fresh 0..n-1 index; without it
# each file keeps its own 0, 1, 2, ... labels and the index contains duplicates.
df = pd.concat(map(pd.read_csv, file_list), ignore_index=True)

In [None]:
df

In [None]:
# save the dataframe as a csv file
df.to_csv('/home/jovyan/CSV_combined.csv', index=False) # check that a file named CSV_combined.csv has been saved (see the left pane)

# Open json files

In [None]:
import json

In [None]:
path = '/home/jovyan/DataFile' #path to the file

In [None]:
File = path+'/matches_England.json'

In [None]:
File

In [None]:
# json.load parses straight from the open file object, so there is no need to
# read the text first and pass it to json.loads. The encoding is stated
# explicitly so decoding does not depend on the platform default.
with open(File, 'r', encoding="utf8") as f:
    data = json.load(f)

In [None]:
#print(json.dumps(data, indent = 4, sort_keys=True)) ## only use in a desktop environment

In [None]:
data[1]["venue"] # let us explore some of the data within the json file

In [None]:
data[4]["venue"]

In [None]:
data[4]["referees"]

In [None]:
data[4]["referees"][2]

In [None]:
data[4]["referees"][2]["refereeId"]

In [None]:
data[4]["referees"][2]["role"]

# Open one parquet file

##### Make sure pyarrow or similar module is installed first <b> (see below to install) </b>

In [None]:
ls

In [None]:
#!pip install pyarrow==18.0.0

In [None]:
import pyarrow.parquet as pq

In [None]:
table = pq.ParquetFile('/home/jovyan/DataFile/ABC_file.parquet')

In [None]:
table.schema

In [None]:
print(table.schema)

In [None]:
df = pd.read_parquet('/home/jovyan/DataFile/ABC_file.parquet', engine='pyarrow') 

In [None]:
df.head(2)

# Access one pdf file

In [None]:
path = '/home/jovyan/DataFile' #path to the file

In [None]:
# Install modern pypdf (preferred over PyPDF2)
!pip install pypdf



In [None]:
# 📄 Reading PDF files with pypdf
# 'pypdf' (or 'PyPDF2>=3.0') provides 'PdfReader', which replaces the old 'PdfFileReader'

from pypdf import PdfReader

# Step 1 - open the document
reader = PdfReader(path + "/Microsoft 2021 Annual Report.pdf")

# Step 2 - count the pages (old API: .numPages, new API: len(reader.pages))
num_pages = len(reader.pages)
print(f"Total Pages: {num_pages}")

# Step 3 - take the text of the first page and preview its first 300 characters
first_page_text = reader.pages[0].extract_text()
print("\n--- First Page Snippet ---")
print(first_page_text[:300], "...", sep="")



In [None]:
# 4. Loop through ALL pages
print("\n--- Processing All Pages ---")

# Extract the text of each page lazily, skip pages that yield no text, and join
# everything once at the end. Joining once avoids rebuilding an ever-growing
# string with += on every page.
page_texts = (page.extract_text() for page in reader.pages)
full_text = "".join(text for text in page_texts if text)

print(f"Successfully extracted {len(full_text)} characters from {num_pages} pages.")



## 🌡️ Research Corner: Climate Data Formats
| Format | Extension | Use Case |
|:---|:---|:---|
| **NetCDF** | `.nc` | Satellite data |
| **Parquet** | `.parquet` | Big Data tables |



# Accessing image

In [None]:
import cv2

In [None]:
path = '/home/jovyan/DataFile' #path to the file

In [None]:
import os

In [None]:
os.chdir(path)

In [None]:
image = cv2.imread("Test_image.png")
# cv2.imread does not raise when the file is missing or unreadable; it returns
# None. Checking here gives a clear error now instead of a confusing TypeError
# when the image is indexed later.
if image is None:
    raise FileNotFoundError(
        "Could not read 'Test_image.png' - check that it exists in the current working directory"
    )

In [None]:
image[0][0]

In [None]:
image[0]

In [None]:
image

# Accessing web page

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
url = "http://books.toscrape.com/catalogue/category/books/travel_2/index.html" 

In [None]:
# A timeout stops the cell from hanging indefinitely if the site does not respond.
response = requests.get(url, timeout=10) # getting the information from webpage
# Turn HTTP error responses (4xx/5xx) into an exception, so an error page is
# never parsed as if it were the real content.
response.raise_for_status()

In [None]:
soup = BeautifulSoup(response.content, "html.parser") # parsing the extracted text to get required information

In [None]:
titles = soup.find_all('h3') # find title

In [None]:
prices = soup.find_all('p', class_='price_color') #find price

In [None]:
stocks = soup.find_all('p', class_='instock availability') #find availability

In [None]:
titles # title.get_text()

In [None]:
for ABCDE in titles:
    print(ABCDE.get_text()) # ABCDE can be changed to anything like Title, Names, etc.

# Accessing Word document

In [None]:
path = '/home/jovyan/DataFile' #path to the file

In [None]:
import os

In [None]:
os.chdir(path) # no need to execute if already done above; you can ignore the error message

In [None]:
ls

In [None]:
! pip install docx2txt==0.8

In [None]:
import docx2txt

In [None]:
result = docx2txt.process("Test_Doc.docx")                  

In [None]:
result

# Accessing Excel

In [None]:
! pip3 install openpyxl==3.1.0 #if you are using Anaconda, install it differently without using pip

In [None]:
import pandas as pd

In [None]:
df = pd.read_excel('Test_spreadsheet.xlsx', sheet_name = "Phones")

In [None]:
df

# Accessing Stata file

In [None]:
path = '/home/jovyan/DataFile' #path to the file

In [None]:
import os

In [None]:
os.getcwd() # chdir

In [None]:
import pandas as pd

In [None]:
ls

In [None]:
df = pd.read_stata("MatrixPractice.dta")

In [None]:
df

# Accessing zipped files

# Zip files

In [None]:
path = '/home/jovyan/DataFile' #path to the file

In [None]:
import os
os.chdir(path) # no need to execute if already done above; you can ignore the error message

In [None]:
os.getcwd()

In [None]:
from zipfile import ZipFile # import zipfile 
zf = ZipFile("SameColFile.zip")                



### So what files are there within the zipped folder? ###

In [None]:
zf.namelist() # gives the names of the files within the zipped folder

In [None]:
# Open the archive member inside a `with` block so its file handle is closed
# as soon as pandas has finished reading it.
with zf.open("SameCol1.csv") as member:
    df3 = pd.read_csv(member)

In [None]:
# 🛡️ Defensive Coding Pattern
# Catch only the specific error that is expected, so unrelated problems are not hidden.
try:
    # Intentionally failing import: 'missing_file.csv' is not expected to exist
    pd.read_csv('missing_file.csv')
except FileNotFoundError:
    # Reached only when the file cannot be found; other exceptions still propagate
    print("✅ Caught expected error: File not found")



In [None]:
df3

In [None]:
from zipfile import ZipFile

# Open the archive read-only inside a `with` block (closed automatically on exit)
# and collect the names of the files it contains.
with ZipFile("SameColFile.zip", "r") as archive:
    list_of_files2 = archive.namelist()

In [None]:
list_of_files2

# GZIP files

In [None]:
path = '/home/jovyan/DataFile' #path to the file

In [None]:
import os
os.chdir(path) # no need to execute if already done above; you can ignore the error message

In [None]:
df2 = pd.read_csv('SameCol4.gz', compression='gzip')

In [None]:
df2