In [1]:
# Python can work with the following file types: 
# Comma-separated values (CSV) 
# XLSX (Excel) 
# Plain Text (txt) 
# JSON
# XML 
# HTML 
# PDF (Adobe) 
# DOCX (Microsoft Word) 
# Images 
# MP3 
# MP4 
# ZIP

import pandas as pd 
df_csv = pd.read_csv('data/csvtest.csv')

In [2]:
display(df_csv)

Unnamed: 0,Film,Year,Awards,Nominations
0,7 Faces of Dr. Lao,1964,0 (1),1
1,7th Heaven,1927/28,3,5
2,8 Mile,2002,1,1
3,12 Years a Slave,2013,3,9
4,20 Feet from Stardom,2013,1,1
5,"20,000 Leagues Under the Sea",1954,2,3
6,1917,2019,3,10
7,2001: A Space Odyssey,1968,1,4
8,A Fantastic Woman,2017,1,1
9,A force in Readiness,1961,0 (1),0


In [3]:
df_xls = pd.read_excel('data/exceltest.xlsx', sheet_name='xlstest')

In [4]:
display(df_xls)

Unnamed: 0,Film,Year,Awards,Nominations
0,7 Faces of Dr. Lao,1964,0 (1),1
1,7th Heaven,1927/28,3,5
2,8 Mile,2002,1,1
3,12 Years a Slave,2013,3,9
4,20 Feet from Stardom,2013,1,1
5,"20,000 Leagues Under the Sea",1954,2,3
6,1917,2019,3,10
7,2001: A Space Odyssey,1968,1,4
8,A Fantastic Woman,2017,1,1
9,A force in Readiness,1961,0 (1),0


In [5]:
# Accessing the columns
print(df_xls.columns.ravel())

['Film' 'Year' 'Awards' 'Nominations']


In [6]:
# list the contents of the Film column
print(df_xls['Film'].tolist())

['7 Faces of Dr. Lao', '7th Heaven', '8 Mile', '12 Years a Slave', '20 Feet from Stardom', '20,000 Leagues Under the Sea', 1917, '2001: A Space Odyssey', 'A Fantastic Woman', 'A force in Readiness', 'A Girl in the River: The Price of Forgiveness', 'A Herb Alpert and the Tijuana Brass Double Feature']


In [7]:
# Convert to JSON from excel
print('Print Excel as JSON:', df_xls.to_json(orient='records'))

Print Excel as JSON: [{"Film":"7 Faces of Dr. Lao","Year":1964,"Awards":"0 (1)","Nominations":1},{"Film":"7th Heaven","Year":"1927\/28","Awards":3,"Nominations":5},{"Film":"8 Mile","Year":2002,"Awards":1,"Nominations":1},{"Film":"12 Years a Slave","Year":2013,"Awards":3,"Nominations":9},{"Film":"20 Feet from Stardom","Year":2013,"Awards":1,"Nominations":1},{"Film":"20,000 Leagues Under the Sea","Year":1954,"Awards":2,"Nominations":3},{"Film":1917,"Year":2019,"Awards":3,"Nominations":10},{"Film":"2001: A Space Odyssey","Year":1968,"Awards":1,"Nominations":4},{"Film":"A Fantastic Woman","Year":2017,"Awards":1,"Nominations":1},{"Film":"A force in Readiness","Year":1961,"Awards":"0 (1)","Nominations":0},{"Film":"A Girl in the River: The Price of Forgiveness","Year":2015,"Awards":1,"Nominations":1},{"Film":"A Herb Alpert and the Tijuana Brass Double Feature","Year":1966,"Awards":1,"Nominations":1}]


In [8]:
# show the results as a CSV 
print('Print Excel as CSV', df_xls.to_csv(index=False))

Print Excel as CSV Film,Year,Awards,Nominations
7 Faces of Dr. Lao,1964,0 (1),1
7th Heaven,1927/28,3,5
8 Mile,2002,1,1
12 Years a Slave,2013,3,9
20 Feet from Stardom,2013,1,1
"20,000 Leagues Under the Sea",1954,2,3
1917,2019,3,10
2001: A Space Odyssey,1968,1,4
A Fantastic Woman,2017,1,1
A force in Readiness,1961,0 (1),0
A Girl in the River: The Price of Forgiveness,2015,1,1
A Herb Alpert and the Tijuana Brass Double Feature,1966,1,1



In [9]:
# Read JSON files
df_json = pd.read_json('data/jsontest.json')

In [10]:
# display json
display(df_json)

Unnamed: 0,glossary
GlossDiv,"{'title': 'S', 'GlossList': {'GlossEntry': {'I..."
title,example glossary


In [11]:
# using ElementTree library
import xml.etree.ElementTree as ET 
tree = ET.parse('data/countries.xml')
root = tree.getroot()

#Each element of the tree, including the root, has tags that describe it: 
root.tag
root.attrib

{}

In [12]:
#print all children of the root of the tree, and their corresponding tags and attributes 
for child in root: 
    print(child.tag, child.attrib)

country {'name': 'Liechtenstein'}
country {'name': 'Singapore'}
country {'name': 'Panama'}


In [13]:
#pip install pdfminer

In [14]:
#pip install docx2txt

In [15]:
import docx2txt
text = docx2txt.process("data/doctest.docx")

In [16]:
print(text)

Film Year Awards Nominations

7 Faces of Dr. Lao 1964 0 (1) 1

7th Heaven 1927/28 3 5

8 Mile 2002 1 1

12 Years a Slave 2013 3 9

20 Feet from Stardom 2013 1 1

"20 000 Leagues Under the Sea" 1954 2 3

1917 2019 3 10

2001: A Space Odyssey 1968 1 4

A Fantastic Woman 2017 1 1

A force in Readiness 1961 0 (1) 0

A Girl in the River: The Price of Forgiveness 2015 1 1

A Herb Alpert and the Tijuana Brass Double Feature 1966 1 1


In [17]:
from nltk.tokenize import RegexpTokenizer
from pdfminer.high_level import extract_text
from nltk.probability import FreqDist

In [18]:
textp = extract_text('data/simple1.pdf')

In [19]:
print(textp)

Hello 

World

Hello 

World

H e l l o  

W o r l d

H e l l o  

W o r l d




In [20]:
import zipfile
archive = zipfile.ZipFile('data.zip', 'r')

In [21]:
# read the csvtest file
df_archive = archive.read('data/csvtest.csv')
display (df_archive)

b'Film,Year,Awards,Nominations\n7 Faces of Dr. Lao,1964,0 (1),1\n7th Heaven,1927/28,3,5\n8 Mile,2002,1,1\n12 Years a Slave,2013,3,9\n20 Feet from Stardom,2013,1,1\n"20,000 Leagues Under the Sea",1954,2,3\n1917,2019,3,10\n2001: A Space Odyssey,1968,1,4\nA Fantastic Woman,2017,1,1\nA force in Readiness,1961,0 (1),0\nA Girl in the River: The Price of Forgiveness,2015,1,1\nA Herb Alpert and the Tijuana Brass Double Feature,1966,1,1\n\n'