# Importing Other File Types

## Exploring Working Directory

Using library **os**:

In [11]:
import os
# Capture the current working directory (returned as a string)
wd = os.getcwd()

# Bare last expression: the notebook displays the entries found in that directory
os.listdir(path=wd)

['.ipynb_checkpoints',
 'Data',
 '[1] Importing Flat Files.ipynb',
 '[2] Importing Other File Types.ipynb']

## Pickled Files

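- File types covered in this section: pickle files (.pkl) and, in the cell after, Excel files (.xlsx)
- Pickle files are native to Python
- Useful for datatypes that have no obvious way to be stored in a flat file (e.g. dictionaries, lists)
- Pickled objects are serialised (the object is converted to a bytestream)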

In [15]:
import pickle 

# "rb" -> read only, binary
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
with open("./Data/dogs.pkl", mode="rb") as file:
    # Deserialise the bytestream back into a Python object
    data = pickle.load(file)

# Show the restored object
print(data)

{'Ozzy': 3, 'Filou': 8, 'Luna': 5, 'Skippy': 10, 'Barco': 12, 'Balou': 9, 'Laika': 16}


In [51]:
import pandas as pd
import numpy as np

file = "./Data/Whosn1 Data.xlsx"

# Open the workbook inside a context manager so the underlying file handle is
# closed as soon as parsing is done, rather than staying open in the kernel.
# `data` remains defined afterwards, but refers to a closed workbook.
with pd.ExcelFile(file) as data:
    # Select a sheet by its name. This frame is replaced by the second parse
    # below; it is kept to demonstrate name-based selection.
    df = data.parse("Data")

    # Select a sheet by its zero-based position instead, and additionally:
    #   usecols  -> keep only the columns at these positions
    #   skiprows -> skip the rows at these zero-based positions
    #   names    -> replace the column labels with these names
    df = data.parse(
        1,
        usecols=[108, 109],
        skiprows=[3, 4, 6],
        names=["Grades", "Accepted"],
    )

# Preview the first rows of the parsed sheet
df.head()

Unnamed: 0,Grades,Accepted
0,5,1
1,3,1
2,2,1
3,1,0
4,1,0


## Importing SAS and Stata Files

- most common SAS extensions are ".sas7bdat" and ".sas7bcat"
- Stata extension is ".dta"

In [56]:
import pandas as pd
from sas7bdat import SAS7BDAT

# Open the SAS dataset; the `with` block handles cleanup on exit
with SAS7BDAT("./Data/airline.sas7bdat") as file:
    # Convert the dataset into a DataFrame
    df_sas = file.to_data_frame()

# Preview the first five rows
df_sas.head(n=5)

Unnamed: 0,YEAR,Y,W,R,L,K
0,1948.0,1.214,0.243,0.1454,1.415,0.612
1,1949.0,1.354,0.26,0.2181,1.384,0.559
2,1950.0,1.569,0.278,0.3157,1.388,0.573
3,1951.0,1.948,0.297,0.394,1.55,0.564
4,1952.0,2.265,0.31,0.3559,1.802,0.574


In [58]:
import pandas as pd
# Read the Stata (.dta) file into a DataFrame
data = pd.read_stata("./Data/cola.dta")

# Preview the first five rows
data.head(n=5)

Unnamed: 0,id,choice,price,feature,display
0,1.0,0.0,1.79,0.0,0.0
1,1.0,0.0,1.79,0.0,0.0
2,1.0,1.0,1.79,0.0,0.0
3,2.0,0.0,1.79,0.0,0.0
4,2.0,0.0,1.79,0.0,0.0


## HDF5

- hierarchical structure
- good for storing large quantities of data

In [64]:
import h5py

filename = "./Data/H-H1_LOSC_4_V2-1126259446-32.hdf5"

# Open read-only ("r") inside a context manager so the file handle is closed
# when the block exits instead of staying open for the life of the kernel.
# `data` remains defined afterwards, but refers to a closed file; reopen the
# file to read anything further from it.
with h5py.File(filename, "r") as data:
    print(type(data))

    # Navigating the hierarchy is similar to navigating a dictionary:
    # the keys of the file are the names of its top-level members
    for key in data.keys():
        print(key)

    # Continue navigating down the hierarchical structure
    for key in data["meta"].keys():
        print(key)

    # Indexing with an empty tuple, [()], reads the stored value
    print(data["meta"]["Description"][()])

<class 'h5py._hl.files.File'>
meta
quality
strain
Description
DescriptionURL
Detector
Duration
GPSstart
Observatory
Type
UTCstart
b'Strain data time series from LIGO'


## MATLAB

- .mat files

### Using SciPy

`scipy.io.loadmat` reads a MATLAB workspace (a .mat file) into a Python dictionary. The keys are the MATLAB variable names and the values are the objects assigned to those variables.

In [69]:
import scipy.io

# Path of the MATLAB file to load
filename = "./Data/data1.mat"
mat = scipy.io.loadmat(filename)

# The loaded workspace is a dictionary
print(type(mat))

# Its keys include the names of the stored variables
print(mat.keys())

# Type of the object stored under the "fret" key
print(type(mat["fret"]))

# The stored values themselves
print(mat["fret"])

<class 'dict'>
dict_keys(['__header__', '__version__', '__globals__', 'fret', 'rfp'])
<class 'numpy.ndarray'>
[[0.         0.         0.         ... 0.63444499 0.64521205 0.64302332]
 [0.         0.         0.         ... 0.59611929 0.60171088 0.59868451]
 [0.         0.         0.         ... 0.72522822 0.74362067 0.73933919]
 ...
 [0.         0.         0.         ... 0.69681655 0.67885274 0.70031342]
 [0.         0.         0.         ... 0.75400259 0.73390885 0.74981491]
 [0.         0.         0.         ... 0.69681655 0.67885274 0.70031342]]
