In [2]:
import numpy as np
import pandas as pd

## Exploring your working directory
- The IPython magic command `!ls` displays the contents of your current directory. Your task is to use `!ls` to check out the contents of your current directory.

In [1]:
# IPython shell escape: run `ls` to list the contents of the current directory.
!ls

DS_10_Introduction to Importing Data in Python.ipynb
DS_3_Data_Manipulation_with_Pandas.ipynb
DS_4_Joining Data_with_Pandas.ipynb
DS_5_Introduction to data visualization with Matplotlib.ipynb
DS_6_Introduction_to_Seaborn.ipynb
DS_7_Python Data Science Toolbox (Part 1).ipynb
DS_8_Python Data Science Toolbox (Part 2).ipynb
DS_9_Intermediate Data Visualization with Seaborn.ipynb
Data


- You learned how to use the IPython magic command `!ls` to explore your current working directory. You can also do this natively in Python using the library `os`, which consists of miscellaneous operating system interfaces.

- The first line of the following code imports the library os, the second line stores the name of the current directory in a string called wd and the third outputs the contents of the directory in a list to the shell.



In [4]:
import os

# Capture the current working directory as a string, then list its entries;
# the resulting list is the cell's displayed value.
wd = os.getcwd()
os.listdir(path=wd)

['.DS_Store',
 'DS_6_Introduction_to_Seaborn.ipynb',
 'Untitled.ipynb',
 'DS_10_Introduction to Importing Data in Python.ipynb',
 'DS_3_Data_Manipulation_with_Pandas.ipynb',
 'DS_8_Python Data Science Toolbox (Part 2).ipynb',
 'DS_7_Python Data Science Toolbox (Part 1).ipynb',
 'DS_9_Intermediate Data Visualization with Seaborn.ipynb',
 '.ipynb_checkpoints',
 'Data',
 'DS_5_Introduction to data visualization with Matplotlib.ipynb',
 'DS_4_Joining Data_with_Pandas.ipynb']

## Reading a text file 

In [None]:
filename = 'huck-finn.txt'

# Read the whole file into `text`. The `with` block closes the handle even if
# read() raises, so no explicit file.close() is needed.
with open(filename, mode='r') as file:  # 'r' opens the file read-only
    text = file.read()

## Write to a file

In [None]:
# Use a scratch file for the write demo: mode 'w' empties an existing file,
# and 'huck-finn.txt' is still read by other cells in this notebook.
filename = 'huck-finn-output.txt'

# 'w' opens the file for writing (created if missing, truncated otherwise).
# The `with` block guarantees the handle is closed.
with open(filename, mode='w') as file:
    pass  # nothing is written here; the file is only created/emptied

## Importing text files line by line (context manager with `with`)
- For large files, we may not want to print all of their content to the shell: you may wish to print only the first few lines.
- Enter the readline() method, which allows you to do this.
- When a file called file is open, you can print out the first line by executing file.readline(). If you execute the same command again, the second line will print, and so on. 

In [None]:
# The context manager closes the file as soon as the indented block ends.
with open('huck-finn.txt', 'r') as file:
    contents = file.read()  # the entire file as a single string
    print(contents)

In [None]:
# Only the first line is read; a further readline() call inside the block
# would return the next line.
with open('huck-finn.txt', 'r') as file:
    first_line = file.readline()
    print(first_line)

## Importing entire text files

In [None]:
# Open a file: file
# A context manager is used so the file is closed even if reading fails.
with open('moby_dick.txt', 'r') as file:
    # Print it
    print(file.read())

    # Check whether file is closed (prints False: we are still inside the block)
    print(file.closed)

# Leaving the `with` block closes the file, replacing the explicit file.close().

## NumPy 
- NumPy arrays: standard for storing numerical data
- Essential for other packages: e.g. scikit-learn
- loadtxt()
- genfromtxt()

## Import flat files using NumPy
- There are a number of arguments that np.loadtxt() takes that you'll find useful:
    - delimiter changes the delimiter that loadtxt() is expecting.
        - You can use ',' for comma-delimited.
        - You can use '\t' for tab-delimited.
    - skiprows allows you to specify how many rows (not indices) you wish to skip
    - usecols takes a list of the indices of the columns you wish to keep.

In [None]:
filename = 'MNIST.txt'

# Comma-delimited file: skip the first row and keep only columns 0 and 2.
data = np.loadtxt(
    filename,
    delimiter=',',
    skiprows=1,
    usecols=[0, 2],
)
print(data)

In [None]:
# Load every field as a string; `filename` must already be set by an earlier cell.
data = np.loadtxt(filename, dtype=str, delimiter=',')

## Working with mixed datatypes
- Much of the time you will need to import datasets which have different datatypes in different columns; one column may contain strings and another floats, for example. The function np.loadtxt() will raise an error on such data. There is another function, np.genfromtxt(), which can handle such structures. If we pass dtype=None to it, it will figure out what type each column should be.
- data = np.genfromtxt('titanic.csv', delimiter=',', names=True, dtype=None)
    - the first argument is the filename
    - the second specifies the delimiter 
    - the third argument names tells us there is a header. 
    - Because the data are of different types, data is an object called a structured array. Because numpy arrays have to contain elements that are all the same type, the structured array solves this by being a 1D array, where each element of the array is a row of the flat file imported. You can test this by checking out the array's shape in the shell by executing np.shape(data).
- You have just used np.genfromtxt() to import data containing mixed datatypes. There is also another function np.recfromcsv() that behaves similarly to np.genfromtxt(), except that its default dtype is None. (defaults delimiter=',' and names=True)

## Using pandas to import flat files as DataFrames
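- Missing values are commonly referred to as NA or NaN; `na_values` tells `pd.read_csv()` which strings to treat as missing.
- `comment='#'` handles a file that contains comments after the character '#'.
- `sep='\t'` handles a file that is tab-delimited.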

In [None]:
# Name of the CSV file to load.
file = 'digits.csv'

# Load only the first 5 rows; header=None means no row is used as column names.
data = pd.read_csv(file, header=None, nrows=5)

# Convert the DataFrame into a NumPy array.
data_array = np.array(data)

# Show the type of the converted object.
print(type(data_array))

In [None]:
# pyplot is imported here but not used within this cell.
import matplotlib.pyplot as plt

# File to load.
file = 'titanic_corrupt.txt'

# Read the file into a DataFrame.
data = pd.read_csv(
    file,
    sep='\t',               # fields are separated by tabs
    comment='#',            # ignore the rest of a line after '#'
    na_values=['Nothing'],  # treat the string 'Nothing' as a missing value
)

# Show the first rows of the DataFrame.
print(data.head())

## Pickled files
- File type native to Python
- Motivation: many datatypes for which it isn't obvious how to store them
- Pickled files are serialized
- Serialize = convert object to bytestream

In [None]:
import pickle

# SECURITY: unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
# Mode 'rb': 'r' signifies read only, 'b' signifies binary.
with open('pickled_fruit.pkl', mode='rb') as file:
    data = pickle.load(file)

print(data)

## Importing Excel spreadsheets

In [None]:
file = 'urbanpop.xlsx'

# Open the workbook and show which sheets it contains.
data = pd.ExcelFile(file)
print(data.sheet_names)

# A sheet can be selected by its name (a string) or by its position (an integer).
df1 = data.parse(sheet_name='1960-1966')
df2 = data.parse(sheet_name=0)

In [None]:
"""
The spreadsheet 'battledeath.xlsx' is already loaded as xls.
Parse the first sheet by index. 
In doing so, skip the first row of data and name the columns 'Country' and 'AAM due to War (2002)' 
    using the argument names. The values passed to skiprows and names all need to be of type list.
"""

# `xls` is not created anywhere else in this notebook, so load the spreadsheet
# here; that way the cell also runs on a fresh kernel.
xls = pd.ExcelFile('battledeath.xlsx')

# Parse the first sheet and rename the columns: df1
# skiprows is given as a list of row indices ([0] = the first row), matching
# the instructions for this cell; it skips the same row as skiprows=1.
df1 = xls.parse(0, skiprows=[0], names=['Country', 'AAM due to War (2002)'])

# Print the head of the DataFrame df1
print(df1.head())

In [None]:
"""
The spreadsheet 'battledeath.xlsx' is already loaded as xls.
Parse the second sheet by index. 
In doing so, parse only the first column with the usecols parameter, 
    skip the first row and rename the column 'Country'. 
The argument passed to usecols also needs to be of type list.
"""

# Prerequisite: `xls` must already hold the loaded 'battledeath.xlsx' workbook.
# From the second sheet (index 1) keep only the first column, skip the first
# row and call the remaining column 'Country': df2
df2 = xls.parse(
    1,
    usecols=[0],
    skiprows=[0],
    names=['Country'],
)

# Show the first rows of df2.
print(df2.head())

## Importing SAS/Stata files using pandas

In [None]:
# Load the SAS dataset 'urbanpop.sas7bdat' into a DataFrame using the sas7bdat package.
import pandas as pd
from sas7bdat import SAS7BDAT
# The opened SAS7BDAT object is managed by the `with` block.
with SAS7BDAT('urbanpop.sas7bdat') as file:
    df_sas = file.to_data_frame()  # result is kept in df_sas for later use

## Importing Stata files

In [None]:
# Read the Stata file 'urbanpop.dta' with pandas' Stata reader.
import pandas as pd
data = pd.read_stata('urbanpop.dta')

## Importing HDF5 files
- Hierarchical Data Format version 5
- Standard for storing large quantities of numerical data
- Datasets can be hundreds of gigabytes or terabytes
- HDF5 can scale to exabytes

In [None]:
import h5py

filename = 'H-H1_LOSC_4_V1.hdf5'

# Open read-only. The handle is deliberately left open because the next cell
# iterates over its keys.
data = h5py.File(filename, mode='r')
print(type(data))

## The structure of HDF5 files

In [None]:
# Print each top-level key of `data` (the object opened in the previous cell).
for key in data.keys():
    print(key) # NOTE(review): original note said "results is an HDF group" -- confirm against the file's actual keys

## Importing MATLAB files
 - keys = matlab variable names
 - values = objects assigned to variables

In [None]:
import scipy.io

filename = 'workspace.mat'

# Load the saved MATLAB workspace and show the type of the returned object.
mat = scipy.io.loadmat(file_name=filename)
print(type(mat))

## Creating a database engine
- SQLite database: Fast and simple
- SQLAlchemy: Works with many relational database management systems

In [None]:
from sqlalchemy import create_engine

# Connection URL format: '<dialect>:///<path to database file>'.
# The dialect must be spelled 'sqlite'; 'sqlit' is not a known dialect and
# makes create_engine() fail.
engine = create_engine('sqlite:///Northwind.sqlite')

In [None]:
from sqlalchemy import inspect

# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0.
# The inspector API returns the same list of table names and works on both.
table_names = inspect(engine).get_table_names()
print(table_names)

## Querying relational databases in Python

In [None]:
from sqlalchemy import create_engine, text
import pandas as pd

# The dialect must be spelled 'sqlite' ('sqlit' is not a known dialect).
engine = create_engine('sqlite:///Northwind.sqlite')

# Manual connection handling: open, query, and always close -- even if the
# query or the DataFrame construction raises.
con = engine.connect()
try:
    # text() marks the string as raw SQL; SQLAlchemy 2.x requires it and
    # earlier versions accept it as well.
    rs = con.execute(text("Select * from Orders"))
    df = pd.DataFrame(rs.fetchall())
    df.columns = list(rs.keys())  # label the columns with the result's column names
finally:
    con.close()

## Using the context manager 

In [None]:
from sqlalchemy import create_engine, text
import pandas as pd

# The dialect must be spelled 'sqlite' ('sqlit' is not a known dialect).
engine = create_engine('sqlite:///Northwind.sqlite')

# The context manager closes the connection when the block ends.
with engine.connect() as con:
    # text() marks the string as raw SQL; SQLAlchemy 2.x requires it and
    # earlier versions accept it as well.
    # NOTE(review): the other selected columns are CamelCase -- confirm that
    # `order_id` is the actual column name in Orders.
    rs = con.execute(text("Select order_id, OrderDate, ShipName from Orders"))
    df = pd.DataFrame(rs.fetchmany(size=5))  # only the first 5 rows
    df.columns = list(rs.keys())  # label the columns with the result's column names

## The pandas way to query

In [None]:
# One call runs the query on `engine` (created in an earlier cell) and returns a DataFrame.
df = pd.read_sql_query(sql="Select * from Orders", con=engine)

## Advanced querying: Exploiting table relationships

In [None]:
# Build the join query, run it on `engine` and keep the records in df.
# NOTE(review): the only engine created in this notebook points at
# Northwind.sqlite -- confirm that database contains PlaylistTrack and Track.
query = (
    " select * from PlaylistTrack"
    " INNER JOIN Track on PlaylistTrack.TrackId = Track.TrackId"
    " where Milliseconds < 250000"
)
df = pd.read_sql_query(query, engine)

# Show the first rows of the result.
print(df.head())