# Introduction to Python for Data Science


Let us check which versions of Python and IPython we are using.

In [1]:
import IPython
import platform

print ('Python version:', platform.python_version())
print ('IPython version:', IPython.__version__)

Python version: 3.5.2
IPython version: 5.1.0


Now let's import the Python libraries we are going to use.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

%matplotlib inline

print ('numpy version:', np.__version__)
print ('pandas version:', pd.__version__)
print ('matplotlib version:', matplotlib.__version__)


numpy version: 1.12.0
pandas version: 0.19.2
matplotlib version: 2.0.0


#  
# NumPy

NumPy is the fundamental package for scientific computing with Python. It contains among other things:

- a powerful N-dimensional array object
- sophisticated (broadcasting) functions
- tools for integrating C/C++ and Fortran code
- useful linear algebra, Fourier transform, and random number capabilities

Besides its obvious scientific uses, NumPy can also be used as an efficient multi-dimensional container of generic data. Arbitrary data-types can be defined. This allows NumPy to seamlessly and speedily integrate with a wide variety of databases.


In [3]:
# declare a vector using a list as the argument
v = np.array([1,2,3,4])
v

array([1, 2, 3, 4])

In [4]:
# declare a matrix using a nested list as the argument
M = np.array([[1,2],[3,4]])
M

array([[1, 2],
       [3, 4]])

In [5]:
# still the same core type with different shapes
type(v), type(M)

(numpy.ndarray, numpy.ndarray)

In [6]:
M.size

4

In [7]:
# arguments: start, stop, step
x = np.arange(0, 10, 1)
x 

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [8]:
np.linspace(0, 10, 25)

array([  0.        ,   0.41666667,   0.83333333,   1.25      ,
         1.66666667,   2.08333333,   2.5       ,   2.91666667,
         3.33333333,   3.75      ,   4.16666667,   4.58333333,
         5.        ,   5.41666667,   5.83333333,   6.25      ,
         6.66666667,   7.08333333,   7.5       ,   7.91666667,
         8.33333333,   8.75      ,   9.16666667,   9.58333333,  10.        ])

In [10]:
np.logspace(0, 10, 10, base=np.e)

array([  1.00000000e+00,   3.03773178e+00,   9.22781435e+00,
         2.80316249e+01,   8.51525577e+01,   2.58670631e+02,
         7.85771994e+02,   2.38696456e+03,   7.25095809e+03,
         2.20264658e+04])

In [None]:
x, y = np.mgrid[0:5, 0:5]
x

In [None]:
y

In [None]:
np.random.rand(5,5)

In [None]:
# normal distribution
np.random.randn(5,5)

In [None]:
np.diag([1,2,3])

In [None]:
print( M.itemsize, M.nbytes, M.ndim)

In [None]:
print(v[0], M[1,1])
print(M[1])

In [None]:
# assign new value
M[0,0] = 7
M

In [None]:
M[0,:] = 0
M

In [None]:
# slicing works just like with lists (the stop index is exclusive)
A = np.array([1,2,3,4,5])
A[1:3]

In [None]:
# 5x5 matrix whose entry (m, n) is n + 10*m, so each value encodes its row and column
A = np.array([[n+m*10 for n in range(5)] for m in range(5)])
A

In [None]:
row_indices = [1, 2, 3]
A[row_indices]

In [None]:
# index masking: a boolean array keeps the positions where the mask is True
B = np.array([n for n in range(5)])
row_mask = np.array([True, False, True, False, False])
B[row_mask]

### Linear Algebra

In [None]:
# vector 0..4 used by the arithmetic examples below
v1 = np.arange(0, 5)
v1

In [None]:
v1 + 2

In [None]:
v1 * 2

In [None]:
v1 * v1

In [None]:
# dot product of two 1-D arrays (a scalar), unlike the element-wise v1 * v1
np.dot(v1, v1)

In [None]:
# matrix-vector product
np.dot(A, v1)

In [None]:
# cast changes behavior of + - * etc. to use matrix algebra
# (np.matrix overloads * as the matrix product instead of element-wise multiply)
M = np.matrix(A)
M * M

In [None]:
# inner product
# v is a plain 1-D ndarray here, so `v.T * v` would multiply element-wise;
# np.dot returns the scalar inner product
np.dot(v, v)

In [None]:
# 2x2 matrix with purely imaginary entries
C = np.matrix([[1j, 2j], [3j, 4j]])
C

In [None]:
# element-wise complex conjugate
np.conjugate(C)

In [None]:
# inverse
C.I

### Statistics

In [None]:
# mean of the column with index 3
np.mean(A[:,3])

In [None]:
# standard deviation and variance of the column with index 3
np.std(A[:,3]), np.var(A[:,3])

In [None]:
A[:,3].min(), A[:,3].max()

In [None]:
# sum and product of the integers 1..9
d = np.arange(1, 10)
np.sum(d), np.prod(d)

In [None]:
# running (cumulative) sum
np.cumsum(d)

In [None]:
# running (cumulative) product
np.cumprod(d)

In [None]:
# sum of diagonal
np.trace(A)

In [None]:
# 3x3 matrix of uniform random samples from [0, 1)
m = np.random.rand(3, 3)
m

In [None]:
# use axis parameter to specify how function behaves
m.max(), m.max(axis=0)

In [None]:
# reshape without copying underlying data
n, m = A.shape
B = A.reshape((1,n*m))

B

In [None]:
# modify the array
B[0,0:5] = 5
B

In [None]:
# also changed
A

In [None]:
# creates a copy
B = A.flatten()
B

In [None]:
# can insert a dimension in an array: np.newaxis adds a length-1 axis at that position
v = np.array([1,2,3])
v[:, np.newaxis], v[:, np.newaxis].shape, v[np.newaxis, :].shape

In [None]:
# repeat each element 3 times in place
np.repeat(v, 3)

In [None]:
# repeat the whole array 3 times end to end
np.tile(v, 3)

In [None]:
# second vector, joined to v in the next cell
w = np.array([5, 6])

In [None]:
# join the two 1-D arrays along their only axis
np.concatenate((v, w), axis=0)

In [None]:
# deep copy: B gets its own data buffer, so later writes to B leave A untouched
B = np.copy(A)

#  
# Matplotlib

Matplotlib is a Python 2D plotting library which produces publication-quality figures in a variety of hardcopy formats and interactive environments across platforms. Matplotlib can be used in Python scripts, the Python and IPython shells, web application servers, and six graphical user interface toolkits.

Matplotlib tries to make easy things easy and hard things possible. You can generate plots, histograms, power spectra, bar charts, errorcharts, scatterplots, etc, with just a few lines of code.

In [None]:
# sample y = x^2 at 10 points on [0, 5]; x and y are reused by the plotting cells below
x = np.linspace(0, 5, 10)
y = x ** 2

fig = plt.figure()

# left, bottom, width, height (range 0 to 1)
axes = fig.add_axes([0.1, 0.1, 0.8, 0.8])

axes.plot(x, y, 'r')

axes.set_xlabel('x')
axes.set_ylabel('y')
axes.set_title('title');

In [None]:
fig = plt.figure()

axes1 = fig.add_axes([0.1, 0.1, 0.8, 0.8]) # main axes
axes2 = fig.add_axes([0.2, 0.5, 0.4, 0.3]) # inset axes

# main figure
axes1.plot(x, y, 'r')
axes1.set_xlabel('x')
axes1.set_ylabel('y')
axes1.set_title('title')

# insert
axes2.plot(y, x, 'g')
axes2.set_xlabel('y')
axes2.set_ylabel('x')
axes2.set_title('insert title');

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=2)

for ax in axes:
    ax.plot(x, y, 'r')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title('title')
    
fig.tight_layout()

In [None]:
# example with a legend and latex symbols
fig, ax = plt.subplots()

ax.plot(x, x**2, label=r"$y = \alpha^2$")
ax.plot(x, x**3, label=r"$y = \alpha^3$")
ax.legend(loc=2) # upper left corner
ax.set_xlabel(r'$\alpha$', fontsize=18)
ax.set_ylabel(r'$y$', fontsize=18)
ax.set_title('title');

In [None]:
# line customization
fig, ax = plt.subplots(figsize=(12,6))

# increasing line widths
ax.plot(x, x+1, color="blue", linewidth=0.25)
ax.plot(x, x+2, color="blue", linewidth=0.50)
ax.plot(x, x+3, color="blue", linewidth=1.00)
ax.plot(x, x+4, color="blue", linewidth=2.00)

# possible linestyle options: '-', '--', '-.', ':'
# (`lw` and `ls` are short aliases for `linewidth` and `linestyle`)
ax.plot(x, x+5, color="red", lw=2, linestyle='-')
ax.plot(x, x+6, color="red", lw=2, ls='-.')
ax.plot(x, x+7, color="red", lw=2, ls=':')

# custom dash
line, = ax.plot(x, x+8, color="black", lw=1.50)
line.set_dashes([5, 10, 15, 10]) # format: line length, space length, ...

# possible marker symbols: marker = '+', 'o', '*', 's', ',', '.', 
# '1', '2', '3', '4', ...
# NOTE: '*' is a marker symbol, not a linestyle; a dashed line is drawn
# between the markers instead
ax.plot(x, x+ 9, color="green", lw=2, ls='--', marker='+')
ax.plot(x, x+10, color="green", lw=2, ls='--', marker='o')
ax.plot(x, x+11, color="green", lw=2, ls='--', marker='s')
ax.plot(x, x+12, color="green", lw=2, ls='--', marker='1')

# marker size and color
ax.plot(x, x+13, color="purple", lw=1, ls='-', marker='o', markersize=2)
ax.plot(x, x+14, color="purple", lw=1, ls='-', marker='o', markersize=4)
ax.plot(x, x+15, color="purple", lw=1, ls='-', marker='o', markersize=8, 
        markerfacecolor="red")
ax.plot(x, x+16, color="purple", lw=1, ls='-', marker='s', markersize=8, 
        markerfacecolor="yellow", markeredgewidth=2, markeredgecolor="blue");

In [None]:
# axis controls
fig, axes = plt.subplots(1, 3, figsize=(12, 4))

axes[0].plot(x, x**2, x, x**3)
axes[0].set_title("default axes ranges")

axes[1].plot(x, x**2, x, x**3)
axes[1].axis('tight')
axes[1].set_title("tight axes")

axes[2].plot(x, x**2, x, x**3)
axes[2].set_ylim([0, 60])
axes[2].set_xlim([2, 5])
axes[2].set_title("custom axes range");

In [None]:
# scaling: the same two curves on a linear and on a logarithmic y axis
fig, axes = plt.subplots(1, 2, figsize=(10,4))

axes[0].plot(x, x**2, x, np.exp(x))
axes[0].set_title("Normal scale")

axes[1].plot(x, x**2, x, np.exp(x))
axes[1].set_yscale("log")
axes[1].set_title("Logarithmic scale (y)");

In [None]:
# axis grid
fig, axes = plt.subplots(1, 2, figsize=(10,3))

# default grid appearance
axes[0].plot(x, x**2, x, x**3, lw=2)
axes[0].grid(True)

# custom grid appearance
axes[1].plot(x, x**2, x, x**3, lw=2)
axes[1].grid(color='b', alpha=0.5, linestyle='dashed', linewidth=0.5)

In [None]:
# twin axes example
fig, ax1 = plt.subplots()

ax1.plot(x, x**2, lw=2, color="blue")
ax1.set_ylabel(r"area $(m^2)$", fontsize=18, color="blue")
for label in ax1.get_yticklabels():
    label.set_color("blue")
    
ax2 = ax1.twinx()
ax2.plot(x, x**3, lw=2, color="red")
ax2.set_ylabel(r"volume $(m^3)$", fontsize=18, color="red")
for label in ax2.get_yticklabels():
    label.set_color("red")

In [None]:
# other plot styles
xx = np.linspace(-0.75, 1., 100)
n = np.array([0,1,2,3,4,5])

fig, axes = plt.subplots(1, 4, figsize=(12,3))

# xx plus Gaussian noise (standard normal samples scaled by 0.25)
axes[0].scatter(xx, xx + 0.25*np.random.randn(len(xx)))
axes[0].set_title("scatter")

axes[1].step(n, n**2, lw=2)
axes[1].set_title("step")

axes[2].bar(n, n**2, align="center", width=0.5, alpha=0.5)
axes[2].set_title("bar")

# shade the region between the curves x^2 and x^3
axes[3].fill_between(x, x**2, x**3, color="green", alpha=0.5);
axes[3].set_title("fill_between");

In [None]:
# histograms
n = np.random.randn(100000)
fig, axes = plt.subplots(1, 2, figsize=(12,4))

# ndarray.min()/.max() are used for the x limits instead of the builtin
# min()/max(), which would iterate the 100000 samples in Python
axes[0].hist(n)
axes[0].set_title("Default histogram")
axes[0].set_xlim((n.min(), n.max()))

axes[1].hist(n, cumulative=True, bins=50)
axes[1].set_title("Cumulative detailed histogram")
axes[1].set_xlim((n.min(), n.max()));

In [None]:
# annotations
fig, ax = plt.subplots()

ax.plot(xx, xx**2, xx, xx**3)

ax.text(0.15, 0.2, r"$y=x^2$", fontsize=20, color="blue")
ax.text(0.65, 0.1, r"$y=x^3$", fontsize=20, color="green");

In [None]:
# color map
# module-level parameters read by flux_qubit_potential below
alpha = 0.7
phi_ext = 2 * np.pi * 0.5

def flux_qubit_potential(phi_m, phi_p):
    """Evaluate alpha - 2*cos(phi_p)*cos(phi_m) - alpha*cos(phi_ext - 2*phi_p).

    `alpha` and `phi_ext` are read from module scope. The arguments may be
    scalars or NumPy arrays; np.cos is applied element-wise, so array inputs
    give an array result.
    """
    return (alpha - 2 * np.cos(phi_p) * np.cos(phi_m)
            - alpha * np.cos(phi_ext - 2 * phi_p))

# evaluate the potential on a 100x100 grid covering [0, 2*pi] in both variables
phi_m = np.linspace(0, 2*np.pi, 100)
phi_p = np.linspace(0, 2*np.pi, 100)
X,Y = np.meshgrid(phi_p, phi_m)
Z = flux_qubit_potential(X, Y).T

fig, ax = plt.subplots()

# axes are rescaled to units of 2*pi; colormaps are reached through plt.cm
p = ax.pcolor(X/(2*np.pi), Y/(2*np.pi), Z, 
              cmap=plt.cm.RdBu, vmin=abs(Z).min(), vmax=abs(Z).max())
cb = fig.colorbar(p, ax=ax)

In [None]:
from mpl_toolkits.mplot3d.axes3d import Axes3D

In [None]:
# surface plots
fig = plt.figure(figsize=(14,6))

# `ax` is a 3D-aware axis instance because of the projection='3d' 
# keyword argument to add_subplot
ax = fig.add_subplot(1, 2, 1, projection='3d')

p = ax.plot_surface(X, Y, Z, rstride=4, cstride=4, linewidth=0)

# surface_plot with color grading and color bar
# (colormaps are reached through plt.cm; no bare `cm` name is imported)
ax = fig.add_subplot(1, 2, 2, projection='3d')
p = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, 
                    cmap=plt.cm.coolwarm, linewidth=0, antialiased=False)
cb = fig.colorbar(p, shrink=0.5)

In [None]:
# wire frame
fig = plt.figure(figsize=(8,6))

ax = fig.add_subplot(1, 1, 1, projection='3d')

p = ax.plot_wireframe(X, Y, Z, rstride=4, cstride=4)

In [None]:
# contour plot with projections
fig = plt.figure(figsize=(8,6))

ax = fig.add_subplot(1,1,1, projection='3d')

ax.plot_surface(X, Y, Z, rstride=4, cstride=4, alpha=0.25)
# one contour set per direction (zdir), each drawn at the given offset
cset = ax.contour(X, Y, Z, zdir='z', offset=-np.pi, cmap=plt.cm.coolwarm)
cset = ax.contour(X, Y, Z, zdir='x', offset=-np.pi, cmap=plt.cm.coolwarm)
cset = ax.contour(X, Y, Z, zdir='y', offset=3*np.pi, cmap=plt.cm.coolwarm)

# axis limits are set to include the offsets used for the contours above
ax.set_xlim3d(-np.pi, 2*np.pi);
ax.set_ylim3d(0, 3*np.pi);
ax.set_zlim3d(-np.pi, 2*np.pi);

#  
# Pandas

Pandas is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.

In [None]:
# create a series
s = pd.Series([1,3,5,np.nan,6,8])
s

In [None]:
# create a data frame
dates = pd.date_range('20130101',periods=6)
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
df

In [None]:
# another way to create a data frame
df2 = pd.DataFrame(
    { 'A' : 1.,
      'B' : pd.Timestamp('20130102'),
      'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
      'D' : np.array([3] * 4,dtype='int32'),
      'E' : 'foo' })
df2

In [None]:
df.index

In [None]:
df.head()

In [None]:
df.columns

In [None]:
df.values

In [None]:
# quick data summary
df.describe()

In [None]:
df.T

In [None]:
# axis 0 is index, axis 1 is columns
df.sort_index(axis=1, ascending=False)

In [None]:
# can sort by values too
# (sort_values replaces the removed DataFrame.sort(columns=...); returns a new frame)
df.sort_values(by='B')

In [None]:
# select a column (yields a series)
df['A']

In [None]:
# column names also attached to the object
df.A

In [None]:
# slicing works
df[0:3]

In [None]:
df['20130102':'20130104']

In [None]:
# cross-section using a label
df.loc[dates[0]]

In [None]:
# getting a scalar value
df.loc[dates[0], 'A']

In [None]:
# select via position
df.iloc[3]

In [None]:
df.iloc[3:5,0:2]

In [None]:
# column slicing
df.iloc[:,1:3]

In [None]:
# get a value by index
df.iloc[1,1]

In [None]:
# boolean indexing
df[df.A > 0]

In [None]:
df[df > 0]

In [None]:
# filtering
df3 = df.copy()
df3['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df3[df3['E'].isin(['two', 'four'])]

In [None]:
# setting examples
df.at[dates[0],'A'] = 0
df.iat[0,1] = 0
df.loc[:, 'D'] = np.array([5] * len(df))
df

In [None]:
# dealing with missing data
df4 = df.reindex(index=dates[0:4],columns=list(df.columns) + ['E'])
df4.loc[dates[0]:dates[1],'E'] = 1
df4

In [None]:
# drop rows with missing data
df4.dropna(how='any')

In [None]:
# fill missing data
df4.fillna(value=5)

In [None]:
# boolean mask for nan values
pd.isnull(df4)

In [None]:
df.mean()

In [None]:
# pivot the mean calculation
df.mean(1)

In [None]:
# aligning objects with different dimensions
s = pd.Series([1,3,5,np.nan,6,8],index=dates).shift(2)
df.sub(s,axis='index')

In [None]:
# applying functions
df.apply(np.cumsum)

In [None]:
df.apply(lambda x: x.max() - x.min())

In [None]:
# simple count aggregation
s = pd.Series(np.random.randint(0,7,size=10))
s.value_counts()

### Merging / Grouping / Shaping

In [None]:
# concatenation
df = pd.DataFrame(np.random.randn(10, 4))
pieces = [df[:3], df[3:7], df[7:]]
pd.concat(pieces)

In [None]:
# SQL-style join
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
pd.merge(left, right, on='key')

In [None]:
# append: add a copy of row 3 as a new last row
# (pd.concat is used because DataFrame.append was removed in pandas 2.0;
#  the Series is turned into a one-row frame first so it is added as a row)
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
s = df.iloc[3]
pd.concat([df, s.to_frame().T], ignore_index=True)

In [None]:
df = pd.DataFrame(
    { 'A' : ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
      'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
      'C' : np.random.randn(8),
      'D' : np.random.randn(8) })
df

In [None]:
# group by
df.groupby('A').sum()

In [None]:
# group by multiple columns
df.groupby(['A','B']).sum()

In [None]:
df = pd.DataFrame(
    { 'A' : ['one', 'one', 'two', 'three'] * 3,
      'B' : ['A', 'B', 'C'] * 4,
      'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
      'D' : np.random.randn(12),
      'E' : np.random.randn(12)} )
df

In [None]:
# pivot table: values of D laid out with (A, B) as the row index and C as columns
# (the row keys are passed as `index=`; the old `rows=` keyword no longer exists)
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])

In [None]:
# time period resampling: 100 one-second samples summed into 5-minute bins
# (resample() returns a Resampler; the aggregation is a method call, not `how=`)
rng = pd.date_range('1/1/2012', periods=100, freq='S')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts.resample('5Min').sum()

In [None]:
rng = pd.date_range('1/1/2012', periods=5, freq='M')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts

In [None]:
ps = ts.to_period()
ps.to_timestamp()

In [None]:
# time series plot
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
ts = ts.cumsum()
ts.plot()

In [None]:
# plot with a data frame
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=['A', 'B', 'C', 'D'])
df = df.cumsum()
plt.figure(); df.plot(); plt.legend(loc='best')

### Input / Output

In [None]:
# write to a csv file
df.to_csv('foo.csv', index=False)

In [None]:
# read file back in
# same relative path the file was written to with df.to_csv('foo.csv', ...)
path = 'foo.csv'
newDf = pd.read_csv(path)
newDf.head()

In [None]:
# remove the file
import os
os.remove(path)

In [None]:
# can also do Excel
df.to_excel('foo.xlsx', sheet_name='Sheet1')

In [None]:
newDf2 = pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA'])
newDf2.head()

In [None]:
os.remove('foo.xlsx')