# Chapter 2. Python Environments (Python Data Science Cookbook)

In this chapter, we will cover the following recipes:
- Using NumPy libraries
- Plotting with matplotlib
- Machine learning with scikit-learn

### Using NumPy libraries

NumPy provides an efficient way of handling very large arrays in Python. Most of the
Python scientific libraries use NumPy internally for their array and matrix operations.

In [1]:
#Recipe_1a.py
# Importing numpy as np
import numpy as np

# Creating arrays: build an array straight from a Python list
a_list = [1,2,3]
an_array = np.array(a_list)
# print(<single argument>) emits the same text on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(an_array)
# Specify the datatype explicitly to get floating point elements
an_array = np.array(a_list,dtype=float)
print(an_array)

# Creating matrices from a list of row lists
# NOTE(review): np.matrix is kept because this recipe demonstrates it, but
# NumPy's own documentation no longer recommends the class for new code.
a_listoflist = [[1,2,3],[5,6,7],[8,9,10]]
a_matrix = np.matrix(a_listoflist,dtype=float)
print(a_matrix)

[1 2 3]
[ 1.  2.  3.]
[[  1.   2.   3.]
 [  5.   6.   7.]
 [  8.   9.  10.]]


In [2]:
#Recipe_1b.py

# A simple function to examine given numpy object
def display_shape(a):
    """Print a NumPy object followed by its element count, rank and shape.

    Parameters
    ----------
    a : object exposing ``size``, ``ndim`` and ``shape``
        For example a ``numpy.ndarray`` or a ``numpy.matrix``.

    The report is written to standard output; nothing is returned.
    """
    # Each call is print(<single argument>), so the function emits the same
    # text under Python 2 (print statement) and Python 3 (print function).
    print("")
    print(a)
    print("")
    # NOTE: "Nuber" is the spelling this function has always emitted; it is
    # left as is so the output keeps matching the transcripts recorded below.
    print("Nuber of elements in a = %d"%(a.size))
    print("Number of dimensions in a = %d"%(a.ndim))
    # The original literal ended with a space and the Python 2 print
    # statement inserted a second one before the tuple; both are kept.
    print("Rows and Columns in a  %s"%(a.shape,))
    print("")

# Report on the 3 x 3 float matrix built in Recipe_1a
display_shape(a_matrix)


[[  1.   2.   3.]
 [  5.   6.   7.]
 [  8.   9.  10.]]

Nuber of elements in a = 9
Number of dimensions in a = 2
Rows and Columns in a  (3, 3)



In [4]:
#Recipe_1c.py
# Other constructors that build an array from a numeric range

# 1. np.arange: the values 1 through 9 (the stop value 10 is not included)
created_array = np.arange(1, 10, dtype=float)
display_shape(created_array)

# 2. np.linspace: evenly spaced samples from 1 to 10, both ends included.
#    num=50 is the library default, spelled out here.
created_array = np.linspace(1, 10, num=50)
display_shape(created_array)

# 3. np.logspace: samples spaced evenly on a log scale, starting at 10**1.
#    num=50 is again the library default.
created_array = np.logspace(1, 10, num=50, base=10.0)
display_shape(created_array)

# np.arange takes a step size (2 here) rather than a sample count,
# which is what sets it apart from np.linspace
created_array = np.arange(1, 10, 2, dtype=int)
display_shape(created_array)



[ 1.  2.  3.  4.  5.  6.  7.  8.  9.]

Nuber of elements in a = 9
Number of dimensions in a = 1
Rows and Columns in a  (9,)


[  1.           1.18367347   1.36734694   1.55102041   1.73469388
   1.91836735   2.10204082   2.28571429   2.46938776   2.65306122
   2.83673469   3.02040816   3.20408163   3.3877551    3.57142857
   3.75510204   3.93877551   4.12244898   4.30612245   4.48979592
   4.67346939   4.85714286   5.04081633   5.2244898    5.40816327
   5.59183673   5.7755102    5.95918367   6.14285714   6.32653061
   6.51020408   6.69387755   6.87755102   7.06122449   7.24489796
   7.42857143   7.6122449    7.79591837   7.97959184   8.16326531
   8.34693878   8.53061224   8.71428571   8.89795918   9.08163265
   9.26530612   9.44897959   9.63265306   9.81632653  10.        ]

Nuber of elements in a = 50
Number of dimensions in a = 1
Rows and Columns in a  (50,)


[  1.00000000e+01   1.52641797e+01   2.32995181e+01   3.55648031e+01
   5.42867544e+01   8.28642773e+01   1.26485522e+02  

In [7]:
#Recipe_1d.py

# 3 x 3 matrix with every element set to 1
ones_matrix = np.ones(shape=(3, 3))
display_shape(ones_matrix)
# 3 x 3 matrix with every element set to 0
zeros_matrix = np.zeros(shape=(3, 3))
display_shape(zeros_matrix)

# Identity matrix
# np.eye(3) is square (M defaults to N) with k left at its default of 0,
# so cells (0,0), (1,1) and (2,2) hold 1 and everything else is 0
identity_matrix = np.eye(3)
display_shape(identity_matrix)

# k=1 moves the 1s one diagonal above the main one: cells (0,1) and (1,2)
identity_matrix = np.eye(3, k=1)
display_shape(identity_matrix)


                        


[[ 1.  1.  1.]
 [ 1.  1.  1.]
 [ 1.  1.  1.]]

Nuber of elements in a = 9
Number of dimensions in a = 2
Rows and Columns in a  (3, 3)


[[ 0.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]]

Nuber of elements in a = 9
Number of dimensions in a = 2
Rows and Columns in a  (3, 3)


[[ 1.  0.  0.]
 [ 0.  1.  0.]
 [ 0.  0.  1.]]

Nuber of elements in a = 9
Number of dimensions in a = 2
Rows and Columns in a  (3, 3)


[[ 0.  1.  0.]
 [ 0.  0.  1.]
 [ 0.  0.  0.]]

Nuber of elements in a = 9
Number of dimensions in a = 2
Rows and Columns in a  (3, 3)



In [9]:
#Recipe_1e.py
# Array shaping: lay the integers 0..8 out as 3 rows by 3 columns
a_matrix = np.reshape(np.arange(9), (3, 3))
display_shape(a_matrix)



[[0 1 2]
 [3 4 5]
 [6 7 8]]

Nuber of elements in a = 9
Number of dimensions in a = 2
Rows and Columns in a  (3, 3)



In [10]:
#Recipe_1f.py
# Matrix operations
a_matrix = np.arange(9).reshape(3,3)
# axis=1 sums along each row, giving one total per row.
# The label and the value are merged into one string so that the line prints
# identically on Python 2 and Python 3 (a two-argument print statement is a
# SyntaxError on Python 3).
print("a_matrix, row sum %s" % (a_matrix.sum(axis=1),))

a_matrix, row sum [ 3 12 21]


In [12]:
#Recipe_1g.py
# Reversing the order of the rows (the first axis); the elements inside
# each row keep their order
display_shape(np.flipud(a_matrix))


[[6 7 8]
 [3 4 5]
 [0 1 2]]

Nuber of elements in a = 9
Number of dimensions in a = 2
Rows and Columns in a  (3, 3)



In [14]:
# Collapse the matrix into a one-dimensional copy of its elements
zz = a_matrix.flatten()
# print(<single argument>) works on both Python 2 and Python 3
print(zz)

[0 1 2 3 4 5 6 7 8]


In [16]:
#Recipe_1h.py
# Random numbers
# Ten random integers, each at least 1 and below 100
general_random_numbers = np.random.randint(1,100, size=10)
# print(<single argument>) works on both Python 2 and Python 3
print(general_random_numbers)
# A 3 x 3 array drawn from a normal (Gaussian) distribution with
# mean 0.2 (loc) and standard deviation 0.2 (scale)
normal_rnd_numbers = np.random.normal(loc=0.2,scale=0.2,size=(3,3))
# The values are normal, not uniform; the original (misleading) name is kept
# as an alias in case other code refers to it.
uniform_rnd_numbers = normal_rnd_numbers
print(normal_rnd_numbers)

[82 51  9 25 90 91  7 71 67 29]
[[ 0.11641715  0.19456909  0.19537803]
 [ 0.36285832  0.30009289  0.17719462]
 [ 0.06687779  0.25381162  0.08164304]]


In [18]:
# Three ways of collapsing a 2-D array into a 1-D one
# 1. reshape: passing -1 lets NumPy work out the length of that dimension
back_to_array = np.reshape(a_matrix, -1)
display_shape(back_to_array)

# 2. ravel
a_matrix = np.reshape(np.arange(9), (3, 3))
back_array = a_matrix.ravel()
display_shape(back_array)

# 3. flatten
a_matrix = np.reshape(np.arange(9), (3, 3))
back_array = a_matrix.flatten()
display_shape(back_array)


[0 1 2 3 4 5 6 7 8]

Nuber of elements in a = 9
Number of dimensions in a = 1
Rows and Columns in a  (9,)


[0 1 2 3 4 5 6 7 8]

Nuber of elements in a = 9
Number of dimensions in a = 1
Rows and Columns in a  (9,)


[0 1 2 3 4 5 6 7 8]

Nuber of elements in a = 9
Number of dimensions in a = 1
Rows and Columns in a  (9,)



In [20]:
# Arithmetic between two 3 x 3 arrays
b_matrix = np.reshape(np.arange(9), (3, 3))
# Element-wise sum
c_matrix = a_matrix + b_matrix
display_shape(c_matrix)
# Element-wise product (not the matrix product)
d_matrix = a_matrix * b_matrix
display_shape(d_matrix)
# Matrix product
e_matrix = a_matrix.dot(b_matrix)
display_shape(e_matrix)
# Transpose of the matrix product
f_matrix = e_matrix.transpose()
display_shape(f_matrix)


[[ 0  2  4]
 [ 6  8 10]
 [12 14 16]]

Nuber of elements in a = 9
Number of dimensions in a = 2
Rows and Columns in a  (3, 3)


[[ 0  1  4]
 [ 9 16 25]
 [36 49 64]]

Nuber of elements in a = 9
Number of dimensions in a = 2
Rows and Columns in a  (3, 3)


[[ 15  18  21]
 [ 42  54  66]
 [ 69  90 111]]

Nuber of elements in a = 9
Number of dimensions in a = 2
Rows and Columns in a  (3, 3)


[[ 15  42  69]
 [ 18  54  90]
 [ 21  66 111]]

Nuber of elements in a = 9
Number of dimensions in a = 2
Rows and Columns in a  (3, 3)



In [21]:
# Reversing the order of the rows; every column is kept as is
display_shape(f_matrix[::-1, :])


[[ 21  66 111]
 [ 18  54  90]
 [ 15  42  69]]

Nuber of elements in a = 9
Number of dimensions in a = 2
Rows and Columns in a  (3, 3)



In [23]:
# As with other Python objects, assignment only binds another name to the
# same array; call copy() when an independent duplicate is needed.
# order='C' is the default of ndarray.copy, spelled out here.
f_copy = f_matrix.copy(order='C')
display_shape(f_copy)


[[ 15  42  69]
 [ 18  54  90]
 [ 21  66 111]]

Nuber of elements in a = 9
Number of dimensions in a = 2
Rows and Columns in a  (3, 3)



In [25]:
# Grid commands
# Indexing np.mgrid with three slices yields three 3 x 3 x 3 arrays,
# one per axis
xx, yy, zz = np.mgrid[:3, :3, :3]
display_shape(xx)
display_shape(yy)
display_shape(zz)
# Collapse each grid to one dimension, then report on them in the same order
xx, yy, zz = xx.flatten(), yy.flatten(), zz.flatten()
display_shape(xx)
display_shape(yy)
display_shape(zz)


[[[0 0 0]
  [0 0 0]
  [0 0 0]]

 [[1 1 1]
  [1 1 1]
  [1 1 1]]

 [[2 2 2]
  [2 2 2]
  [2 2 2]]]

Nuber of elements in a = 27
Number of dimensions in a = 3
Rows and Columns in a  (3, 3, 3)


[[[0 0 0]
  [1 1 1]
  [2 2 2]]

 [[0 0 0]
  [1 1 1]
  [2 2 2]]

 [[0 0 0]
  [1 1 1]
  [2 2 2]]]

Nuber of elements in a = 27
Number of dimensions in a = 3
Rows and Columns in a  (3, 3, 3)


[[[0 1 2]
  [0 1 2]
  [0 1 2]]

 [[0 1 2]
  [0 1 2]
  [0 1 2]]

 [[0 1 2]
  [0 1 2]
  [0 1 2]]]

Nuber of elements in a = 27
Number of dimensions in a = 3
Rows and Columns in a  (3, 3, 3)


[0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2]

Nuber of elements in a = 27
Number of dimensions in a = 1
Rows and Columns in a  (27,)


[0 0 0 1 1 1 2 2 2 0 0 0 1 1 1 2 2 2 0 0 0 1 1 1 2 2 2]

Nuber of elements in a = 27
Number of dimensions in a = 1
Rows and Columns in a  (27,)


[0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2]

Nuber of elements in a = 27
Number of dimensions in a = 1
Rows and Columns in a 

### Plotting with matplotlib

In this recipe, we will introduce basic plotting mechanisms using pyplot. We will use
pyplot in almost all our recipes for visualization in this book.

In [28]:
#Recipe_2a.py
import numpy as np
import matplotlib.pyplot as plt
def simple_line_plot(x,y,figure_no):
    """Plot y against x with pyplot's default line style.

    The plot goes on the figure numbered ``figure_no``, with the axes
    labelled 'x values' / 'y values' and the title 'Simple Line'.
    The function does not call plt.show(); that is left to the caller.
    """
    plt.figure(num=figure_no)
    plt.plot(x, y)
    # Labels and title are independent of each other
    plt.title('Simple Line')
    plt.ylabel('y values')
    plt.xlabel('x values')

def simple_dots(x,y,figure_no):
    """Plot y against x as unconnected red circles.

    The plot goes on the figure numbered ``figure_no``, with the axes
    labelled 'x values' / 'y values' and the title 'Simple Dots'.
    The function does not call plt.show(); that is left to the caller.
    """
    plt.figure(num=figure_no)
    # Format string 'or': 'o' selects circle markers, 'r' the colour red
    plt.plot(x, y, 'or')
    plt.title('Simple Dots')
    plt.ylabel('y values')
    plt.xlabel('x values')

def simple_scatter(x,y,figure_no):
    """Draw a scatter plot of the points (x, y).

    The plot goes on the figure numbered ``figure_no``, with the axes
    labelled 'x values' / 'y values' and the title 'Simple scatter'.
    The function does not call plt.show(); that is left to the caller.
    """
    plt.figure(num=figure_no)
    plt.scatter(x, y)
    plt.title('Simple scatter')
    plt.ylabel('y values')
    plt.xlabel('x values')

def scatter_with_color(x,y,labels,figure_no):
    """Draw a scatter plot of (x, y) coloured according to ``labels``.

    ``labels`` is handed to plt.scatter as its ``c`` argument. The plot goes
    on the figure numbered ``figure_no``, with the axes labelled
    'x values' / 'y values' and the title 'Scatter with color'.
    The function does not call plt.show(); that is left to the caller.
    """
    plt.figure(num=figure_no)
    plt.scatter(x, y, c=labels)
    plt.title('Scatter with color')
    plt.ylabel('y values')
    plt.xlabel('x values')

if __name__ == "__main__":
    # Close any figure left open by an earlier run
    plt.close('all')
    # Sample x y data for line and simple dot plots: y is x squared.
    # np.power is applied to the whole array in one vectorized call instead
    # of once per element inside a Python-level loop.
    x = np.arange(1,100,dtype=float)
    y = np.power(x,2)
    # Each plot gets its own figure number, starting at 1
    figure_no=1
    simple_line_plot(x,y,figure_no)
    figure_no+=1
    simple_dots(x,y,figure_no)
    # Sample x,y data for scatter plot: 100 uniformly distributed points
    x = np.random.uniform(size=100)
    y = np.random.uniform(size=100)
    figure_no+=1
    simple_scatter(x,y,figure_no)
    figure_no+=1
    # One random label (0 or 1) per point, used to colour the scatter plot
    label = np.random.randint(2,size=100)
    scatter_with_color(x,y,label,figure_no)
    plt.show()


  if self._edgecolors == str('face'):


In [29]:
#Recipe_2b.py
import numpy as np
import matplotlib.pyplot as plt
def x_y_axis_labeling(x,y,x_labels,y_labels,figure_no):
    """Plot (x, y) as red plus markers and put text labels on the ticks.

    A tick is placed at every value in ``x`` and ``y`` and captioned with the
    matching entry of ``x_labels`` / ``y_labels``. The x captions are drawn
    vertically. The plot goes on the figure numbered ``figure_no``.
    """
    plt.figure(num=figure_no)
    # Format string '+r': '+' selects plus markers, 'r' the colour red
    plt.plot(x, y, '+r')
    # Margin of 0.2 around the data
    plt.margins(0.2)
    plt.yticks(y, y_labels)
    plt.xticks(x, x_labels, rotation='vertical')

def plot_heat_map(x,figure_no):
    """Draw ``x`` as a pseudocolor plot with a colour bar.

    The plot goes on the figure numbered ``figure_no``. ``x`` is passed
    straight to plt.pcolor.
    """
    plt.figure(num=figure_no)
    plt.pcolor(x)
    # Legend that maps the colours back to values
    plt.colorbar()

if __name__ == "__main__":
    # Close any figure left open by an earlier run
    plt.close('all')
    # Five points: x runs 1..5, y runs 100..500 in steps of 100
    x = np.arange(1, 6)
    y = np.arange(100, 600, 100)
    # One caption per tick on each axis
    x_label = [
        'element 1',
        'element 2',
        'element 3',
        'element 4',
        'element 5',
    ]
    y_label = [
        'weight1',
        'weight2',
        'weight3',
        'weight4',
        'weight5',
    ]
    x_y_axis_labeling(x, y, x_label, y_label, 1)
    # 10 x 10 normally distributed values for the heat map on figure 2
    x = np.random.normal(loc=0.5, scale=0.2, size=(10, 10))
    plot_heat_map(x, 2)
    plt.show()

### Machine learning with scikit-learn

Scikit-learn is a versatile machine learning library in Python. In this recipe, we will demonstrate some of the capabilities of scikit-learn and learn about its API organization so that we can follow it seamlessly in our future recipes.
Scikit-learn provides us with built-in datasets. Let’s see how to access these datasets and
use them:

In [30]:
#Recipe_3a.py
# All imports for this recipe are gathered here; the original repeated the
# numpy and matplotlib imports twice in the middle of the script.
from sklearn.datasets import load_iris,load_boston,make_classification,make_circles, make_moons
import numpy as np
import matplotlib.pyplot as plt

# Every print below is print(<single argument>), which emits the same text
# on Python 2 and Python 3; print("") stands in for the bare print statement.

# Iris dataset: the loader's result is indexed by key like a dictionary
data = load_iris()
x = data['data']
y = data['target']
y_labels = data['target_names']
x_labels = data['feature_names']

print("")
print(x.shape)
print(y.shape)
print(x_labels)
print(y_labels)

# Boston dataset
# NOTE(review): load_boston was removed from scikit-learn in release 1.2, so
# this import and call only work with older releases. Confirm the target
# version before running.
data = load_boston()
x = data['data']
y = data['target']
x_labels = data['feature_names']
print("")
print(x.shape)
print(y.shape)
print(x_labels)

# make some classification dataset: 50 samples, 5 features, 2 classes
x,y = make_classification(n_samples=50,n_features=5, n_classes=2)
print("")
print(x.shape)
print(y.shape)
# Feature vector and class label of the second sample
print(x[1,:])
print(y[1])

# Some non linear dataset
x,y = make_circles()
plt.close('all')
plt.figure(1)
# Plot the first feature against the second, coloured by class label
plt.scatter(x[:,0],x[:,1],c=y)
x,y = make_moons()
plt.figure(2)
plt.scatter(x[:,0],x[:,1],c=y)
plt.show()



(150, 4)
(150,)
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
['setosa' 'versicolor' 'virginica']

(506, 13)
(506,)
['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']

(50, 5)
(50,)
[ 1.83780121  0.21116232 -2.04936456  0.42778782 -1.78072325]
1


In [32]:
#Recipe_3b.py
# All imports for this recipe are gathered here instead of being scattered
# through the script.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier

# Every print below is print(<single argument>), which emits the same text
# on Python 2 and Python 3; print("") stands in for the bare print statement.

# Data Preprocessing routines
# A plain ndarray is used rather than np.asmatrix: scikit-learn estimators
# reject np.matrix input from release 1.2 onwards. Shape and printed form
# of x are the same either way.
x = np.asarray([[1,2],[2,4]])
poly = PolynomialFeatures(degree = 2)
# Two-step form: learn the feature expansion, then apply it
poly.fit(x)
x_poly = poly.transform(x)

print("Original x variable shape %s" % (x.shape,))
print(x)
print("")
print("Transformed x variables %s" % (x_poly.shape,))
print(x_poly)

#alternatively, fit and transform in a single call
x_poly = poly.fit_transform(x)

# Estimator API: fit on the data, then predict
data = load_iris()
x = data['data']
y = data['target']
estimator = DecisionTreeClassifier()
estimator.fit(x,y)
# The predictions are made on the same samples the tree was fitted on
predicted_y = estimator.predict(x)
predicted_y_prob = estimator.predict_proba(x)
predicted_y_lprob = estimator.predict_log_proba(x)

print('DecisionTreeClassifier')
print(predicted_y)
print(predicted_y_prob)
print(predicted_y_lprob)

# Pipeline: chain the polynomial expansion and the tree into one estimator
poly = PolynomialFeatures(degree=3)
tree_estimator = DecisionTreeClassifier()
steps = [('poly',poly),('tree',tree_estimator)]
estimator = Pipeline(steps=steps)
estimator.fit(x,y)
predicted_y = estimator.predict(x)

print('DecisionTreeClassifier Pipeline')
print(predicted_y)

Original x variable shape (2, 2)
[[1 2]
 [2 4]]

Transformed x variables (2, 6)
[[ 1  1  2  1  2  4]
 [ 1  2  4  4  8 16]]
DecisionTreeClassifier
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
[[ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0

There are a lot more dataset creation functions available in scikit-learn. Refer to the
following link:
http://scikit-learn.org/stable/datasets/
While creating nonlinear datasets using make_circles and make_moons, we mentioned that
a lot of desired properties can be added to the dataset. The data can be corrupted slightly
by inducing incorrect class labels. Refer to the following links for a list of options that are
available in order to introduce such nuances in the data:
- http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_circles.html

- http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_moons.html