# Introduction to Python for Data Science

In [2]:
# Load the MLB player table and pull out the columns used in the cells below
import pandas as pd

mlb = pd.read_csv('../baseball.csv')

# Plain Python lists of every player's height and weight
# (the later cells treat these as inches and pounds)
height, weight = mlb['Height'].tolist(), mlb['Weight'].tolist()

## Your first numpy array

In [3]:
# Wrap the plain Python lists of heights and weights in numpy arrays
import numpy as np

np_height, np_weight = np.array(height), np.array(weight)

# Confirm that both objects really are numpy arrays (one type per line)
print(type(np_height), type(np_weight), sep='\n')

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


## Numpy array operations

In [4]:
# Unlike plain Python lists, arithmetic on a numpy array is applied to every element.
# For example, convert the heights from inches to meters (1 in = 0.0254 m)
np_height_m = np_height * 0.0254
print('Height:', np_height)
# The converted values are in meters, so label them as such
print('Height (m):', np_height_m)

# Convert the weights from pounds to kilograms (1 lb = 0.453592 kg)
np_weight_kg = np_weight * 0.453592
print('Weight:', np_weight)
print('Weight (kg):', np_weight_kg)

# Body mass index, computed element-wise: BMI = weight [kg] / (height [m] ** 2)
np_bmi = np_weight_kg / (np_height_m ** 2)
print('BMI:', np_bmi)

Height: [74 74 72 ..., 75 75 73]
Height (kg): [ 1.8796  1.8796  1.8288 ...,  1.905   1.905   1.8542]
Weight: [180 215 210 ..., 205 190 195]
Weight (kg): [ 81.64656  97.52228  95.25432 ...,  92.98636  86.18248  88.45044]
BMI: [ 23.11037639  27.60406069  28.48080465 ...,  25.62295933  23.74810865
  25.72686361]


In [5]:
# Comparing an array with a scalar yields a boolean array: one True/False per element.
# Here we flag every player whose BMI is below 21
np_lightweight = np.less(np_bmi, 21)
print('Boolean of BMI < 21:', np_lightweight)

# A boolean array can be used as an index (a mask): only the elements at the
# positions where the mask is True are kept
np_bmi_lightweight = np_bmi[np_lightweight]
print('Lightweight players:', np_bmi_lightweight)

Boolean of BMI < 21: [False False False ..., False False False]
Lightweight players: [ 20.54255679  20.54255679  20.69282047  20.69282047  20.34343189
  20.34343189  20.69282047  20.15883472  19.4984471   20.69282047
  20.9205219 ]


## Numpy two-dimensional array

In [20]:
# Combine two 1-dimensional arrays into one 2-dimensional array.
# np.column_stack puts each 1-D array into its own column, giving one row per player:
# column 0 holds the height (m), column 1 the weight (kg). It stays inside numpy
# instead of round-tripping through a Python list of tuples built with zip().
np_baseball = np.column_stack((np_height_m, np_weight_kg))
print(np_baseball)

# Print out the type of np_baseball
print(type(np_baseball))

# Print out the shape of np_baseball as (number of rows, number of columns)
print(np_baseball.shape)

[[  1.8796   81.64656]
 [  1.8796   97.52228]
 [  1.8288   95.25432]
 ..., 
 [  1.905    92.98636]
 [  1.905    86.18248]
 [  1.8542   88.45044]]
<class 'numpy.ndarray'>
(1015, 2)


## Subsetting 2D Numpy array

In [21]:
# Subsetting a 2D numpy array works much like subsetting Python lists:
# the index before the comma selects rows, the index after the comma selects columns.
# Indexing is zero-based, so the 50th row lives at index 49.

# Print out the 50th row of np_baseball
print('50th row: ', np_baseball[49])

# Select the entire second column of np_baseball (the weights in kg)
# NOTE(review): this rebinds np_weight, which earlier held the weights in pounds;
# re-running the pounds-to-kilogram conversion cell after this one would convert twice.
np_weight = np_baseball[:,1]
print('Only weight: ',np_weight)

# Print out the height of the 124th player (row index 123, column 0)
print('Height of 124th player: ', np_baseball[123,0])

50th row:  [  1.778    88.45044]
Only weight:  [ 81.64656  97.52228  95.25432 ...,  92.98636  86.18248  88.45044]
Height of 142th player:  1.905


## 2D Arithmetic

In [37]:
# Arithmetic between a 2D array and a 1D array is broadcast:
# the 1D array is applied to every row of the 2D array.
# First let's create a numpy array with height, weight, and age (one row per player).
# The columns are read straight from the `mlb` DataFrame instead of the `height` and
# `weight` lists: `height` is rebound to an array of converted heights by the statistics
# cell further down, so building from the list would convert the heights twice whenever
# this cell is re-run after that one.
age = mlb['Age'].tolist()
np_baseball_updated = mlb[['Height', 'Weight', 'Age']].values
print(np_baseball_updated)

# Then we would like to convert:
# height from inches --> m
# weight from pounds --> kg
# age --> unchanged
# One conversion factor per column, in the same order as the columns
conversion = np.array([0.0254, 0.453592, 1])

# Broadcasting multiplies every row element-wise by the conversion factors
np_baseball_converted = np_baseball_updated * conversion
print(np_baseball_converted)

[[  74.    180.     22.99]
 [  74.    215.     34.69]
 [  72.    210.     30.78]
 ..., 
 [  75.    205.     25.19]
 [  75.    190.     31.01]
 [  73.    195.     27.92]]
[[  1.8796   81.64656  22.99   ]
 [  1.8796   97.52228  34.69   ]
 [  1.8288   95.25432  30.78   ]
 ..., 
 [  1.905    92.98636  25.19   ]
 [  1.905    86.18248  31.01   ]
 [  1.8542   88.45044  27.92   ]]


# Basic Statistics

## Average vs Median

In [44]:
# numpy provides the basic descriptive statistics out of the box.
# Start by pulling the height column (column 0) out of the converted array.
# NOTE(review): this rebinds `height`, which the first cell defined as the plain
# list of heights read from the CSV.
height = np_baseball_converted[:, 0]
print("Height of all baseball players:", height)

# Arithmetic mean of the heights
print("Mean:", height.mean())

# Median of the heights
print("Median:", np.median(height))

# Standard deviation of the heights
print("Standard deviation:", height.std())

# Do big players tend to be heavier? Correlate height (column 0) with weight (column 1);
# np.corrcoef returns the full 2x2 correlation matrix
corr = np.corrcoef(height, np_baseball_converted[:, 1])
print("Correlation:", corr)

Height of all baseball players: [ 1.8796  1.8796  1.8288 ...,  1.905   1.905   1.8542]
Mean: 1.87171724138
Median: 1.8796
Standard deviation: 0.0587449137786
Correlation: [[ 1.          0.53153932]
 [ 0.53153932  1.        ]]
