In [1]:
import pandas as pd
import numpy as np

## Arrays of randomly generated data

In [2]:
# Seed NumPy's legacy global generator so the same "random" numbers come out
# on every run, then draw a 5-row by 3-column array of floats.
np.random.seed(146)
n_rows, n_cols = 5, 3
arr = np.random.random((n_rows, n_cols))
print(arr)

[[0.49810944 0.61890697 0.29982061]
 [0.26070284 0.33385462 0.90236071]
 [0.36283182 0.00634385 0.94479603]
 [0.760953   0.60299465 0.27470333]
 [0.98892395 0.42572124 0.74183254]]


In [3]:
#Find the element in 3rd row 2nd column
# Indices start at 0, so the 3rd row is index 2 and the 2nd column is index 1.
arr[2, 1]

0.0063438478821165045

In [4]:
#Get the third row
# Indices start at 0, so index 2 is the third row; the bare ":" keeps every column.
arr[2, :]

array([0.36283182, 0.00634385, 0.94479603])

In [5]:
#Get the second column
# The bare ":" keeps every row; index 1 is the second column (indices start at 0).
arr[:, 1]

array([0.61890697, 0.33385462, 0.00634385, 0.60299465, 0.42572124])

In [6]:
#Change the last entry to 1
# Negative indices count from the end, so [-1, -1] is the last row, last column.
# The assignment edits arr in place, so every later cell sees the new value.
arr[-1, -1] = 1
print(arr)

[[0.49810944 0.61890697 0.29982061]
 [0.26070284 0.33385462 0.90236071]
 [0.36283182 0.00634385 0.94479603]
 [0.760953   0.60299465 0.27470333]
 [0.98892395 0.42572124 1.        ]]


In [7]:
#Change the last entry to 'turtle'
#What do you notice?

# arr holds floats, so NumPy has to convert whatever is assigned into a float.
# A word cannot be converted, so the assignment raises ValueError. The error is
# the point of this cell, so it is caught and printed rather than left to stop
# a "Restart & Run All"; arr itself is left unchanged.
try:
    arr[-1, -1] = "turtle"
except ValueError as err:
    print(f"ValueError: {err}")
else:
    # Only reached if the assignment unexpectedly succeeds.
    print(arr)

ValueError: could not convert string to float: 'turtle'

In [8]:
#Rearrange the 5x3 array into 3 rows, 5 columns
# reshape keeps the elements in their row-by-row reading order and only changes
# where the row breaks fall, so this is not the same thing as a transpose.
target_shape = (3, 5)
new_arr = np.reshape(arr, target_shape)
print(new_arr)

[[0.49810944 0.61890697 0.29982061 0.26070284 0.33385462]
 [0.90236071 0.36283182 0.00634385 0.94479603 0.760953  ]
 [0.60299465 0.27470333 0.98892395 0.42572124 1.        ]]


In [9]:
#Put everything into a 1D array
#We prefer flatten to ravel
# flatten returns a new array, reading the values out row by row ("C" order).
flat = arr.flatten(order="C")
print(flat)


[0.49810944 0.61890697 0.29982061 0.26070284 0.33385462 0.90236071
 0.36283182 0.00634385 0.94479603 0.760953   0.60299465 0.27470333
 0.98892395 0.42572124 1.        ]


In [12]:
#Find the mean in the original array

# axis names the dimension that gets collapsed while averaging.
print(arr.mean())          # mean over every element -> a single number
print(arr.mean(axis=0))    # collapse the rows -> one mean per column
print(arr.mean(axis=1))    # collapse the columns -> one mean per row


0.5520682036776914
[0.57430421 0.39756427 0.68433614]
[0.472279   0.49897272 0.43799056 0.546217   0.80488173]


## Simulating dice rolls: from arrays to data frames

In [2]:
# Set-up for the dice experiment:
#   n_sides - how many sides each die has
#   n_dice  - how many dice there are
#   n_rolls - how many times we roll them
n_sides, n_dice, n_rolls = 6, 5, 1000

In [3]:
# Fix the seed so the simulated rolls are the same on every run.
np.random.seed(182)
# One row per roll, one column per die.
roll_shape = (n_rolls, n_dice)
# randint excludes the upper bound, hence n_sides + 1 to include the top face.
rolls = np.random.randint(1, n_sides + 1, size=roll_shape)

In [9]:
rolls.shape

(1000, 5)

In [14]:
# Add across the columns to get the total for each row of rolls.
rolls.sum(axis=1)

array([15, 14, 15, 12, 18, 22, 13, 21, 21, 15, 20, 12, 18,  9, 20, 11, 16,
       26, 19, 24, 13, 19, 23, 21, 10, 20, 19, 13, 20, 22, 21, 13, 22, 18,
       14, 10, 10, 16, 16, 21, 23, 19, 19, 16, 15, 14, 18, 17, 15, 14, 20,
       16, 15, 23, 16, 19, 20, 17, 21, 19, 13, 16, 19, 22, 22, 19, 24, 19,
       18, 15, 13, 17, 19, 16, 14, 18, 24, 18, 15, 15, 15, 19, 23, 10, 18,
       18, 19, 16, 19, 20, 18, 20, 17, 17, 12, 20, 21, 15, 14, 13, 14, 16,
       18, 22, 22, 12, 20, 19, 18, 22, 20, 13, 23, 17, 15, 12, 22, 16, 21,
       12, 12, 19, 25, 14, 23, 16, 26, 17, 14, 15, 19, 18, 13, 18, 22, 17,
       17, 21, 16, 21, 15, 14, 21, 20, 19, 25, 17, 14, 17, 21, 19, 18,  7,
       18, 17, 12, 18, 17, 23, 19, 15, 16, 17, 18, 19, 14, 18, 15, 20, 11,
       16, 19, 14, 21, 17, 13, 18, 17, 18, 13, 20, 18, 14, 19, 17, 16, 17,
       23, 21, 20, 17, 16, 19, 16, 18, 20, 15, 21, 12, 14, 12, 21, 16, 19,
       18, 19, 23, 13, 12, 20, 24,  9, 11, 22, 14, 17, 21, 20, 14, 19, 18,
       16, 19, 19, 19, 16

In [13]:
# Comparing gives True/False per row; summing counts the Trues,
# i.e. how many rows have a total below 25.
(rolls.sum(axis=1) < 25).sum()

968

In [18]:
# Keep only the rows whose total is below 25 (boolean mask), then check the size.
under_25 = rolls.sum(axis=1) < 25
rolls[under_25].shape

(968, 5)

In [15]:
#look at first row
# Row index 0, every column.
rolls[0, :]

array([4, 3, 2, 5, 1])

In [16]:
#look at first 10 rows
# A slice with no start begins at row 0; the stop (10) is excluded.
rolls[:10]

array([[4, 3, 2, 5, 1],
       [2, 2, 2, 5, 6],
       [6, 3, 5, 2, 3],
       [4, 2, 4, 4, 2],
       [4, 5, 2, 1, 4],
       [5, 4, 3, 6, 5],
       [1, 4, 4, 3, 3],
       [6, 6, 1, 2, 1],
       [4, 1, 1, 6, 5],
       [1, 1, 1, 1, 6]])

In [17]:
#turn this into a data frame
# Build one label per column ("d1", "d2", ...) from the array's own width, so
# the labels always match the number of dice instead of being fixed at five.
die_labels = [f"d{i}" for i in range(1, rolls.shape[1] + 1)]
my_rolls = pd.DataFrame(data=rolls, columns=die_labels)

In [18]:
# Preview the first five rows of the data frame.
my_rolls.head(5)

Unnamed: 0,d1,d2,d3,d4,d5
0,4,3,2,5,1
1,2,2,2,5,6
2,6,3,5,2,3
3,4,2,4,4,2
4,4,5,2,1,4


In [None]:
#Let's get the 10th set of rolls
# Plain brackets on a data frame look up a column label, and the columns here
# are named d1..d5, so my_rolls[9] raises KeyError. .iloc selects by position:
# position 9 is the 10th row.
my_rolls.iloc[9]

In [None]:
#get all of the rolls from the data frame where the first roll was a 1 and the last roll
#was not a 6


In [None]:
#get the 10th roll from this subset


In [None]:
#Why didn't this work?

In [None]:
#find the mean for each column (one value per die, averaged over all the rolls)
#nonsensical but to illustrate something
# axis=0 collapses the rows, which is also what mean() does by default.
my_rolls.mean(axis=0)

## From Data Frames to Arrays

One of the most common things we will do when building models is separate out features (explanatory variables) from our response (dependent variable) and any other metadata (information about the observations). Commonly we will have the data in a data frame and will need to convert it into two arrays for analysis.  

Many of the errors you will have when you create models will be related to improper formatting of the input data. That's why we will talk about it now!

In [None]:
# Load the gapminder file; its fields are separated by tabs, not commas.
# The path is relative to the directory the notebook is run from.
gapminder_path = './data/gapminder.tsv'
data = pd.read_csv(gapminder_path, sep='\t')

In [None]:
# Preview the first five rows to see the columns that were loaded.
data.head(5)

In [None]:
#We will look only at 2002 and Americas to create a smallish data set- you tell me!


In [None]:
# NOTE(review): data_2002 is not created in any visible cell; presumably the
# exercise cell just before this one is meant to define it. Confirm before running.
data_2002.head(5)

In [None]:
#we want to separate out population and gdpPercap as features
#and make them an array


In [None]:
#Suppose that we are trying to predict life expectancy
#We want to separate that out and put it in its own array


Notice that because we only had one column of data, this array is one-dimensional: everything is listed in one big "list". When you do analysis, the modeling code will often want this organized in the same rows-and-columns layout as the features, i.e., with each observation in its own row (a two-dimensional array with a single column).