In [1]:
import numpy
# NumPy provides multidimensional arrays and matrices plus math functions that operate on them.
# An array is like a list, except that every element must share one type (int, float, str).
numbers = [1, 2, 3, 4, 5]
# Arithmetic mean (average) of the list; as the cell's last expression it is shown as the output.
numpy.mean(numbers)

3.0

In [2]:
# Median (the middle value of the sorted data) of the `numbers` list defined in the first cell.
numpy.median(numbers)

3.0

In [3]:
# Standard deviation of `numbers`.
# ddof=0 is NumPy's default, written out here to make clear this is the population
# figure (squared deviations divided by N, not N - 1).
numpy.std(numbers, ddof=0)

1.4142135623730951

In [4]:
# A NumPy array can be one-dimensional ...
array = numpy.array([1, 2, 3, 4, 5], dtype=float)  # dtype=float stores every element as a float
print(array)
# ... or two-dimensional: a list of equal-length rows (2 rows x 3 columns here)
array_2D = numpy.array([[1, 2, 3],
                        [4, 5, 6]])
print(array_2D)

[1. 2. 3. 4. 5.]
[[1 2 3]
 [4 5 6]]


In [5]:
# NumPy arrays can be indexed, sliced and manipulated just like Python lists;
# they can also be added, subtracted, multiplied and combined with a dot product.
smaller_array = numpy.array([1, 2])
# length-2 vector times the 2 x 3 array_2D from the previous cell -> length-3 vector
row_combination = numpy.dot(smaller_array, array_2D)
print(row_combination)
first_element = smaller_array[0]  # plain list-style indexing
print(first_element)

[ 9 12 15]
1


In [6]:
import pandas as pd
# A pandas DataFrame is a 2D labeled structure whose columns may hold different types,
# much like a spreadsheet.
# One way to build it: a dictionary (d) of key:value pairs in which each key is a column
# name and each value is a pandas Series, optionally with an array of index labels.
name_column = pd.Series(['JD', 'Dan', 'Maddie'], index=['a', 'b', 'c'])
age_column = pd.Series([6, 4, 2], index=['a', 'b', 'c'])
d = {'name': name_column, 'age': age_column}
# Pass the finished dictionary to the DataFrame constructor.
df = pd.DataFrame(d)
print(df)

     name  age
a      JD    6
b     Dan    4
c  Maddie    2


In [7]:
# A Series is a 1D object comparable to an array, a list or a single column.
# Without explicit labels pandas numbers the entries 0..N-1; here the labels
# 'first' and 'last' are supplied as the dictionary keys.
series = pd.Series({'first': 'Dave', 'last': 'Thomas'})
print(series['first'])    # a single label returns the stored value
print(series[['first']])  # a list of labels returns a (one-element) Series

Dave
first    Dave
dtype: object


In [8]:
# Boolean comparisons can be used to pick specific items out of a Series.
cuteness = pd.Series({'cockroach': 1, 'pig': 3, 'kitten': 5})
# The comparison yields a True/False Series with the same labels ...
cuter_than_three = cuteness > 3
print(cuter_than_three)
# ... and indexing with that mask keeps only the entries marked True.
print(cuteness[cuter_than_three])

cockroach    False
pig          False
kitten        True
dtype: bool
kitten    5
dtype: int64


In [9]:
# A DataFrame has rows and columns; one way to create it is to pass a dictionary
# of equal-length lists (one list per column) to the DataFrame constructor.
data = {
    'year': [2010, 2011, 2012],
    'team': ['Bears'] * 3,
    'wins': [11, 8, 10],
    'losses': [5, 8, 6],
}
football = pd.DataFrame(data)
print(football)

   year   team  wins  losses
0  2010  Bears    11       5
1  2011  Bears     8       8
2  2012  Bears    10       6


In [10]:
# pandas has several helpers that show basic information about a DataFrame
# 1) dtypes: the data type of each column
print(football.dtypes)

year       int64
team      object
wins       int64
losses     int64
dtype: object


In [11]:
# 2) describe(): summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
print(football.describe())

         year       wins    losses
count     3.0   3.000000  3.000000
mean   2011.0   9.666667  6.333333
std       1.0   1.527525  1.527525
min    2010.0   8.000000  5.000000
25%    2010.5   9.000000  5.500000
50%    2011.0  10.000000  6.000000
75%    2011.5  10.500000  7.000000
max    2012.0  11.000000  8.000000


In [12]:
# 3) head() shows the first rows of the frame -- 5 unless told otherwise (spelled out below);
#    football only has 3 rows, so all of them are printed
# 4) tail() does the same for the last 5 rows
print(football.head(n=5))

   year   team  wins  losses
0  2010  Bears    11       5
1  2011  Bears     8       8
2  2012  Bears    10       6


In [13]:
# DataFrame columns: a single column can be read as if it were a dictionary key
# (football['year']) or as an attribute (football.year); both give the same result.
year_by_key = football['year']
year_by_attribute = football.year
print(year_by_key)
print('')
print(year_by_attribute)

0    2010
1    2011
2    2012
Name: year, dtype: int64

0    2010
1    2011
2    2012
Name: year, dtype: int64


In [14]:
# Indexing with a list of column names selects several columns at once.
selected_columns = ['year', 'team']
print(football[selected_columns])

   year   team
0  2010  Bears
1  2011  Bears
2  2012  Bears


In [15]:
# DataFrame rows: .loc[label] returns one row, addressed by its index label --
# either the default numeric label or a label you assigned yourself.
first_row = football.loc[0]
print(first_row)

year       2010
team      Bears
wins         11
losses        5
Name: 0, dtype: object


In [17]:
# To subset the DataFrame, build a True/False condition from a column
# and index the frame with it; only the rows marked True are kept.
at_least_ten_wins = football['wins'] >= 10
print(football[at_least_ten_wins])

   year   team  wins  losses
0  2010  Bears    11       5
2  2012  Bears    10       6


In [19]:
# Subset columns AND rows in one .loc[rows, column] lookup instead of chaining
# football['team'][...]: the Boolean condition on 'wins' says which rows to keep
# and 'team' is the column to show.
print(football.loc[football['wins'] >= 10, 'team'])

0    Bears
2    Bears
Name: team, dtype: object


In [24]:
#Indexing DataFrames: think of a DataFrame as a group of Series (columns) that share an Index (rows)
#important to remember:
#1) selecting a single column returns a Series
#2) selecting multiple columns returns a DataFrame
#3) rows can be selected in several ways: slicing, by index (.loc or .iloc), or Boolean indexing (combine conditions with & for "and", | for "or")
print (football.iloc[0]) #.iloc selects by integer position
print (football.loc[0]) #.loc selects by index label; football uses the default labels, so label 0 is also position 0

year       2010
team      Bears
wins         11
losses        5
Name: 0, dtype: object
year       2010
team      Bears
wins         11
losses        5
Name: 0, dtype: object


In [26]:
# Vectorized methods operate on a whole vector at once;
# every column of a DataFrame can serve as such a vector.
column_one = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
column_two = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
d = {'one': column_one, 'two': column_two}
# 'one' has no entry for label 'd', so that cell comes out as NaN in the frame
df = pd.DataFrame(d)
print(df)

   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4


In [28]:
# apply() example: the given function is run on EVERY column and the results are
# collected into one Series (one mean per column).
# Column 'one' averages to 2.0: its missing value in row d is left out of the mean.
column_means = df.apply(numpy.mean)
print(column_means)

one    2.0
two    2.5
dtype: float64


In [30]:
# Use .map on a single column to run a function over just that column's values.
# Each value in 'one' is checked against >= 1, giving True/False;
# the missing value in row d compares as False.
at_least_one = df['one'].map(lambda value: value >= 1)
print(at_least_one)

a     True
b     True
c     True
d    False
Name: one, dtype: bool


In [32]:
# Evaluate the condition on ALL values in the DataFrame, element by element.
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the old name is
# deprecated, so use the new method when this pandas has it and fall back otherwise.
elementwise = df.map if hasattr(df, 'map') else df.applymap
print(elementwise(lambda x: x >= 1))

     one   two
a   True  True
b   True  True
c   True  True
d  False  True


In [34]:
# Matrix multiplication and numpy.dot
# numpy.dot takes the dot product of two vectors of equal length
a = [1, 2, 3]
b = [4, 5, 6]
dot_product = numpy.dot(a, b)  # 1x4 + 2x5 + 3x6 = 4 + 10 + 18 = 32
print(dot_product)

32


In [37]:
# numpy.dot can also multiply a vector by a matrix
c = [1, 2]
print(c)
d = [[2, 4, 6], [3, 5, 7]]
print(d)
# Result = 1 x (first row of d) + 2 x (second row of d), worked out column by column:
# (1x2 + 2x3) = 2 + 6 = 8
# (1x4 + 2x5) = 4 + 10 = 14
# (1x6 + 2x7) = 6 + 14 = 20
vector_times_matrix = numpy.dot(c, d)
print(vector_times_matrix)

[1, 2]
[[2, 4, 6], [3, 5, 7]]
[ 8 14 20]


In [42]:
#from "Olympic Medals Points"

'''
Imagine a point system in which each country is awarded 4 points for each
gold medal,  2 points for each silver medal, and one point for each 
bronze medal.  

Using the numpy.dot function, create a new dataframe called 
'olympic_points_df' that includes:
    a) a column called 'country_name' with the country name
    b) a column called 'points' with the total number of points the country
       earned at the Sochi olympics.

You do not need to call the function in your code when running it in the
browser - the grader will do that automatically when you submit or test it.
'''

# Medal table inputs: the four lists are parallel, one entry per country.
countries = [
    'Russian Fed.', 'Norway', 'Canada', 'United States', 'Netherlands',
    'Germany', 'Switzerland', 'Belarus', 'Austria', 'France', 'Poland',
    'China', 'Korea', 'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
    'Finland', 'Great Britain', 'Ukraine', 'Slovakia', 'Italy', 'Latvia',
    'Australia', 'Croatia', 'Kazakhstan',
]

gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3,
        2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3,
          7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2,
          6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]

# Points awarded per medal, in the order gold, silver, bronze.
point_breakdown = [4, 2, 1]
# Stack the medal counts as three rows (one per medal type) so that a single dot
# product with the weights gives 4*gold + 2*silver + 1*bronze for every country.
medal_counts = [gold, silver, bronze]
points = numpy.dot(point_breakdown, medal_counts)
# Column name -> column values, then hand the dictionary to the DataFrame constructor.
olympics_data = {'country_name': countries, 'points': points}
olympic_points_df = pd.DataFrame(olympics_data)
print(olympic_points_df)

      country_name  points
0     Russian Fed.      83
1           Norway      64
2           Canada      65
3    United States      62
4      Netherlands      55
5          Germany      49
6      Switzerland      32
7          Belarus      21
8          Austria      37
9           France      31
10          Poland      19
11           China      22
12           Korea      20
13          Sweden      28
14  Czech Republic      18
15        Slovenia      16
16           Japan      15
17         Finland      11
18   Great Britain       8
19         Ukraine       5
20        Slovakia       4
21           Italy      10
22          Latvia       6
23       Australia       5
24         Croatia       2
25      Kazakhstan       1
