# NumPy: Broadcasting

In [2]:
import numpy as np

## Case 1

In [3]:
x = np.arange(3)
y = 5
print(x)
print(x.shape)
print(x+y)

[0 1 2]
(3,)
[5 6 7]


## Case2

In [4]:
x = np.random.randint(10, size =((3,3)))
y = np.random.randint(10, size = 3)
print("x array")
print(x)
print("y array")
print(y)

print("Their shapes are respectively")
print(x.shape)
print(y.shape)

x array
[[9 4 0]
 [8 6 1]
 [8 9 5]]
y array
[9 9 8]
Their shapes are respectively
(3, 3)
(3,)


In [5]:
x - y

array([[ 0, -5, -8],
       [-1, -3, -7],
       [-1,  0, -3]])

## Case 3

In [6]:
x = np.random.randint(10, size=(3,1))
y = np.random.randint(10, size = 3)
print("x  array")
print(x)
print("y array")
print(y)

print("Their shapes are respectively")
print(x.shape)
print(y.shape)

x  array
[[3]
 [2]
 [9]]
y array
[2 0 8]
Their shapes are respectively
(3, 1)
(3,)


In [7]:
x - y

array([[ 1,  3, -5],
       [ 0,  2, -6],
       [ 7,  9,  1]])

# NumPy: Fancy Indexing

In [8]:
simple_array = np.array([10,20,30,40,50,60])
simple_array

array([10, 20, 30, 40, 50, 60])

In [9]:
simple_array[3]

40

In [10]:
simple_array[[2,1]]

array([30, 20])

## Application: np.argsort()

Returns the indices that would sort an array. [More Info](https://docs.scipy.org/doc/numpy/reference/generated/numpy.argsort.html)



In [11]:
x = np.array(['c', 'a', 'b'])
np.argsort(x)

array([1, 2, 0], dtype=int64)

In [12]:
x[np.argsort(x)]

array(['a', 'b', 'c'], 
      dtype='<U1')

# Detour to Dictionary

In [13]:
my_dict = dict()
print(my_dict)

{}


In [14]:
my_dict['Jan'] = 1
my_dict['Feb'] = 2
my_dict['Mar'] = 3

my_dict

{'Feb': 2, 'Jan': 1, 'Mar': 3}

<div class="alert alert-block alert-warning">
<h5>Important Note!</h5>

<p>Due to the nature of the **dictionary** data structure, we can not make any assumptions about the order in which items appear in the dictionary.</p>

</div>

In [15]:
my_dict['Jan']

1

In [19]:
# You can update the dictionary through the key
my_dict['Jan'] = my_dict['Jan'] + 4

In [20]:
my_dict

{'Feb': 2, 'Jan': 5, 'Mar': 3}

In [21]:
# Accessing elements not in the dict
my_dict['Dec']

KeyError: 'Dec'

In [22]:
# To check if a key is in the list
'Dec' in my_dict

False

In [23]:
'Mar' in my_dict

True

## Activity

1. Accept a string as an input from the user
2. Create a dictionary that contains the frequency of each word in the string.
3. Print the dictionary

Below is the sample interaction

In [26]:
# Accept a setence
sent = input("Enter a sentence ")

# Convert the sentence to a list of words
words_list = sent.split()

# Create an empty dictionary that contains words and frequencies
word_freq_dict = dict()

# Iterate through every word in the list
for word in words_list:
    ###
    if word in my_dict:
        word_freq_dict[word]=word_freq_dict[word]+1
    else:
        word_freq_dict[word]=1
    ###DID NOT FINISH THIS PROBLEM!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

print(word_freq_dict)

Enter a sentence my name is my name is my name is sean


KeyError: 'my'

## You can access the keys and values seperately

In [27]:
print(my_dict.keys())
print(my_dict.values())

list(my_dict.keys())

dict_keys(['Jan', 'Feb', 'Mar', 'my', 'name', 'is', 'sean'])
dict_values([5, 2, 3, 3, 3, 3, 1])


['Jan', 'Feb', 'Mar', 'my', 'name', 'is', 'sean']

In [28]:
np.array(list(my_dict.keys()))

array(['Jan', 'Feb', 'Mar', 'my', 'name', 'is', 'sean'], 
      dtype='<U4')

# Introduction to Pandas

<div class="alert alert-block alert-info">
<p> Source: Example datasets and discussion in this Jupyter Notebook was partly sourced from `Mike Dunn`, University of Notre Dame.  </p>
</div>

In [29]:
import pandas as pd
pd.__version__

'0.20.1'

## `DataFrame` & `Series` Basics

### Basics on data loading

<div class="alert alert-block alert-info">
<h5>Know your current working directory</h5>

<p>`import os`</p>
<p>`os.getcwd()`</p>

</div>

In [30]:
import os
print(os.getcwd())

C:\Users\smcmaho3


<div class="alert alert-block alert-danger">
<h5>Make sure the data is in the right place. </h5>
<p> </p>
<li> Open the above folder location, that was printed as an output of `os.getcwd()` command, using Windows (or Finder on Mac) file system</li>
<li> Make sure there is a folder named 'data' in the location you opened. If not create a folder</li>
<li> Copy the dowloaded data into the newly created 'data' folder </li>

</div>

In [32]:
print(os.listdir('./data/'))

['nd-football-2017-roster.csv']


When you specify './data/' in the above and below Python command, '.' means the current directory. 

In the following statement, the interpretation is that in the current directory as this Jupyter file, open the 'data' folder and look for 'nd-football-2017-roster.csv' file. 

In [33]:
athletes_data = pd.read_csv('./data/nd-football-2017-roster.csv')
type(athletes_data)

pandas.core.frame.DataFrame

<div class="alert alert-block alert-info">
The `type` function return the type of the object passed to it. Very handy.
</div> 

### `DataFrames` are made up of an `index` and one or more `Series`
Inside of every frame is an **`index`** and one or more **`Series`** objects.
Let's demonstrate this by looking at the first few elements of our `athletes_data` object.

In [34]:
#DataFrame athletes_data.head() provides the first few rows of the dataset

athletes_data.head()

Unnamed: 0,Number,Name,Position,Height,Weight,Class,Hometown
0,2,Dexter Williams,RB,71,202,JR,"Winter Garden, FL"
1,3,C.J. Sanders,WR,68,185,JR,"Granada Hills, CA"
2,4,Te'von Coney,LB,73,235,JR,"Palm Beach Gardens, FL"
3,4,Montgomery VanGorder,QB,73,217,SR,"Buford, GA"
4,5,Nyles Morgan,LB,73,238,SR,"Crete, IL"


The bold numbers running down the left hand side are the **`index`** of the **`DataFrame`**.  The bold strings running across the top are the names of the nested **`DataSeries`** objects.

In [36]:
name_series = athletes_data['Name']
print(name_series)

0           Dexter Williams
1              C.J. Sanders
2              Te'von Coney
3      Montgomery VanGorder
4              Nyles Morgan
5     Equanimeous St. Brown
6              Nick Watkins
7           Brandon Wimbush
8              Daelin Hayes
9               Chris Finke
10                 Ian Book
11             Tyler Luatua
12          Devin Studstill
13              C.J. Holmes
14              Nolan Henry
15         Isaiah Robertson
16           Troy Pride Jr.
17              Justin Yoon
18           Shaun Crawford
19            Jalen Elliott
20              Asmar Bilal
21           Drue Tranquill
22               Mick Assaf
23             Nick Coleman
24           Brandon Garcia
25           Austin Webster
26             Ashton White
27              Julian Love
28           Nicco Fertitta
29               Sam Kohler
              ...          
60            Elijah Taylor
61             Logan Plantz
62                 Sam Bush
63            Ryan Kilander
64              Jimm

<div class="alert alert-block alert-info">
<h5>Dictionary Like-Retrieval</h5>
<p>Did you see how I passed to the name of the **`DataSeries`** object that I wanted to the `athletes_data` frame? It was the same sort of syntax you'd use to retrieve a data element from a **`dict`**.</p>
<p>
As we continue to move along, we'll discover that **`DataFrame`** and **`dict`** types share many traits.
</p>
</div> 

### Every `Series` is made up of an index and a NumPy array
Now that we know every **`DataFrame`** is filled with **`Series`** objects, let's inspect `name_series` dig deeper into the data structures.

In [37]:
# Let's ask for the string representation of the object.
# You can ignore the slice notation at the end,
# I just don't want to display all the names.
name_series[0:10]

0          Dexter Williams
1             C.J. Sanders
2             Te'von Coney
3     Montgomery VanGorder
4             Nyles Morgan
5    Equanimeous St. Brown
6             Nick Watkins
7          Brandon Wimbush
8             Daelin Hayes
9              Chris Finke
Name: Name, dtype: object

So, as you can see, we've got two columns here.  
* The first column is the **`index`**.
* The second column, which holds the values of the series is nothing more than our good friend, the NumPy array.

You can retrieve the index and NumPy array separately from a series as follows:

In [38]:
# Get the Series index
name_series.index

RangeIndex(start=0, stop=90, step=1)

In [39]:
name_series.values

array(['Dexter Williams', 'C.J. Sanders', "Te'von Coney",
       'Montgomery VanGorder', 'Nyles Morgan', 'Equanimeous St. Brown',
       'Nick Watkins', 'Brandon Wimbush', 'Daelin Hayes', 'Chris Finke',
       'Ian Book', 'Tyler Luatua', 'Devin Studstill', 'C.J. Holmes',
       'Nolan Henry', 'Isaiah Robertson', 'Troy Pride Jr.', 'Justin Yoon',
       'Shaun Crawford', 'Jalen Elliott', 'Asmar Bilal', 'Drue Tranquill',
       'Mick Assaf', 'Nick Coleman', 'Brandon Garcia', 'Austin Webster',
       'Ashton White', 'Julian Love', 'Nicco Fertitta', 'Sam Kohler',
       'Kevin Stepherson', 'D.J. Morgan', 'Josh Adams', 'Tony Jones Jr.',
       'Grant Hammann', 'Donte Vaughn', 'Robert Regan', 'Deon McIntosh',
       'Christopher Schilling', 'Kier Murphy', 'Brett Segobiano',
       'Temitope Agoro', 'Jimmy Thompson', 'Julian Okwara', 'Jeff Riney',
       'Brian Ball', 'Jamir Jones', 'Jonathan Jones', 'Matt Bushland',
       'Chris Bury', 'Greer Martini', 'Brandon Hutson', 'Devyn Spruell',
    

## Going a Bit Deeper
The essential difference between an NumPy **`ndarray`** and a Pandas **`Series`** object is their indexes.

**NumPy arrays have indexes as well, but they are implicit and always integers**. You can't access an array's **`index`** property directly like you can on a Pandas series object as we did above.

Furthermore, series objects are not limited to having integer based indexes. You could have indexes of strings, floats, booleans, dates, etc. 

In [40]:
# Create a `DataSeries` object from a dictionary
# This results in a string based index.

sample_dict = {'R':'Not as cool. :(',
                'Python': 'Best Language Ever!',
                'C': 'Fundamental language',
                'Julia': 'A New language for Data Science'}

simple_series = pd.Series(sample_dict)
simple_series.index

Index(['C', 'Julia', 'Python', 'R'], dtype='object')

<div class="alert alert-block alert-warning">
<p>Notice that even though `R` was listed as the first key of my `dict` object, it was put in as fourth element in the index?
</p>
<p>
When a series is first created, it will sort the values of the index. Hence `C` came first then `Julia`, `Python`,  and then `R`. This has implications that need to kept track of. 
</p>
</div> 

In [41]:
simple_series

C                    Fundamental language
Julia     A New language for Data Science
Python                Best Language Ever!
R                         Not as cool. :(
dtype: object

In [42]:
simple_series['Python']

'Best Language Ever!'

In [43]:
simple_series['C':'Python']

C                    Fundamental language
Julia     A New language for Data Science
Python                Best Language Ever!
dtype: object

The same holds true for the index of a **`DataFrame`** object. When we loaded our `college_scorecard` frame from the CSV file, it generated an integer based index, which is the default behavior.

But we could change that.  For instance, we could make the institution names the index values:

In [44]:
athletes_data.index = athletes_data['Name']

athletes_data.head()

Unnamed: 0_level_0,Number,Name,Position,Height,Weight,Class,Hometown
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Dexter Williams,2,Dexter Williams,RB,71,202,JR,"Winter Garden, FL"
C.J. Sanders,3,C.J. Sanders,WR,68,185,JR,"Granada Hills, CA"
Te'von Coney,4,Te'von Coney,LB,73,235,JR,"Palm Beach Gardens, FL"
Montgomery VanGorder,4,Montgomery VanGorder,QB,73,217,SR,"Buford, GA"
Nyles Morgan,5,Nyles Morgan,LB,73,238,SR,"Crete, IL"


# Data Indexing and Selection

You'll find that many of the same techniques that we used with NumPy arrays will also be available for these objects. In addition, they add some additional functionality that will be very familiar to anyone who has experience with Python dictionaries.

In [45]:
college_scorecard = pd.read_csv(
    './data/college-scorecard-data-scrubbed.csv')
college_scorecard.head()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xcd in position 8: invalid continuation byte

### Encoding

Text files are encoded in different formats when they are written. To read them, you must decode them with the same standard or you'll have a problem.

For example, our `college-scorecard-data-scrubbed.csv` file was encoded using `latin-1`, but the default setting for Pandas in Python 3 is `utf-8` so we will get an error if we try to read the file without specify the correct encoding like so:

In [46]:
college_scorecard = pd.read_csv(
    './data/college-scorecard-data-scrubbed.csv', 
    encoding='latin-1')
college_scorecard.head()

Unnamed: 0,UNITID,OPEID,OPEID6,institution_name,city,state,url,predominant_degree_code,predominant_degree_desc,institutional_owner_code,...,pell_grant_receipents,full_time_retention_rate_4_year,full_time_retention_rate_less_than_4_year,part_time_rentention_rate_4_year,part_time_rentention_rate_less_than_4_year,students_with_federal_loans,median_student_earnings,median_student_debt,less_than_4_year_school_completion_rate,4_year_school_completion_rate
0,102580,884300,8843,Alaska Bible College,Palmer,AK,www.akbible.edu/,3,Bachelors,2,...,0.3571,0.3333,,,,0.2857,,PrivacySuppressed,,
1,103501,2541000,25410,Alaska Career College,Anchorage,AK,www.alaskacareercollege.edu,1,Certificate,3,...,0.7078,,0.7941,,,0.786,28700.0,8994,0.707589494,
2,442523,4138600,41386,Alaska Christian College,Soldotna,AK,www.alaskacc.edu,1,Certificate,2,...,0.8868,,0.4737,,1.0,0.6792,,PrivacySuppressed,0.0,
3,102669,106100,1061,Alaska Pacific University,Anchorage,AK,www.alaskapacific.edu,3,Bachelors,2,...,0.3152,0.7742,,1.0,,0.5297,47000.0,23250,,0.514833663
4,102711,3160300,31603,AVTEC-Alaska's Institute of Technology,Seward,AK,www.avtec.edu/,1,Certificate,1,...,0.0737,,1.0,,1.0,0.0664,33500.0,PrivacySuppressed,0.846055789,


In [47]:
college_scorecard = pd.read_csv(
    './data/college-scorecard-data-scrubbed.csv', 
    encoding='latin-1', 
    index_col='institution_name')
college_scorecard.head()

Unnamed: 0_level_0,UNITID,OPEID,OPEID6,city,state,url,predominant_degree_code,predominant_degree_desc,institutional_owner_code,institutional_owner_desc,...,pell_grant_receipents,full_time_retention_rate_4_year,full_time_retention_rate_less_than_4_year,part_time_rentention_rate_4_year,part_time_rentention_rate_less_than_4_year,students_with_federal_loans,median_student_earnings,median_student_debt,less_than_4_year_school_completion_rate,4_year_school_completion_rate
institution_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alaska Bible College,102580,884300,8843,Palmer,AK,www.akbible.edu/,3,Bachelors,2,PrivateNonProfit,...,0.3571,0.3333,,,,0.2857,,PrivacySuppressed,,
Alaska Career College,103501,2541000,25410,Anchorage,AK,www.alaskacareercollege.edu,1,Certificate,3,PrivateForProfit,...,0.7078,,0.7941,,,0.786,28700.0,8994,0.707589494,
Alaska Christian College,442523,4138600,41386,Soldotna,AK,www.alaskacc.edu,1,Certificate,2,PrivateNonProfit,...,0.8868,,0.4737,,1.0,0.6792,,PrivacySuppressed,0.0,
Alaska Pacific University,102669,106100,1061,Anchorage,AK,www.alaskapacific.edu,3,Bachelors,2,PrivateNonProfit,...,0.3152,0.7742,,1.0,,0.5297,47000.0,23250,,0.514833663
AVTEC-Alaska's Institute of Technology,102711,3160300,31603,Seward,AK,www.avtec.edu/,1,Certificate,1,Public,...,0.0737,,1.0,,1.0,0.0664,33500.0,PrivacySuppressed,0.846055789,


## Selecting Data from `Series` Objects

Let's start by grabbing the `url` series object out of our data frame:

In [48]:
url_series = college_scorecard['url']
url_series.head()

institution_name
Alaska Bible College                                 www.akbible.edu/
Alaska Career College                     www.alaskacareercollege.edu
Alaska Christian College                             www.alaskacc.edu
Alaska Pacific University                       www.alaskapacific.edu
AVTEC-Alaska's Institute of Technology                 www.avtec.edu/
Name: url, dtype: object

As a reminder, a `Series` object is comprised of an explicit index and the values. **Notice here that our `Series` object inherit the 'institution_name' column values as the index from the `DataFrame`.**

### Dictionary Like Features

Several of the methods available on Python **`dict`** objects are also available on `Series` objects. The reason that this is possible is because Pandas maintains a mapping relationship between the explicit index elements and the Series values - just like standard Python does between the keys & values of a dictionary.  

#### Membership Testing with `in`
You can determine if a given **index** exists in a `Series` using the `in` keyword:


In [49]:
'University of Notre Dame' in url_series

True

#### Value Retrieval via Index "Key"
You can retrieve a value from the `Series` by passing it the index "key" you are interest in.b

In [50]:
url_series['University of Notre Dame']

'www.nd.edu'

### Array Like Features
Now we will explore some of the array like features of `Series` objects. Most of this will be familiar given what you already know about NumPy arrays, so we will move quickly.

#### Slicing with Explicit Indexes & Implicit Indexes
Slicing is pretty straight forward with NumPy arrays because of their implicit integer based indexes. It gets a little bit more complicated with `Series` objects because the explicit index isn't necessarily integer based.

Just like normal slice, you can specify two elements that you want to be the start/end of what is returned. The difference here is that you can specify the actual index element names/keys instead of numbers.

Here will we ask for all the listings from Stanford to Notre Dame.

In [51]:
url_series['Stanford University':'University of Notre Dame']

institution_name
Stanford University                                                                             www.stanford.edu/
Starr King School for the Ministry                                                                   www.sksm.edu
SUM Bible College and Theological Seminary                                                            www.sum.edu
Summit College                                                                              www.summitcollege.edu
Sutter Beauty College                                                                     sutterbeautycollege.com
Taft College                                                                                  www.taftcollege.edu
Thanh Le College School of Cosmetology                                                     WWW.thanhlecollege.com
The Academy of Radio and TV Broadcasting                                                         WWW.ARBRADIO.COM
The California Maritime Academy                                        

<div class="alert alert-block alert-info">
<p>
It is important to note that the reverse request, `url_series['University of Notre Dame': 'Stanford University']` would have yielded no results.
</p>
<p>
This is because 'University of Notre Dame' appears after 'Stanford' in the CSV file. Remember that technical, the first item in a slice notation is the 'start' and the second is the 'end'. It is important that you have them in the right order.
</p>
</div> 

<div class="alert alert-block alert-danger">
<h5>Warning: Important Distinction</h5>
<p>
In a NumPy array slice (or when using an implicit index), the 'end' value of the slice notation is not included in the return slice.
</p>
<p>
Strangely, when using a slice with an explicit index - the end value is included. Be careful about this as you could end up with an extra record in your slices that you don't want.
</p>
</div> 

##### The Implicit Index Lurking in the Shadows

While it is true that every `Series` object has an explicit index - it is also true that there is also an implicit index that is always available. Because of this, you can continue to use "normal" slice notations on `Series` objects with non-integer based explicit indexes.

Here are a couple of examples.

In [52]:
# Using "normal" slice notations on our `url_series`
# First ten elements
url_series[:10]

institution_name
Alaska Bible College                                 www.akbible.edu/
Alaska Career College                     www.alaskacareercollege.edu
Alaska Christian College                             www.alaskacc.edu
Alaska Pacific University                       www.alaskapacific.edu
AVTEC-Alaska's Institute of Technology                 www.avtec.edu/
Charter College-Anchorage                      www.chartercollege.edu
Ilisagvik College                                   www.ilisagvik.edu
University of Alaska Anchorage                     www.uaa.alaska.edu
University of Alaska Fairbanks                            www.uaf.edu
University of Alaska Southeast                     www.uas.alaska.edu
Name: url, dtype: object

<div class="alert alert-block alert-danger">
<h5>Important Warning! Implicit vs. Explicit indexing</h5>
<p>
A confusing situation arises when you have a `Series` with an explicit integer index that doesn't start with 0 and increment 1 for each element.
</p>

<p>
Slice notations get convoluted in this case and you have to use some ** special attributes (.loc, .iloc, .ix) that are discussed in your textbook on page 109-110** to keep things straight. 
</p>
</div> 

#### Series Masking
You can do masking on `Series` objects in the same way you did so with NumPy Arrays. Review the Jupyter Notebook for Sept 19th for more information on masking using NumPy

Here a couple of examples:

In [53]:
# Let's get a new Series object with numeric data on SAT average scores.
sat_average_series = college_scorecard['sat_average']

In [54]:
# Return schools with SAT averages over 1200
sat_average_series[sat_average_series > 1200]

institution_name
Auburn University                                          1215.0
University of Alabama in Huntsville                        1219.0
Hendrix College                                            1244.0
California Institute of Technology                         1545.0
California Polytechnic State University-San Luis Obispo    1234.0
Chapman University                                         1210.0
Claremont McKenna College                                  1419.0
Harvey Mudd College                                        1500.0
Loyola Marymount University                                1218.0
Occidental College                                         1300.0
Pepperdine University                                      1234.0
Pomona College                                             1454.0
Santa Clara University                                     1309.0
Scripps College                                            1360.0
Soka University of America                                 

## Activity:

1. What schools have averages between 1400 & 1500
1. Is University of Notre Dame one of the schools? 
1. How about 'Harvard University'? 


In [58]:
new=sat_average_series[(sat_average_series > 1400) & (sat_average_series < 1500)]
print(new)
'University of Notre Dame' in new
'Harvard University' in new

institution_name
Claremont McKenna College                      1419.0
Pomona College                                 1454.0
Stanford University                            1465.0
Yale University                                1493.0
Georgetown University                          1414.0
Northwestern University                        1461.0
University of Notre Dame                       1450.0
Amherst College                                1439.0
Franklin W Olin College of Engineering         1475.0
Northeastern University                        1420.0
Tufts University                               1423.0
Williams College                               1452.0
Johns Hopkins University                       1439.0
Carleton College                               1408.0
Washington University in St Louis              1478.0
Duke University                                1454.0
Dartmouth College                              1444.0
Princeton University                           1491.0
Columbia Un

False

## Selecting Data from `DataFrame` Objects

Similiarly to what we found with `Series` objects. You can interact with `DataFrame` objects in ways that sometimes resemble a dictionary and other times a NumPy array.

### Dictionary Like Features


In [None]:
# You can retrieve an individual Series from a DataFrame
# by passing the Series name/key to the DataFrame
college_scorecard['religious_affiliation_desc'][:10]

In [None]:
# Test for the existence of a given column/Series in a DataFrame
'city' in college_scorecard

<div class="alert alert-block alert-warning">
<p> Note the distiction with `in` operator on a `Series` and on a `DataFrame`. When you use it on a `Series` it checks if it is present in the index. Whereas for a `DataFrame`, it check if it is present in the columns
<div>

### Array Like Features

#### Slicing (Explicit Index)
Slicing affects affects rows, not columns in a `DataFrame`. In other words, you can slice based on the index values, but not the column values. Let's get a slice of all rows from 'Alaska Bible College' to 'Alabama State University':

In [None]:
college_scorecard['Alaska Bible College': 'Alabama State University']

<div class="alert alert-block alert-info">
<p>
You can however use the `iloc`, and `loc` methods to slice based on columns.  **You can look into this on pages 113-114 of your textbook if you are interested.**</p>
</div> 

#### Slicing (implicit index)
You can also rely on the implicit integer index of the `DataFrame` (yes, it has one too) to retrieve rows by the numeric index.

**Just remember, the 'end' value of the slice is not included when using the implicit index.**

In [None]:
college_scorecard[0:5]

#### Masking

Masking operations likewise return rows from a `DataFrame`, but the **criteria of the masks will be a comparison on one of the columns/Series**. This is somewhat confusing sounding, so let's just demonstrate:

In [None]:
# Return all rows where the 'state' Series has a value of 'AK'
college_scorecard[college_scorecard['state'] == 'AK']

In [None]:
# Which colleges in IN offer Bachelors degrees?
# Again, notice the parathesis here
college_scorecard[(college_scorecard['state'] == 'IN') & (college_scorecard['predominant_degree_desc'] == 'Bachelors')]

## Activity On Football Athletes Data

1. Details of the players who are in freshmen class?
1. Details of the players whose position is defensive linemen (DL) and are in their the senior class? 
1. Average height of players whose position is defensive linemen (DL) and are in their the senior class? 

In [5]:
import pandas as pd
athletes_data = pd.read_csv('./data/nd-football-2017-roster.csv', index_col=['Name'])
athletes_data.head()

Unnamed: 0_level_0,Number,Position,Height,Weight,Class,Hometown
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dexter Williams,2,RB,71,202,JR,"Winter Garden, FL"
C.J. Sanders,3,WR,68,185,JR,"Granada Hills, CA"
Te'von Coney,4,LB,73,235,JR,"Palm Beach Gardens, FL"
Montgomery VanGorder,4,QB,73,217,SR,"Buford, GA"
Nyles Morgan,5,LB,73,238,SR,"Crete, IL"


In [14]:
athletes_data[athletes_data['Class'] == 'FR']
#^ didn't get this one
athletes_data[(athletes_data['Position'] == 'DL')&(athletes_data['Class'] == 'SR')]
dl_sr=athletes_data[(athletes_data['Position'] == 'DL')&(athletes_data['Class'] == 'SR')]
dl_sr['Height'].mean()


74.5

## UFunc Arithmatic with Index Preservation

Let us convert the height of the players into meters. The math to convert from inches to meters is to multiply by 0.0254. 

In [6]:
athletes_data['Height']*0.0254

Name
Dexter Williams          1.8034
C.J. Sanders             1.7272
Te'von Coney             1.8542
Montgomery VanGorder     1.8542
Nyles Morgan             1.8542
Equanimeous St. Brown    1.9558
Nick Watkins             1.8542
Brandon Wimbush          1.8542
Daelin Hayes             1.9050
Chris Finke              1.7526
Ian Book                 1.8288
Tyler Luatua             1.9050
Devin Studstill          1.8288
C.J. Holmes              1.8288
Nolan Henry              1.8288
Isaiah Robertson         1.8542
Troy Pride Jr.           1.8034
Justin Yoon              1.7780
Shaun Crawford           1.7526
Jalen Elliott            1.8542
Asmar Bilal              1.8796
Drue Tranquill           1.8796
Mick Assaf               1.8034
Nick Coleman             1.8288
Brandon Garcia           1.7526
Austin Webster           1.8034
Ashton White             1.7780
Julian Love              1.8034
Nicco Fertitta           1.7526
Sam Kohler               1.8288
                          ...  
Eli


Do you see how my index was still preserved? This is referred to as **index preservation** and we will see it come into play both for `Series` and `DataFrame` objects when we using arithmetic functions on them.

## Binary Functions and `DataFrame` Objects
Now let's try performing binary UFunc operations on DataFrames.

#### Operations between 2 DataFrames

To demonstate how arithmetic operations work between two different `DataFrame` objects I'll need to construct a couple of simple objects.

I'll go ahead and create two imaginary objects that hold sales data over two different years for the burger joint: **In-N-Out**

In [15]:
# 2015 Sales DataFrame
sales_2015 = pd.DataFrame([
        {'Burgers': 9574265, 'Fries': 7124736, 'Drinks': 11563762},
        {'Burgers': 6574265, 'Fries': 5124736, 'Drinks': 13563762},
    ], 
    index=['California', 'Texas'])

# 2016 Sales DataFrame
# They open their first Indiana store at Notre Dame!!!
# And they sell Irish Shakes nationwide to celebrate.
sales_2016 = pd.DataFrame([
        {'Burgers': 9742652, 'Fries': 7354736, 'Drinks': 11133762, 'Irish Shakes': 75812},
        {'Burgers': 7774222, 'Fries': 6214736, 'Drinks': 14563762, 'Irish Shakes': 15525},
        {'Burgers': 74265, 'Fries': 54736, 'Drinks': 43762, 'Irish Shakes': 23612},
    ], 
    index=['California', 'Texas', 'Indiana'])


Here's what those `DataFrames` look like separately:

In [16]:
print(sales_2015, sales_2016, sep='\n\n\n')

            Burgers    Drinks    Fries
California  9574265  11563762  7124736
Texas       6574265  13563762  5124736


            Burgers    Drinks    Fries  Irish Shakes
California  9742652  11133762  7354736         75812
Texas       7774222  14563762  6214736         15525
Indiana       74265     43762    54736         23612


In [17]:
sales_2015 + sales_2016

Unnamed: 0,Burgers,Drinks,Fries,Irish Shakes
California,19316917.0,22697524.0,14479472.0,
Indiana,,,,
Texas,14348487.0,28127524.0,11339472.0,


To have a value in the results of an operation between two `DataFrame` objects, there must be a value in both of the objects for a given Index/Column combination.

This is why there is no data for Indiana in our results (that index only existed in 2016) and no results for Irish Shakes (that column only existed in 2016).

## Loading JSON Files
In terms of web APIs, JSON is the dominant data transmission format on the internet right now - so you'll need to be familar with how to load it into **`DataFrame`** objects as well.

There are a wide variety of ways that JSON documents can be structured. Unless you want to really start getting down into the,  there are really only a few formats that Pandas will read without problems.

For our purposes, we'll use a pretty basic file that conforms to one of the standard formats just to get our feet wet.

I've uploaded a JSON formatted file `pokedex.json` for us to use.  Hopefully, you are a Pokemon fan.

In [19]:
# We use the `orient` parameter to tell Pandas what the basic 
# structure of the JSON is.  The other options are:
# split, index, columns, and values
# More Info: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_json.html
pokedex = pd.read_json('./data/pokedex.json', orient='records')
pokedex.head()

Unnamed: 0,avg_spawns,candy,candy_count,egg,height,id,img,multipliers,name,next_evolution,num,prev_evolution,spawn_chance,spawn_time,type,weaknesses,weight
0,69.0,Bulbasaur Candy,25.0,2 km,0.71 m,1,http://www.serebii.net/pokemongo/pokemon/001.png,[1.58],Bulbasaur,"[{'num': '002', 'name': 'Ivysaur'}, {'num': '0...",1,,0.69,20:00,"[Grass, Poison]","[Fire, Ice, Flying, Psychic]",6.9 kg
1,4.2,Bulbasaur Candy,100.0,Not in Eggs,0.99 m,2,http://www.serebii.net/pokemongo/pokemon/002.png,"[1.2, 1.6]",Ivysaur,"[{'num': '003', 'name': 'Venusaur'}]",2,"[{'num': '001', 'name': 'Bulbasaur'}]",0.042,07:00,"[Grass, Poison]","[Fire, Ice, Flying, Psychic]",13.0 kg
2,1.7,Bulbasaur Candy,,Not in Eggs,2.01 m,3,http://www.serebii.net/pokemongo/pokemon/003.png,,Venusaur,,3,"[{'num': '001', 'name': 'Bulbasaur'}, {'num': ...",0.017,11:30,"[Grass, Poison]","[Fire, Ice, Flying, Psychic]",100.0 kg
3,25.3,Charmander Candy,25.0,2 km,0.61 m,4,http://www.serebii.net/pokemongo/pokemon/004.png,[1.65],Charmander,"[{'num': '005', 'name': 'Charmeleon'}, {'num':...",4,,0.253,08:45,[Fire],"[Water, Ground, Rock]",8.5 kg
4,1.2,Charmander Candy,100.0,Not in Eggs,1.09 m,5,http://www.serebii.net/pokemongo/pokemon/005.png,[1.79],Charmeleon,"[{'num': '006', 'name': 'Charizard'}]",5,"[{'num': '004', 'name': 'Charmander'}]",0.012,19:00,[Fire],"[Water, Ground, Rock]",19.0 kg
