Dataframes

In [10]:
# Two dimensional series object, the dataframe is a two axes-labelled array

In [11]:
import pandas as pd
record1 = pd.Series({'Name': 'Alice','Class':'Math', 'Score': 88})
record2 = pd.Series({'Name': 'Ben','Class':'Python for Data Science specialization', 'Score': 89})
record3 = pd.Series({'Name': 'Jon','Class':'Please god let me type consistently at 80wpm', 'Score': 100})


In [12]:
df = pd.DataFrame([record1, record2, record3], index = ['school1','school2','school1'])
df.head()

Unnamed: 0,Name,Class,Score
school1,Alice,Math,88
school2,Ben,Python for Data Science specialization,89
school1,Jon,Please god let me type consistently at 80wpm,100


In [13]:
# HTML!!!!
# We can also pass a list of dictionaries and index them in a similar way, and we get the same result.
# Similarly as to with Series, we can index dataframes using .loc and .iloc

In [14]:
df.loc['school1']

Unnamed: 0,Name,Class,Score
school1,Alice,Math,88
school1,Jon,Please god let me type consistently at 80wpm,100


In [15]:
# Using loc where there is more than one entry for a name will return a new DataFrame where the entries are only those selected in the .loc query.

In [16]:
type(df.loc['school1'])

pandas.core.frame.DataFrame

In [17]:
type(df.loc['school2'])

pandas.core.series.Series

In [20]:
# As we can see, these two objects have different types, solely because there was more than one entry for school1.
# If we only want to see the Name for example, we can sort by those with the following query.
df.loc['school1','Name']
# Here, the first entry is still the index being sorted, and all the following entries will specify what columns of data we want to see that satisfy the first entry, here , that is only the names of everyone in school1.

school1    Alice
school1      Jon
Name: Name, dtype: object

In [21]:
# If we wanted to ignore what school they went to, and only wanted to see the names, we could transpose the DataFrame (Thanks Dr. Sozer!!).
df.T

Unnamed: 0,school1,school2,school1.1
Name,Alice,Ben,Jon
Class,Math,Python for Data Science specialization,Please god let me type consistently at 80wpm
Score,88,89,100


In [22]:
# We can query the transpose as follows.
df.T.loc['Name']

school1    Alice
school2      Ben
school1      Jon
Name: Name, dtype: object

In [23]:
# In pandas, columns always have a name, and thus, querying df['Name'] and df.T.loc['Name'] will yield the same result.
df['Name']

school1    Alice
school2      Ben
school1      Jon
Name: Name, dtype: object

In [25]:
type(df['Name'])
# The result of a column projection is always a Series object. Due to this, we are able to chain operations together in a very nice way. For instance, we can select all of the wors with are related to school using .loc, and then project the name column from just one of those rows.
df.loc['school1']['Name']

school1    Alice
school1      Jon
Name: Name, dtype: object

In [28]:
type(df.loc['school1']['Name'])
# This would be a DataFrame if we hadnt chosen to further sort by just the names, which ended up turning it into a series.
# Chaining, by indexing on the return type of another index, can come with some costsa and is best avoided if we can use another approach. 
# This is because pandas returns a copy of the DataFrame instead of a view of the DataFrame.
# The .loc attribute also accepts slicing, so we are able to use the colon (:) in order to select multiple columns, as we would in a list.
df.loc[:,['Name', 'Score']]
# Here, the first entry means that we want to select all of the rows, and the second one lets us select which columns of these rows we want to access.

Unnamed: 0,Name,Score
school1,Alice,88
school2,Ben,89
school1,Jon,100


In [29]:
# We can delete data using drop function.
# The drop function doesnt change the DataFrame in place, what it does is create a copy, but this can be changed in the parameters of the drop function.
copy_df = df.copy()
copy_df.drop('Name', inplace = True, axis = 1)
copy_df


Unnamed: 0,Class,Score
school1,Math,88
school2,Python for Data Science specialization,89
school1,Please god let me type consistently at 80wpm,100


In [31]:
# Since we made inplace True, we dropped the Name permanently.
# Also, we can add new columns to the DataFrame by simply assigning them some value using the indexing operator.
df["Soccer Team"]= "Arsenal"
df

Unnamed: 0,Name,Class,Score,Soccer Team
school1,Alice,Math,88,Arsenal
school2,Ben,Python for Data Science specialization,89,Arsenal
school1,Jon,Please god let me type consistently at 80wpm,100,Arsenal


DataFrame Indexing and Loading

In [32]:
# In Jupyter Notebooks, we can use the shell, to call orders to the shell we have to use the ! sign before our line of code.
! cat datasets/Admission_Predict.csv

Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR ,CGPA,Research,Chance of Admit 
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4,4.5,8.87,1,0.76
3,316,104,3,3,3.5,8,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2,3,8.21,0,0.65
6,330,115,5,4.5,3,9.34,1,0.9
7,321,109,3,3,4,8.2,1,0.75
8,308,101,2,3,4,7.9,0,0.68
9,302,102,1,2,1.5,8,0,0.5
10,323,108,3,3.5,3,8.6,0,0.45
11,325,106,3,3.5,4,8.4,1,0.52
12,327,111,4,4,4.5,9,1,0.84
13,328,112,4,4,4.5,9.1,1,0.78
14,307,109,3,4,3,8,1,0.62
15,311,104,3,3.5,2,8.2,1,0.61
16,314,105,3,3.5,2.5,8.3,0,0.54
17,317,107,3,4,3,8.7,0,0.66
18,319,106,3,4,3,8,1,0.65
19,318,110,3,4,3,8.8,0,0.63
20,303,102,3,3.5,3,8.5,0,0.62
21,312,107,3,3,2,7.9,1,0.64
22,325,114,4,3,2,8.4,0,0.7
23,328,116,5,5,5,9.5,1,0.94
24,334,119,5,5,4.5,9.7,1,0.95
25,336,119,5,4,3.5,9.8,1,0.97
26,340,120,5,4.5,4.5,9.6,1,0.94
27,322,109,5,4.5,3.5,8.8,0,0.76
28,298,98,2,1.5,2.5,7.5,1,0.44
29,295,93,1,2,2,7.2,0,0.46
30,310,99,2,1.5,2,7.3,0,0.54
31,300,97,2,3,3,8.1,1,0.65
32,327,103,3,

In [40]:
'''import csv

with open("datasets/Admission_Predict.csv",'r') as file:
    reader = csv.DictReader(file)
    for row in reader:
        print(row)
        '''

'import csv\n\nwith open("datasets/Admission_Predict.csv",\'r\') as file:\n    reader = csv.DictReader(file)\n    for row in reader:\n        print(row)\n        '

In [41]:
import pandas as pd

# We can turn csv files into DataFrames very easily through the read_csv() function

df = pd.read_csv('datasets/Admission_Predict.csv')

df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [44]:
# Since idk what SOP and LOR are, im going to rename them using the rename() function, which takes a parameter names columns, adn we need to pass into a dictionary which the has the old names as the keys and the values as the new names.
new_df = df.rename(columns={"GRE Score" :"GRE Score", "TOEFL Score": "TOEFL Score", "University Rating": "University Rating", 'SOP':'Statement of Purpose',"LOR": 'Letter of Reccomendation', "CGPA":"CGPA",'Research':"Research", "Chance of Admit":"Chance of Admit"})
new_df

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,Statement of Purpose,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.00,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.80
4,5,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...,...
395,396,324,110,3,3.5,3.5,9.04,1,0.82
396,397,325,107,3,3.0,3.5,9.11,1,0.84
397,398,330,116,4,5.0,4.5,9.45,1,0.91
398,399,312,103,3,3.5,4.0,8.78,0,0.67


In [48]:
new_df.columns
# Here, we can see that we didnt change the name of LOR because in reality, the name is followed by a space, when we only changed LOR, with no space.
# No problem though, we can do quite a few things.
# We can use strip() in order to strip all of the names and make sure that this doesn't happen to us again.
new_df = new_df.rename(columns= {"LOR":"Letter of Reccomendation"})
new_df = new_df.rename(mapper=str.strip, axis='columns')
new_df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,Statement of Purpose,Letter of Reccomendation,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [49]:
# Here, we just made a copy of the original DataFrame.
df.columns

Index(['Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating', 'SOP',
       'LOR ', 'CGPA', 'Research', 'Chance of Admit '],
      dtype='object')

In [51]:
# We can also use the columns attribute by assigning it to a list of column names which will directly rename the columns. This will directly modify the original DataFrame.
# With this, we can also change a few columns out of a dataset that has a lot, which makes our work a lot easier.
cols = list(df.columns)
cols = [x.lower().strip() for x in cols]
df.columns = cols
df
# Here we used list comprehension in order to manipulate and edit all of the columns in our DataFrame.

Unnamed: 0,serial no.,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.00,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.80
4,5,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...,...
395,396,324,110,3,3.5,3.5,9.04,1,0.82
396,397,325,107,3,3.0,3.5,9.11,1,0.84
397,398,330,116,4,5.0,4.5,9.45,1,0.91
398,399,312,103,3,3.5,4.0,8.78,0,0.67


Querying DataFrames

In [52]:
# For this, it is important to use Boolean Masking.
# A Boolean Mask is a np.array which can be one dimensional like a series, or two dimensional lieke a DataFrame , where each of the values in the array is either True or False.
import pandas as pd
df = pd.read_csv("datasets/Admission_Predict.csv",index_col = 0)
# Now we can clean up the names of the columns in a nice way.
df.columns = [x.lower().strip() for x in df.columns]
df.head()

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [58]:
admit_mask = df["chance of admit"]>0.9
admit_mask
# Learned about this in Py for Data Analysis!!
# Now we can use this to create a new DataFrame in which we only keep the students which have a chance of admission of higher than 90%.
df.where(admit_mask).head() 
# This makes every row where admit_mask is False and turns it into NaN, we can get rid of these with the following.
df.where(admit_mask).dropna().head()

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337.0,118.0,4.0,4.5,4.5,9.65,1.0,0.92
23,328.0,116.0,5.0,5.0,5.0,9.5,1.0,0.94
24,334.0,119.0,5.0,5.0,4.5,9.7,1.0,0.95
25,336.0,119.0,5.0,4.0,3.5,9.8,1.0,0.97
26,340.0,120.0,5.0,4.5,4.5,9.6,1.0,0.94


In [59]:
# There is an easier way to do this though, as the last way was pretty wordy and youd expect that if youre boolean masking, youre doing it with the intention of dropping the NaN values automatically.
df[df['chance of admit'] >0.9].head()
# What this is essentially doing is indexing df by only the columns in the series that returned True, which makes more sense now than it did originally when reading the book.

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
23,328,116,5,5.0,5.0,9.5,1,0.94
24,334,119,5,5.0,4.5,9.7,1,0.95
25,336,119,5,4.0,3.5,9.8,1,0.97
26,340,120,5,4.5,4.5,9.6,1,0.94


In [61]:
# We can also combine multiple Boolean Masks, AND has been turned to the ampersand symbol (&), and OR has been turned into the pipe (|).
(df['chance of admit'] > 0.7) & (df['chance of admit']<0.95)
# Always remember to use the parenthesis around the objects you want to combine.


Serial No.
1       True
2       True
3       True
4       True
5      False
       ...  
396     True
397     True
398     True
399    False
400    False
Name: chance of admit, Length: 400, dtype: bool

In [63]:
df[df['chance of admit'].gt(0.7).lt(0.9)].head()
# This is gonna be really big for cleaning data when using pandas and DataFrames.

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5,314,103,2,2.0,3.0,8.21,0,0.65
8,308,101,2,3.0,4.0,7.9,0,0.68
9,302,102,1,2.0,1.5,8.0,0,0.5
10,323,108,3,3.5,3.0,8.6,0,0.45
11,325,106,3,3.5,4.0,8.4,1,0.52


Indexing DataFrames

In [64]:
# In order to set an index for a column, we can use the set_index() function, but it is important to know and remember that this function is destructive, meaning that it doesnt conserve the old index and it will be lost.
import pandas as pd
df = pd.read_csv('datasets/Admission_Predict.csv', index_col = 0)
df.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [66]:
df['Serial Number']= df.index
df = df.set_index('Chance of Admit ')
df.head()
# Here, we are replacing the Serial Number as the index, and we can now index by the chance of admit.

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Serial Number
Chance of Admit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.92,337,118,4,4.5,4.5,9.65,1,1
0.76,324,107,4,4.0,4.5,8.87,1,2
0.72,316,104,3,3.0,3.5,8.0,1,3
0.8,322,110,3,3.5,2.5,8.67,1,4
0.65,314,103,2,2.0,3.0,8.21,0,5


In [67]:
df = df.reset_index()
df.head()

Unnamed: 0,Chance of Admit,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Serial Number
0,0.92,337,118,4,4.5,4.5,9.65,1,1
1,0.76,324,107,4,4.0,4.5,8.87,1,2
2,0.72,316,104,3,3.0,3.5,8.0,1,3
3,0.8,322,110,3,3.5,2.5,8.67,1,4
4,0.65,314,103,2,2.0,3.0,8.21,0,5


In [68]:
# We can also multi-level index, which happens when we use set_index() and pass it a list of indexes that we want to promote to an index.
# This is very similar to the composite keys in relational database systems.
df = pd.read_csv('datasets/census.csv')
df.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
0,40,3,6,1,0,Alabama,Alabama,4779736,4780127,4785161,...,0.002295,-0.193196,0.381066,0.582002,-0.467369,1.030015,0.826644,1.383282,1.724718,0.712594
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,...,14.83296,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861


In [69]:
# In this data set there are two summarixed levels, one that contains summary data for a county within a state, and one that contains summary data for each state. I want to see a list of all th euique values in a given volumn. In this DataFrame, we see that the posible values for the sum level are using the unique function on the DataFrame.
# We can see that the SUMLEV for states is 40 and the SUMLEV for counties is 50.
df['SUMLEV'].unique()

array([40, 50])

In [71]:
df = df[df['SUMLEV'] == 50]
df

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.592270,-2.187333
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,...,14.832960,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,...,-4.728132,-2.500690,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861
5,50,3,6,1,9,Alabama,Blount County,57322,57322,57373,...,1.807375,-1.177622,-1.748766,-2.062535,-1.369970,1.859511,-0.848580,-1.402476,-1.577232,-0.884411
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3188,50,4,8,56,37,Wyoming,Sweetwater County,43806,43806,43593,...,1.072643,16.243199,-5.339774,-14.252889,-14.248864,1.255221,16.243199,-5.295460,-14.075283,-14.070195
3189,50,4,8,56,39,Wyoming,Teton County,21294,21294,21297,...,-1.589565,0.972695,19.525929,14.143021,-0.564849,0.654527,2.408578,21.160658,16.308671,1.520747
3190,50,4,8,56,41,Wyoming,Uinta County,21118,21118,21102,...,-17.755986,-4.916350,-6.902954,-14.215862,-12.127022,-18.136812,-5.536861,-7.521840,-14.740608,-12.606351
3191,50,4,8,56,43,Wyoming,Washakie County,8533,8533,8545,...,-11.637475,-0.827815,-2.013502,-17.781491,1.682288,-11.990126,-1.182592,-2.250385,-18.020168,1.441961


In [None]:
# If we only care about the total population estimates and the total number of births, we can create a list of the columns we want to keep and then project those.
columns_to_keep = ['STNAME','CTYNAME','BIRTHS2010','BIRTHS2011','BIRTHS2012','BIRTHS2013','BIRTHS2014','BIRTHS2015','POPESTIMATE2010','POPESTIMATE2011','POPESTIMATE2012','POPESTIMATE2013','POPESTIMATE2014','POPESTIMATE2015']
df = df[columns_to_keep]
df.head()

Unnamed: 0,STNAME,CTYNAME,BIRTHS2010,BIRTHS2011,BIRTHS2012,BIRTHS2013,BIRTHS2014,BIRTHS2015,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015
1,Alabama,Autauga County,151,636,615,574,623,600,54660,55253,55175,55038,55290,55347
2,Alabama,Baldwin County,517,2187,2092,2160,2186,2240,183193,186659,190396,195126,199713,203709
3,Alabama,Barbour County,70,335,300,283,260,269,27341,27226,27159,26973,26815,26489
4,Alabama,Bibb County,44,266,245,259,247,253,22861,22733,22642,22512,22549,22583
5,Alabama,Blount County,183,744,710,646,618,603,57373,57711,57776,57734,57658,57673


In [79]:
# How can we query this DataFrame?
# We can use a multi-index. To do this, we can use a loc query, with our parameters in the order of importance, for example, if I was trying to query for Burlington, Ontario, i would use df.loc["Ontario","Burlington"]
df = df.set_index(["STNAME","CTYNAME"])
df.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,BIRTHS2010,BIRTHS2011,BIRTHS2012,BIRTHS2013,BIRTHS2014,BIRTHS2015,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015
STNAME,CTYNAME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Alabama,Autauga County,151,636,615,574,623,600,54660,55253,55175,55038,55290,55347
Alabama,Baldwin County,517,2187,2092,2160,2186,2240,183193,186659,190396,195126,199713,203709
Alabama,Barbour County,70,335,300,283,260,269,27341,27226,27159,26973,26815,26489
Alabama,Bibb County,44,266,245,259,247,253,22861,22733,22642,22512,22549,22583
Alabama,Blount County,183,744,710,646,618,603,57373,57711,57776,57734,57658,57673


In [80]:
# Now we can query for the info about Washtenaw County, Michigan, can should use:
df.loc["Michigan","Washtenaw County"]


BIRTHS2010            977
BIRTHS2011           3826
BIRTHS2012           3780
BIRTHS2013           3662
BIRTHS2014           3683
BIRTHS2015           3709
POPESTIMATE2010    345563
POPESTIMATE2011    349048
POPESTIMATE2012    351213
POPESTIMATE2013    354289
POPESTIMATE2014    357029
POPESTIMATE2015    358880
Name: (Michigan, Washtenaw County), dtype: int64

In [81]:
type(df.loc["Michigan","Washtenaw County"])

pandas.core.series.Series

In [83]:
# If we wanted to compare data about counties, we would use tuples, which pandas will unpack.
df.loc[[('Michigan', "Washtenaw County"),("Michigan","Wayne County")]]

Unnamed: 0_level_0,Unnamed: 1_level_0,BIRTHS2010,BIRTHS2011,BIRTHS2012,BIRTHS2013,BIRTHS2014,BIRTHS2015,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015
STNAME,CTYNAME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Michigan,Washtenaw County,977,3826,3780,3662,3683,3709,345563,349048,351213,354289,357029,358880
Michigan,Wayne County,5918,23819,23270,23377,23607,23586,1815199,1801273,1792514,1775713,1766008,1759335
