## Extracting and transforming data 

In [1]:
import pandas as pd
# sclicing in DataFrame is a view not a copy, but if need new line, the original are not changed
df2 = pd.DataFrame([['1980', 'Blondie', 'Call Me', '6'],
       ['1981', 'Chistorpher Cross', 'Arthurs Theme', '3'],
       ['1982', 'Joan Jett', 'I Love Rock and Roll', '7']])
df2.columns = ['year', 'artist', 'song', 'chart weeks']
# df3 = df2.ix[:2, 'artist':'chart weeks'].copy()
df3 = df2.ix[:2, 'artist':'chart weeks']
df3['chart weeks'] = df3['chart weeks'] + '2'
df3['test'] = 10
print df3
print df2

              artist                  song chart weeks  test
0            Blondie               Call Me          62    10
1  Chistorpher Cross         Arthurs Theme          32    10
2          Joan Jett  I Love Rock and Roll          72    10
   year             artist                  song chart weeks
0  1980            Blondie               Call Me          62
1  1981  Chistorpher Cross         Arthurs Theme          32
2  1982          Joan Jett  I Love Rock and Roll          72


In [3]:
# slicing in Series returns a view not a copy
a = pd.Series([0,2,3,4,5])
a[1:4] = 9
a

0    0
1    9
2    9
3    9
4    5
dtype: int64

In [None]:
# Create the boolean array: high_turnout
high_turnout = election['turnout'] > 70

# Filter the election DataFrame with the high_turnout array: high_turnout_df
high_turnout_df = election[high_turnout]

# Print the high_turnout_results DataFrame
print(high_turnout_df)

# Import numpy
import numpy as np

# Create the boolean array: too_close
too_close = election['margin'] < 1

# Assign np.nan to the 'winner' column where the results were too close to call
election['winner'][too_close] = np.nan

# Print the output of election.info()
print(election.info())

# Select the 'age' and 'cabin' columns: df
df = titanic[['age', 'cabin']]

# Print the shape of df
print(df.shape)

# Drop rows in df with how='any' and print the shape
print(df.dropna(how='any').shape)

# Drop rows in df with how='all' and print the shape
print(df.dropna(how='all').shape)

# Call .dropna() with thresh=1000 and axis='columns' and print the output of .info() from titanic
print(titanic.dropna(thresh=1000, axis='columns').info())

In [None]:
# Write a function to convert degrees Fahrenheit to degrees Celsius: to_celsius
def to_celsius(F):
    return 5/9*(F - 32)

# Apply the function over 'Mean TemperatureF' and 'Mean Dew PointF': df_celsius
df_celsius = weather[['Mean TemperatureF','Mean Dew PointF']].apply(to_celsius)

# Reassign the columns df_celsius
df_celsius.columns = ['Mean TemperatureC', 'Mean Dew PointC']

# Print the output of df_celsius.head()
print(df_celsius.head())

# The .map() method is used to transform values according to a Python dictionary look-up.
# Create the dictionary: red_vs_blue
red_vs_blue = {'Obama':'blue' , 'Romney':'red'}

# Use the dictionary to map the 'winner' column to the new column: election['color']
election['color'] = election['winner'].map(red_vs_blue)

# Print the output of election.head()
print(election.head())



# When performance is paramount, you should avoid using .apply() and .map() 
# because those constructs perform Python for-loops over the data stored in a pandas Series or DataFrame. 
# By using vectorized functions instead, you can loop over the data at the same speed as compiled code (C, Fortran, etc.)! 
# NumPy, SciPy and pandas come with a variety of vectorized functions (called Universal Functions or UFuncs in NumPy).
# Import zscore from scipy.stats
from scipy.stats import zscore

# Call zscore with election['turnout'] as input: turnout_zscore
turnout_zscore = zscore(election['turnout'])

# Print the type of turnout_zscore
print(type(turnout_zscore))

# Assign turnout_zscore to a new column: election['turnout_zscore']
election['turnout_zscore'] = turnout_zscore

# Print the output of election.head()
print(election.head())



## Advanced indexing

In [44]:
# indexes are immutable objects. This means that if you want to change or modify the index in a dataframe, 
# then you need to change the whole index.

# In [1]: sales
# Out[1]: 
#              eggs  salt  spam
# state month                  
# CA    1        47  12.0    17
#       2       110  50.0    31
# NY    1       221  89.0    72
#       2        77  87.0    20
# TX    1       132   NaN    52
#       2       205  60.0    55

# Create the list of new indexes: new_idx
new_idx = [month.upper() for month in sales.index]

# Assign new_idx to sales.index
sales.index = new_idx

# Print the sales DataFrame
print(sales)


# Assign the string 'MONTHS' to sales.index.name
sales.index.name = 'MONTHS'

# Print the sales DataFrame
print(sales)

# Assign the string 'PRODUCTS' to sales.columns.name 
sales.columns.name = 'PRODUCTS'

# Print the sales dataframe again
print(sales)

# You can also build the DataFrame and index independently, and then put them together. 
# If you take this route, be careful, 
# as any mistakes in generating the DataFrame or the index can cause the data and the index to be aligned incorrectly.
# Generate the list of months: months
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun']

# Assign months to sales.index
sales.index = months

# Print the modified sales DataFrame
print(sales)

# Extracting elements from the outermost level of a MultiIndex is just like in the case of a single-level Index
# Print sales.loc[['CA', 'TX']]
print(sales.loc[['CA', 'TX']])

# Print sales['CA':'TX']
# this selects rows with slicing. different from choosing columns with bracket
print(sales['CA':'TX'])

# With a MultiIndex, you should always ensure the index is sorted. 
# You can skip this only if you know the data is already sorted on the index fields.
# Set the index to be the columns ['state', 'month']: sales
sales = sales.set_index(['state', 'month'])

# Sort the MultiIndex: sales
sales = sales.sort_index()

# Print the sales DataFrame
print(sales)

# Set the index to the column 'state': sales
sales = sales.set_index(['state'])

# Print the sales DataFrame
print(sales)

# Access the data from 'NY'
print(sales.loc['NY'])


# Looking up indexed data is fast and efficient. And you have already seen that lookups based on the outermost 
# level of a MultiIndex work just like lookups on DataFrames that have a single-level Index.
# Looking up data based on inner levels of a MultiIndex can be a bit trickier.
# you need to use slice(None) in the slicing parameter for the outermost dimension(s) instead of the usual :, 
# or use pd.IndexSlice. 
# http://pandas.pydata.org/pandas-docs/stable/advanced.html
# Look up data for NY in month 1: NY_month1
NY_month1 = sales.loc[('NY', 1)]

# Look up data for CA and TX in month 2: CA_TX_month2
CA_TX_month2 = sales.loc[(['CA', 'TX'], 2),:]

# Look up data for all states in month 2: all_month2
all_month2 = sales.loc[(slice(None), 2),:]


     city weekday visitors signups
0  Austin     Mon      326       3
1  Austin     Sun      139       7
2  Dallas     Mon      456       5
3  Dallas     Sun      237      12


AttributeError: 'numpy.int64' object has no attribute 'upper'

## Rearranging and reshaping data 

### Pivot - spread rows into columns

In [56]:
# In [2]: users
# Out[2]: 
#   weekday    city  visitors  signups
# 0     Sun  Austin       139        7
# 1     Sun  Dallas       237       12
# 2     Mon  Austin       326        3
# 3     Mon  Dallas       456        5

#prepare the sales dataframe
import numpy as np
users_values = np.array([['Austin', 'Mon', 326, 3],
       ['Austin', 'Sun', 139, 7],
       ['Dallas', 'Mon', 456, 5],
       ['Dallas', 'Sun', 237, 12]], dtype=object)
users = pd.DataFrame(sales_values)
users.columns = ['city', 'weekday', 'visitors', 'signups']
print users

# Pivot the users DataFrame: visitors_pivot
visitors_pivot = users.pivot(index='weekday', columns='city', values='visitors')

# Print the pivoted DataFrame
print(visitors_pivot)

# Pivot users with signups indexed by weekday and city: signups_pivot
signups_pivot = users.pivot(index='weekday', columns='city', values='signups')

# Print signups_pivot
print(signups_pivot)

# Pivot users pivoted by both signups and visitors: pivot
pivot = users.pivot(index='weekday', columns='city')

# Print the pivoted DataFrame
print(pivot)

     city weekday visitors signups
0  Austin     Mon      326       3
1  Austin     Sun      139       7
2  Dallas     Mon      456       5
3  Dallas     Sun      237      12
city    Austin Dallas
weekday              
Mon        326    456
Sun        139    237
city    Austin Dallas
weekday              
Mon          3      5
Sun          7     12
        visitors        signups       
city      Austin Dallas  Austin Dallas
weekday                               
Mon          326    456       3      5
Sun          139    237       7     12


### Stack & Unstack

In [57]:
# for multilevel index, the above pivot method won't work
# so introduce stack and unstack methods.
users = users.set_index(['city', 'weekday'])
# Unstack users by 'weekday': byweekday
byweekday = users.unstack(level='weekday')

# Print the byweekday DataFrame
print(byweekday)

# Stack byweekday by 'weekday' and print it
print(byweekday.stack(level='weekday'))

# Unstack users by 'city': bycity
bycity = users.unstack(level='city')

# Print the bycity DataFrame
print(bycity)

# Stack bycity by 'city' and print it
print(bycity.stack(level='city'))

        visitors      signups    
weekday      Mon  Sun     Mon Sun
city                             
Austin       326  139       3   7
Dallas       456  237       5  12
               visitors signups
city   weekday                 
Austin Mon          326       3
       Sun          139       7
Dallas Mon          456       5
       Sun          237      12
        visitors        signups       
city      Austin Dallas  Austin Dallas
weekday                               
Mon          326    456       3      5
Sun          139    237       7     12
               visitors signups
weekday city                   
Mon     Austin      326       3
        Dallas      456       5
Sun     Austin      139       7
        Dallas      237      12


### Swap level

In [50]:
# Stack 'city' back into the index of bycity: newusers
newusers = bycity.stack(level='city')

# Swap the levels of the index of newusers: newusers
newusers = newusers.swaplevel(0,1)

# Print newusers and verify that the index is not sorted
print(newusers)

# Sort the index of newusers: newusers
newusers = newusers.sort_index()

# Print newusers and verify that the index is now sorted
print(newusers)

# Verify that the new DataFrame is equal to the original
print(newusers.equals(users))

(               visitors signups
 city   weekday                 
 Austin Mon          326       3
        Sun          139       7
 Dallas Mon          456       5
        Sun          237      12,      city weekday visitors signups
 0  Austin     Mon      326       3
 1  Austin     Sun      139       7
 2  Dallas     Mon      456       5
 3  Dallas     Sun      237      12)

### Melt - "unpivot" - Gather columns into rows
The goal of melting is to restore a pivoted DataFrame to its original form, or to change it from a wide shape to a long shape. 
You can explicitly specify the columns that should remain in the reshaped DataFrame with id_vars, and list which columns to convert into values with value_vars.
If you don't pass a name to the values in pd.melt(), you will lose the name of your variable. You can fix this by using the value_name keyword argument.

In [None]:
# In [1]: visitors_by_city_weekday
# Out[1]: 
# city     Austin  Dallas
# weekday                
# Mon         326     456
# Sun         139     237

# Reset the index: visitors_by_city_weekday
visitors_by_city_weekday = visitors_by_city_weekday.reset_index() 

# Print visitors_by_city_weekday
print(visitors_by_city_weekday)

# Melt visitors_by_city_weekday: visitors
visitors = pd.melt(visitors_by_city_weekday, id_vars=['weekday'], value_name='visitors')

# Print visitors
print(visitors)

# You can move multiple columns into a single column (making the data long and skinny) by "melting" multiple columns. 
# In [3]: users
# Out[3]: 
#   weekday    city  visitors  signups
# 0     Sun  Austin       139        7
# 1     Sun  Dallas       237       12
# 2     Mon  Austin       326        3
# 3     Mon  Dallas       456        5
# Melt users: skinny
skinny = pd.melt(users, id_vars=['city', 'weekday'])

# Print skinny
print(skinny)



# # Obtaining key-value pairs with melt()
# Sometimes, all you need is some key-value pairs, and the context does not matter. 
# If said context is in the index, you can easily obtain what you want.

# Set the new index: users_idx
users_idx = users.set_index(['city', 'weekday'])

# Print the users_idx DataFrame
print(users_idx)

# Obtain the key-value pairs: kv_pairs
kv_pairs = pd.melt(users_idx, col_level=0)

# Print the key-value pairs
print(kv_pairs)


### Pivot table

In [None]:
# Create the DataFrame with the appropriate pivot table: by_city_day
by_city_day = users.pivot_table(index='weekday', columns='city')

# Print by_city_day
print(by_city_day)



# Use a pivot table to display the count of each column: count_by_weekday1
count_by_weekday1 = users.pivot_table(index='weekday', aggfunc='count')

# Print count_by_weekday
print(count_by_weekday1)

# Replace 'aggfunc='count'' with 'aggfunc=len': count_by_weekday2
count_by_weekday2 = users.pivot_table(index='weekday', aggfunc=len)

# Verify that the same result is obtained
print('==========================================')
print(count_by_weekday1.equals(count_by_weekday2))


# Create the DataFrame with the appropriate pivot table: signups_and_visitors
signups_and_visitors = users.pivot_table(index='weekday', aggfunc=sum)

# Print signups_and_visitors
print(signups_and_visitors)

# Add in the margins: signups_and_visitors_total 
signups_and_visitors_total = users.pivot_table(index='weekday', aggfunc=sum, margins=True)

# Print signups_and_visitors_total
print(signups_and_visitors_total)


## Grouping data 
the main advantages of storing data explicitly as categorical types instead of object types:
1. computations are faster
2. Categorical data require less space in memory

In [None]:
# Group titanic by 'pclass'
by_class = titanic.groupby('pclass')

# Aggregate 'survived' column of by_class by count
count_by_class = by_class['survived'].count()

# Print count_by_class
print(count_by_class)

# Group titanic by 'embarked' and 'pclass'
by_mult = titanic.groupby(['embarked', 'pclass'])

# Aggregate 'survived' column of by_mult by count
count_mult = by_mult['survived'].count()

# Print count_mult
print(count_mult)

In [6]:
import pandas as pd
# https://s3.amazonaws.com/assets.datacamp.com/production/course_1650/datasets/life_expectancy.csv
life_fname = 'life_expectancy.csv'
regions_fname = 'regions.csv'
# Read life_fname into a DataFrame: life
life = pd.read_csv(life_fname, index_col='Country')

# Read regions_fname into a DataFrame: regions
regions = pd.read_csv(regions_fname, index_col='Country')

# Group life by regions['region']: life_by_region
# By setting the index of both DataFrames to the country name, 
# you'll then use the region information to group the countries in 
# the life expectancy DataFrame and compute the mean value for 2010.
# life's index groupby regions['region'] needs they share the same index and use regions['region'] as a mapping
life_by_region = life.groupby(regions['region'])
# doesn't works if remove the country from the index
# life.reset_index().groupby(regions['region'])['2010'].mean()
# life.groupby(regions.reset_index()['region'])

# Print the mean over the '2010' column of life_by_region
print(life_by_region['2010'].mean())

region
America                       74.037350
East Asia & Pacific           73.405750
Europe & Central Asia         75.656387
Middle East & North Africa    72.805333
South Asia                    68.189750
Sub-Saharan Africa            57.575080
Name: 2010, dtype: float64


In [23]:
# The .agg() method can be used with a tuple or list of aggregations as input.
# When applying multiple aggregations on multiple columns, the aggregated DataFrame has a multi-level column index.

# Group titanic by 'pclass': by_class
by_class = titanic.groupby('pclass')

# Select 'age' and 'fare'
by_class_sub = by_class[['age','fare']]

# Aggregate by_class_sub by 'max' and 'median': aggregated
aggregated = by_class_sub.agg(['max', 'median'])

# Print the maximum age in each class
print(aggregated.loc[:, ('age','max')])

# Print the median fare in each class
print(aggregated.loc[:, ('fare', 'median')])


NameError: name 'titanic' is not defined

In [27]:
# If you have a DataFrame with a multi-level row index, the individual levels can be used to perform the groupby. 

#                                   fertility    life  population  \
# Year region  Country                                              
# 1964 America Antigua and Barbuda      4.250  63.775     58653.0   
#              Argentina                3.068  65.388  21966478.0   
#              Aruba                    4.059  67.113     57031.0   
#              Bahamas                  4.220  64.189    133709.0   
#              Barbados                 4.094  62.819    234455.0   

#                                   child_mortality      gdp  
# Year region  Country                                        
# 1964 America Antigua and Barbuda            72.78   5008.0  
#              Argentina                      57.43   8227.0  
#              Aruba                            NaN   5505.0  
#              Bahamas                        48.56  18160.0  
#              Barbados                       64.70   5681.0
                
# Read the CSV file into a DataFrame and sort the index: gapminder
gapminder = pd.read_csv('gapminder.csv', index_col=['Year', 'region', 'Country']).sort_index()

# Group gapminder by 'Year' and 'region: by_year_region
# groupby again on the index without duplicated index
by_year_region = gapminder.groupby(level=['Year', 'region'])

# Define the function to compute spread: spread
def spread(series):
    return series.max() - series.min()

# Create the dictionary: aggregator
aggregator = {'population':'sum', 'child_mortality':'mean', 'gdp':spread}

# Aggregate by_year_region using the dictionary: aggregated
aggregated = by_year_region.agg(aggregator)

# Print the last 6 entries of aggregated 
print(aggregated.tail(6))


                                   population  child_mortality       gdp
Year region                                                             
1973 America                     5.456463e+08        88.629667   24363.0
     East Asia & Pacific         1.384693e+09       100.646087   86215.0
     Europe & Central Asia       7.554238e+08        46.040000   36609.0
     Middle East & North Africa  1.514955e+08       121.090000  163972.0
     South Asia                  7.664669e+08       210.837500    1068.0
     Sub-Saharan Africa          1.483449e+08       219.697143    7733.0


In [26]:
# the function passed to groupby is applied on the each of index values

# Groubpy operations can also be performed on transformations of the index values. 
# In the case of a DateTimeIndex, we can extract portions of the datetime over which to group.

# Read file: sales
sales = pd.read_csv('sales.csv', index_col='Date', parse_dates=True)

# Create a groupby object: by_day
# use .strftime('%a') to transform the index datetime values to abbreviated days of the week.
by_day = sales.groupby(sales.index.strftime('%a'))

# Create sum: units_sum
units_sum = by_day['Units'].sum()

# Print units_sum
print(units_sum)

# Trying following to find out, the results are different.
print(sales.reset_index().sort_index().groupby(sales.index.strftime('%a')).groups)
print(sales.reset_index().sort('Company').groupby(sales.index.strftime('%a')).groups)

Mon    48
Sat     7
Thu    59
Tue    13
Wed    48
Name: Units, dtype: int64
{'Mon': Int64Index([0, 1, 8, 9, 12], dtype='int64'), 'Sat': Int64Index([7, 15, 16], dtype='int64'), 'Thu': Int64Index([5, 6, 13, 14, 18], dtype='int64'), 'Tue': Int64Index([2], dtype='int64'), 'Wed': Int64Index([3, 4, 10, 11, 17], dtype='int64')}
{'Mon': Int64Index([4, 5, 2, 17, 14], dtype='int64'), 'Sat': Int64Index([12, 13, 8], dtype='int64'), 'Thu': Int64Index([6, 11, 9, 1, 18], dtype='int64'), 'Tue': Int64Index([7], dtype='int64'), 'Wed': Int64Index([0, 16, 10, 15, 3], dtype='int64')}




### Transformation

In [35]:
# you can apply a .transform() method after grouping to apply a function to groups of data independently. 
# The z-score is also useful to find outliers: a z-score value of +/- 3 is generally considered to be an outlier.

# Import zscore
from scipy.stats import zscore

gapminder_2010 = pd.read_csv('gapminder_2010.csv', index_col=['Country']).sort_index()
# Group gapminder_2010: standardized
standardized = gapminder_2010.groupby('region')['life', 'fertility'].transform(zscore)

# Construct a Boolean Series to identify outliers: outliers
outliers = (standardized['life'] < -3) | (standardized['fertility'] > 3)

# Filter gapminder_2010 by the outliers: gm_outliers
gm_outliers = gapminder_2010.loc[outliers]

# Print gm_outliers
print(gm_outliers)



                            region  fertility    life  population  \
Country                                                             
Guatemala                  America      3.974  71.100  14388929.0   
Haiti                      America      3.350  45.000   9993247.0   
Tajikistan   Europe & Central Asia      3.780  66.830   6878637.0   
Timor-Leste    East Asia & Pacific      6.237  65.952   1124355.0   

             child_mortality     gdp  
Country                               
Guatemala               34.5  6849.0  
Haiti                  208.8  1518.0  
Tajikistan              52.6  2110.0  
Timor-Leste             63.8  1777.0  


### Filling missing data (imputation) by group
Many statistical and machine learning packages cannot determine the best action to take when missing data entries are encountered. Dealing with missing data is natural in pandas (both in using the default behavior and in defining a custom behavior). In Chapter 1, you practiced using the .dropna() method to drop missing values. Now, you will practice imputing missing values. You can use .groupby() and .transform() to fill missing data appropriately for each group.

In [36]:
titanic = pd.read_csv('titanic.csv')
# Create a groupby object: by_sex_class
by_sex_class = titanic.groupby(['sex','pclass'])

# Write a function that imputes median
def impute_median(series):
    return series.fillna(series.median())

# Impute age and assign to titanic['age']
titanic['age'] = by_sex_class['age'].transform(impute_median)

# Print the output of titanic.tail(10)
print(titanic.tail(10))


      Unnamed: 0  pclass  survived                                     name  \
1299        1299       3         0                      Yasbeck, Mr. Antoni   
1300        1300       3         1  Yasbeck, Mrs. Antoni (Selini Alexander)   
1301        1301       3         0                     Youseff, Mr. Gerious   
1302        1302       3         0                        Yousif, Mr. Wazli   
1303        1303       3         0                    Yousseff, Mr. Gerious   
1304        1304       3         0                     Zabour, Miss. Hileni   
1305        1305       3         0                    Zabour, Miss. Thamine   
1306        1306       3         0                Zakarian, Mr. Mapriededer   
1307        1307       3         0                      Zakarian, Mr. Ortin   
1308        1308       3         0                       Zimmerman, Mr. Leo   

         sex   age  sibsp  parch  ticket     fare cabin embarked boat   body  \
1299    male  27.0      1      0    2659  14.4542 

### Other transformations with .apply
The .apply() method when used on a groupby object performs an arbitrary function on each of the groups. These functions can be aggregations, transformations or more complex workflows. The .apply() method will then combine the results in an intelligent way.

In [37]:
def disparity(gr):
    # Compute the spread of gr['gdp']: s
    s = gr['gdp'].max() - gr['gdp'].min()
    # Compute the z-score of gr['gdp'] as (gr['gdp']-gr['gdp'].mean())/gr['gdp'].std(): z
    z = (gr['gdp'] - gr['gdp'].mean())/gr['gdp'].std()
    # Return a DataFrame with the inputs {'z(gdp)':z, 'regional spread(gdp)':s}
    return pd.DataFrame({'z(gdp)':z , 'regional spread(gdp)':s})

# Group gapminder_2010 by 'region': regional
regional = gapminder_2010.groupby('region')

# Apply the disparity function on regional: reg_disp
reg_disp = regional.apply(disparity)

# Print the disparity of 'United States', 'United Kingdom', and 'China'
print(reg_disp.loc[['United States', 'United Kingdom', 'China']])


                regional spread(gdp)    z(gdp)
Country                                       
United States                47855.0  3.013374
United Kingdom               89037.0  0.572873
China                        96993.0 -0.432756


### Grouping and filtering with .apply()
By using .apply(), you can write functions that filter rows within groups. The .apply() method will handle the iteration over individual groups and then re-combine them back into a Series or DataFrame.

In [38]:
def c_deck_survival(gr):

    c_passengers = gr['cabin'].str.startswith('C').fillna(False)

    return gr.loc[c_passengers, 'survived'].mean()

# Create a groupby object using titanic over the 'sex' column: by_sex
by_sex = titanic.groupby('sex')

# Call by_sex.apply with the function c_deck_survival and print the result
c_surv_by_sex = by_sex.apply(c_deck_survival)

# Print the survival rates
print(c_surv_by_sex)


sex
female    0.913043
male      0.312500
dtype: float64


### Grouping and filtering with .filter()
You can use groupby with the .filter() method to remove whole groups of rows from a DataFrame based a boolean condition.

In [39]:
# Read the CSV file into a DataFrame: sales
sales = pd.read_csv('sales.csv', index_col='Date', parse_dates=True)

# Group sales by 'Company': by_company
by_company = sales.groupby('Company')

# Compute the sum of the 'Units' of by_company: by_com_sum
by_com_sum = by_company['Units'].sum()
print(by_com_sum)

# Filter 'Units' where the sum is > 35: by_com_filt
by_com_filt = by_company.filter(lambda g: g['Units'].sum() > 35)
print(by_com_filt)


Company
Acme Coporation    34
Hooli              30
Initech            30
Mediacore          45
Streeplex          36
Name: Units, dtype: int64
                       Company   Product  Units
Date                                           
2015-02-02 21:00:00  Mediacore  Hardware      9
2015-02-04 15:30:00  Streeplex  Software     13
2015-02-09 09:00:00  Streeplex   Service     19
2015-02-09 13:00:00  Mediacore  Software      7
2015-02-19 11:00:00  Mediacore  Hardware     16
2015-02-19 16:00:00  Mediacore   Service     10
2015-02-21 05:00:00  Mediacore  Software      3
2015-02-26 09:00:00  Streeplex   Service      4


### Filtering and grouping with .map()
You have seen how to group by a column, or by multiple columns. Sometimes, you may instead want to group by a function/transformation of a column. The key here is that the Series is indexed the same way as the DataFrame. You can also mix and match column grouping with Series grouping.

In [40]:
# Create the Boolean Series: under10
under10 = (titanic['age'] < 10).map({True:'under 10', False:'over 10'})

# Group by under10 and compute the survival rate
survived_mean_1 = titanic.groupby(under10)['survived'].mean()
print(survived_mean_1)

# Group by under10 and pclass and compute the survival rate
survived_mean_2 = titanic.groupby([under10, 'pclass'])['survived'].mean()
print(survived_mean_2)


age
over 10     0.366748
under 10    0.609756
Name: survived, dtype: float64
age       pclass
over 10   1         0.617555
          2         0.380392
          3         0.238897
under 10  1         0.750000
          2         1.000000
          3         0.446429
Name: survived, dtype: float64


## Bringing it all together 
Case Study - Summer Olympics

In [45]:
# Read file: sales
medals = pd.read_csv('ALL_MEDALISTS.csv', header=4)

In [49]:
medals.head()

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal
0,Athens,1896,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100m freestyle,M,Gold
1,Athens,1896,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100m freestyle,M,Silver
2,Athens,1896,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100m freestyle for sailors,M,Bronze
3,Athens,1896,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100m freestyle for sailors,M,Gold
4,Athens,1896,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100m freestyle for sailors,M,Silver


In [50]:
# Suppose you have loaded the data into a DataFrame medals. 
# You now want to find the total number of medals awarded to the USA per edition. 
USA_edition_grouped = medals.loc[medals.NOC == 'USA'].groupby('Edition')
USA_edition_grouped['Medal'].count().head()

Edition
1896     20
1900     55
1904    394
1908     63
1912    101
Name: Medal, dtype: int64

In [87]:
# Select the 'NOC' column of medals: country_names
country_names = medals['NOC']

# Count the number of medals won by each country: medal_counts
medal_counts1 = medals.groupby(country_names)['Medal'].count().sort_values(ascending=False)
medal_counts2 = country_names.value_counts()
# medal_counts2.index.name = 'NOC'
# medal_counts1.index, medal_counts2.index,
# len(medal_counts1) == len(medal_counts2)
re = medal_counts1.sort_index() == medal_counts2.sort_index()
re[re] 
re.sum()
# Print top 15 countries ranked by medals
# medal_counts2.head()

138