# Chapter 3

In [1]:
## Merging dataframes
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# data prep: two small branch tables used by the first merge examples.
# Revenue per branch; Springfield is branch 30 here.
dr = dict(
    branch_id=[10, 20, 30, 47],
    city=['Austin', 'Denver', 'Springfield', 'Mendocino'],
    revenue=[100, 83, 4, 200],
)
revenue = pd.DataFrame(data=dr)

# Manager per branch; Springfield is branch 31 here, so its id does not
# line up with the revenue table.
dm = dict(
    branch_id=[10, 20, 47, 31],
    city=['Austin', 'Denver', 'Mendocino', 'Springfield'],
    manager=['Charles', 'Joel', 'Brett', 'Sally'],
)
managers = pd.DataFrame(data=dm)

In [8]:
# Join on the shared 'city' column: merge_by_city.
# 'branch_id' exists in both frames but is not a key, so it comes back
# twice with the _x/_y suffixes.
merge_by_city = revenue.merge(managers, on='city')
print(merge_by_city)

# Join on the shared 'branch_id' column: merge_by_id.
# Now 'city' is the duplicated column, and Springfield is dropped by the
# default inner join because its ids differ between the frames (30 vs 31).
merge_by_id = revenue.merge(managers, on='branch_id')
print(merge_by_id)

   branch_id_x         city  revenue  branch_id_y  manager
0           10       Austin      100           10  Charles
1           20       Denver       83           20     Joel
2           30  Springfield        4           31    Sally
3           47    Mendocino      200           47    Brett
   branch_id     city_x  revenue     city_y  manager
0         10     Austin      100     Austin  Charles
1         20     Denver       83     Denver     Joel
2         47  Mendocino      200  Mendocino    Brett


In [9]:
# data prep: same branches as before, now with a 'state' column on both
# sides and the managers' city column named 'branch' instead of 'city'.
dr = dict(
    branch_id=[10, 20, 30, 47],
    city=['Austin', 'Denver', 'Springfield', 'Mendocino'],
    revenue=[100, 83, 4, 200],
    state=['TX', 'CO', 'IL', 'CA'],
)
revenue = pd.DataFrame(data=dr)

# Springfield differs between the frames in both id (31 vs 30) and
# state (MO vs IL).
dm = dict(
    branch=['Austin', 'Denver', 'Mendocino', 'Springfield'],
    branch_id=[10, 20, 47, 31],
    manager=['Charles', 'Joel', 'Brett', 'Sally'],
    state=['TX', 'CO', 'CA', 'MO'],
)
managers = pd.DataFrame(data=dm)

In [11]:
# The key column is named differently in each frame ('city' in revenue,
# 'branch' in managers), so name each side explicitly: combined.
combined = revenue.merge(managers, left_on='city', right_on='branch')

# Both key columns are kept; the other shared columns are suffixed _x/_y.
print(combined)

   branch_id_x         city  revenue state_x       branch  branch_id_y  \
0           10       Austin      100      TX       Austin           10   
1           20       Denver       83      CO       Denver           20   
2           30  Springfield        4      IL  Springfield           31   
3           47    Mendocino      200      CA    Mendocino           47   

   manager state_y  
0  Charles      TX  
1     Joel      CO  
2    Sally      MO  
3    Brett      CA  


In [12]:
# Alternative: join on the column both frames share by name, 'branch_id'.
combined = pd.merge(left=revenue, right=managers, on='branch_id')

# Springfield is missing from the result because its ids differ (30 vs 31).
print(combined)

   branch_id       city  revenue state_x     branch  manager state_y
0         10     Austin      100      TX     Austin  Charles      TX
1         20     Denver       83      CO     Denver     Joel      CO
2         47  Mendocino      200      CA  Mendocino    Brett      CA


In [14]:
# data prep: rebuild the original three-column frames (no 'state' yet).
dr = {
    'branch_id': [10, 20, 30, 47],
    'city': ['Austin', 'Denver', 'Springfield', 'Mendocino'],
    'revenue': [100, 83, 4, 200],
}
revenue = pd.DataFrame.from_dict(dr)

dm = {
    'branch_id': [10, 20, 47, 31],
    'city': ['Austin', 'Denver', 'Mendocino', 'Springfield'],
    'manager': ['Charles', 'Joel', 'Brett', 'Sally'],
}
managers = pd.DataFrame.from_dict(dm)

In [15]:
# Attach a 'state' column to each frame; values are positional, so they
# follow each frame's own row order.
revenue['state'] = ['TX', 'CO', 'IL', 'CA']
managers['state'] = ['TX', 'CO', 'CA', 'MO']

# Require all three columns to agree for a row to match: combined.
join_keys = ['branch_id', 'city', 'state']
combined = revenue.merge(managers, on=join_keys)

# With every shared column used as a key, no _x/_y suffixes are needed.
print(combined)

   branch_id       city  revenue state  manager
0         10     Austin      100    TX  Charles
1         20     Denver       83    CO     Joel
2         47  Mendocino      200    CA    Brett


## Joining Dataframes

In [21]:
# data prep: revenue and managers as before, plus a sales table.
dr = dict(
    branch_id=[10, 20, 30, 47],
    city=['Austin', 'Denver', 'Springfield', 'Mendocino'],
    revenue=[100, 83, 4, 200],
    state=['TX', 'CO', 'IL', 'CA'],
)
revenue = pd.DataFrame(data=dr)

dm = dict(
    branch=['Austin', 'Denver', 'Mendocino', 'Springfield'],
    branch_id=[10, 20, 47, 31],
    manager=['Charles', 'Joel', 'Brett', 'Sally'],
    state=['TX', 'CO', 'CA', 'MO'],
)
managers = pd.DataFrame(data=dm)

# Units sold per (city, state); Springfield appears twice, once per state.
ds = dict(
    city=['Mendocino', 'Denver', 'Austin', 'Springfield', 'Springfield'],
    state=['CA', 'CO', 'TX', 'MO', 'IL'],
    units=[1, 4, 2, 5, 1],
)
sales = pd.DataFrame(data=ds)

In [22]:
# Right join of revenue onto sales: revenue_and_sales.
# Every sales row is kept; sales rows with no revenue match get NaN in
# the revenue columns.
revenue_and_sales = revenue.merge(sales, on=['city', 'state'], how='right')
print(revenue_and_sales)

# Left join of sales onto managers: sales_and_managers.
# Every sales row is kept again; the city key is called 'branch' on the
# managers side, so the key lists are given per side.
sales_and_managers = sales.merge(
    managers,
    how='left',
    left_on=['city', 'state'],
    right_on=['branch', 'state'],
)
print(sales_and_managers)

   branch_id         city  revenue state  units
0       10.0       Austin    100.0    TX      2
1       20.0       Denver     83.0    CO      4
2       30.0  Springfield      4.0    IL      1
3       47.0    Mendocino    200.0    CA      1
4        NaN  Springfield      NaN    MO      5
          city state  units       branch  branch_id  manager
0    Mendocino    CA      1    Mendocino       47.0    Brett
1       Denver    CO      4       Denver       20.0     Joel
2       Austin    TX      2       Austin       10.0  Charles
3  Springfield    MO      5  Springfield       31.0    Sally
4  Springfield    IL      1          NaN        NaN      NaN


In [28]:
# First merge: merge_default.
# No keys given, so pandas joins on every column the two frames have in
# common, using the default inner join.
merge_default = sales_and_managers.merge(revenue_and_sales)
print(merge_default)

# Second merge: merge_outer.
# Same implicit keys, but unmatched rows from both sides are kept.
merge_outer = sales_and_managers.merge(revenue_and_sales, how='outer')
print(merge_outer)

# Third merge: merge_outer_on.
# Only 'city' and 'state' are keys; the other shared columns are no longer
# matched on and come back as suffixed _x/_y pairs.
merge_outer_on = sales_and_managers.merge(
    revenue_and_sales,
    on=['city', 'state'],
    how='outer',
)
print(merge_outer_on)

        city state  units     branch  branch_id  manager  revenue
0  Mendocino    CA      1  Mendocino       47.0    Brett    200.0
1     Denver    CO      4     Denver       20.0     Joel     83.0
2     Austin    TX      2     Austin       10.0  Charles    100.0
          city state  units       branch  branch_id  manager  revenue
0    Mendocino    CA      1    Mendocino       47.0    Brett    200.0
1       Denver    CO      4       Denver       20.0     Joel     83.0
2       Austin    TX      2       Austin       10.0  Charles    100.0
3  Springfield    MO      5  Springfield       31.0    Sally      NaN
4  Springfield    IL      1          NaN        NaN      NaN      NaN
5  Springfield    IL      1          NaN       30.0      NaN      4.0
6  Springfield    MO      5          NaN        NaN      NaN      NaN
          city state  units_x       branch  branch_id_x  manager  branch_id_y  \
0    Mendocino    CA        1    Mendocino         47.0    Brett         47.0   
1       Denver

## Ordered Merges

In [32]:
# data prep: weather ratings for two cities, deliberately not in date order.
da = {
    'date': [pd.Timestamp(stamp) for stamp in ('2016-01-01 00:00:00',
                                               '2016-02-08 00:00:00',
                                               '2016-01-17 00:00:00')],
    'ratings': ['Cloudy', 'Cloudy', 'Sunny'],
}
austin = pd.DataFrame(data=da)

dh = {
    'date': [pd.Timestamp(stamp) for stamp in ('2016-01-04 00:00:00',
                                               '2016-01-01 00:00:00',
                                               '2016-03-01 00:00:00')],
    'ratings': ['Rainy', 'Cloudy', 'Sunny'],
}
houston = pd.DataFrame(data=dh)

In [35]:
# Check column dtypes and non-null counts of the austin frame before the
# ordered merges, which key on its 'date' column.
austin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
date       3 non-null datetime64[ns]
ratings    3 non-null object
dtypes: datetime64[ns](1), object(1)
memory usage: 128.0+ bytes


In [38]:
# First ordered merge: tx_weather.
# No keys given, so both shared columns ('date' and 'ratings') are used as
# keys and the result comes back sorted by them.
tx_weather = pd.merge_ordered(left=austin, right=houston)
print(tx_weather)

# Second ordered merge: tx_weather_suff.
# Key on 'date' only; each city's 'ratings' column is kept with its own suffix.
tx_weather_suff = pd.merge_ordered(
    left=austin,
    right=houston,
    on='date',
    suffixes=['_aus', '_hus'],
)
print(tx_weather_suff)

# Third ordered merge: tx_weather_ffill.
# Same as above, but gaps are forward-filled from the previous row.
tx_weather_ffill = pd.merge_ordered(
    left=austin,
    right=houston,
    on='date',
    suffixes=['_aus', '_hus'],
    fill_method='ffill',
)
print(tx_weather_ffill)

        date ratings
0 2016-01-01  Cloudy
1 2016-01-04   Rainy
2 2016-01-17   Sunny
3 2016-02-08  Cloudy
4 2016-03-01   Sunny
        date ratings_aus ratings_hus
0 2016-01-01      Cloudy      Cloudy
1 2016-01-04         NaN       Rainy
2 2016-01-17       Sunny         NaN
3 2016-02-08      Cloudy         NaN
4 2016-03-01         NaN       Sunny
        date ratings_aus ratings_hus
0 2016-01-01      Cloudy      Cloudy
1 2016-01-04      Cloudy       Rainy
2 2016-01-17       Sunny       Rainy
3 2016-02-08      Cloudy       Rainy
4 2016-03-01      Cloudy       Sunny


In [48]:
# data prep: load the oil-price and automobile datasets.
import os
from urllib.request import urlretrieve

DATASETS_URL = 'https://assets.datacamp.com/production/course_1681/datasets/'


def load_dated_csv(file, date_col):
    """Load a course CSV as a DataFrame, downloading it first if needed.

    Parameters
    ----------
    file : str
        CSV file name; used as the name under DATASETS_URL and as the
        local path in the working directory.
    date_col : str
        Name of the column to parse as datetimes.

    Returns
    -------
    pandas.DataFrame
        Contents of the CSV with `date_col` parsed as dates.
    """
    # Only hit the network when there is no local copy, so re-running the
    # notebook does not download the same file again.
    if not os.path.exists(file):
        # Download to a temporary name and rename afterwards, so an
        # interrupted transfer never leaves a truncated file at `file`.
        partial = file + '.part'
        urlretrieve(DATASETS_URL + file, partial)
        os.replace(partial, file)
    return pd.read_csv(file, parse_dates=[date_col])


oil = load_dated_csv('oil_price.csv', 'Date')
auto = load_dated_csv('automobiles.csv', 'yr')

In [49]:
# Check that 'Date' was parsed as a datetime column and that the oil frame
# has no missing values before using it as the right side of merge_asof.
oil.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 2 columns):
Date     156 non-null datetime64[ns]
Price    156 non-null float64
dtypes: datetime64[ns](1), float64(1)
memory usage: 2.5 KB


In [51]:
# As-of merge of auto with oil: merged.
# Each auto row ('yr') is paired with an oil row by its 'Date' rather than
# requiring an exact key match.
merged = pd.merge_asof(left=auto, right=oil, left_on='yr', right_on='Date')

# Show the last few merged rows.
print(merged.tail())

# Yearly means of mileage and oil price: yearly.
# NOTE(review): 'A' is the year-end frequency alias; pandas 2.2+ deprecates
# it in favour of 'YE' -- confirm the target pandas version before changing.
yearly = (
    merged
    .resample('A', on='Date')[['mpg', 'Price']]
    .mean()
)
print(yearly)

# Correlation between yearly mean mpg and yearly mean oil price.
print(yearly.corr())

      mpg  cyl  displ  hp  weight  accel         yr  origin             name  \
387  27.0    4  140.0  86    2790   15.6 1982-01-01      US  ford mustang gl   
388  44.0    4   97.0  52    2130   24.6 1982-01-01  Europe        vw pickup   
389  32.0    4  135.0  84    2295   11.6 1982-01-01      US    dodge rampage   
390  28.0    4  120.0  79    2625   18.6 1982-01-01      US      ford ranger   
391  31.0    4  119.0  82    2720   19.4 1982-01-01      US       chevy s-10   

          Date  Price  
387 1982-01-01  33.85  
388 1982-01-01  33.85  
389 1982-01-01  33.85  
390 1982-01-01  33.85  
391 1982-01-01  33.85  
                  mpg  Price
Date                        
1970-12-31  17.689655   3.35
1971-12-31  21.111111   3.56
1972-12-31  18.714286   3.56
1973-12-31  17.100000   3.56
1974-12-31  22.769231  10.11
1975-12-31  20.266667  11.16
1976-12-31  21.573529  11.16
1977-12-31  23.375000  13.90
1978-12-31  24.061111  14.85
1979-12-31  25.093103  14.85
1980-12-31  33.803704  32.5