In [3]:
from pathlib import Path

import pandas as pd

In [4]:
# Left-hand example frame: one row per person with their zone and division.
data1 = dict(
    Zone=['West', 'South', 'West', 'North', 'East', 'South'],
    Person=['Jack', 'Jane', 'Jill', 'Jason', 'Jordan', 'Jenna'],
    Division=['Sales', 'Marketing', 'Sales', 'Operations', 'Admin', 'Sales'],
)

# Dict keys become the column labels; each list supplies that column's values.
df1 = pd.DataFrame.from_dict(data1)

In [5]:
# Right-hand example frame. It uses 'Region' where df1 uses 'Zone', and only
# partly overlaps df1's people, so the different join types give different results.
data2 = dict(
    Region=['West', 'East', 'West', 'North', 'East', 'Midwest'],
    Person=['Jack', 'Jackson', 'Jill', 'Jason', 'Jordan', 'Adam'],
    Division=['Sales', 'Management', 'Sales', 'Operations', 'Admin', 'Admin'],
)

# Dict keys become the column labels; each list supplies that column's values.
df2 = pd.DataFrame.from_dict(data2)

In [31]:
# Display the left-hand frame (df1) for reference before merging
df1

Unnamed: 0,Zone,Person,Division
0,West,Jack,Sales
1,South,Jane,Marketing
2,West,Jill,Sales
3,North,Jason,Operations
4,East,Jordan,Admin
5,South,Jenna,Sales


In [16]:
df1.merge(df2)
# With no arguments, merge does an inner join: only rows whose keys match in both frames are kept
# The join keys default to the columns the two frames have in common (here 'Person' and 'Division'),
# not every column -- 'Zone' and 'Region' are simply carried through to the result
# This is the default behavior, so you don't have to specify a key.
# If you don't identify a column to join on, the shared column names are used

Unnamed: 0,Zone,Person,Division,Region
0,West,Jack,Sales,West
1,West,Jill,Sales,West
2,North,Jason,Operations,North
3,East,Jordan,Admin,East


df1.merge(df2, left_on='Zone', right_on='Region')

In [18]:
df1.merge(df2, left_on='Zone', right_on='Region')
# The join key is 'Zone' in the left frame and 'Region' in the right frame
# df1 (the frame merge is called on) is left, df2 (the argument) is right
# Basically a SQL "where zone = region" join condition
# 'Person' and 'Division' exist in both frames but are not keys here, so they come back with _x (left) / _y (right) suffixes

Unnamed: 0,Zone,Person_x,Division_x,Region,Person_y,Division_y
0,West,Jack,Sales,West,Jack,Sales
1,West,Jack,Sales,West,Jill,Sales
2,West,Jill,Sales,West,Jack,Sales
3,West,Jill,Sales,West,Jill,Sales
4,North,Jason,Operations,North,Jason,Operations
5,East,Jordan,Admin,East,Jackson,Management
6,East,Jordan,Admin,East,Jordan,Admin


In [20]:
# Zone and Region values are not unique (e.g. 'West' appears twice in each frame),
# so the merge above returns every left/right pairing of rows that share a key value

In [21]:
df1.merge(df2, on='Person')
# Merging on a column label both frames share ('Person')
# 'Division' is also in both frames but is not a key here, so it comes back as Division_x / Division_y

Unnamed: 0,Zone,Person,Division_x,Region,Division_y
0,West,Jack,Sales,West,Sales
1,West,Jill,Sales,West,Sales
2,North,Jason,Operations,North,Operations
3,East,Jordan,Admin,East,Admin


In [24]:
# Composite key: a row pair matches only when Zone equals Region AND Person equals Person.
df1.merge(
    df2,
    left_on=['Zone', 'Person'],
    right_on=['Region', 'Person'],
)

Unnamed: 0,Zone,Person,Division_x,Region,Division_y
0,West,Jack,Sales,West,Sales
1,West,Jill,Sales,West,Sales
2,North,Jason,Operations,North,Operations
3,East,Jordan,Admin,East,Admin


In [32]:
# There are several ways to combine data frames; the join type is chosen with `how`
# If you don't specify `how`, merge does an inner join (only rows with matching keys in both frames)

df1.merge(df2, how='left')
# df1 is the left frame; df2 is the right
# A left join keeps every row of df1. Jane/Marketing and Jenna/Sales have no matching
# Person + Division pair in df2, so Region (a df2-only column) comes back null for them

Unnamed: 0,Zone,Person,Division,Region
0,West,Jack,Sales,West
1,South,Jane,Marketing,
2,West,Jill,Sales,West
3,North,Jason,Operations,North
4,East,Jordan,Admin,East
5,South,Jenna,Sales,


In [34]:
df1.merge(df2, how='right')
# This shows the opposite: null values in the Zone column
# A right join keeps every row of df2; rows with no match in df1 (Jackson, Adam) get null for Zone, the df1-only column

Unnamed: 0,Zone,Person,Division,Region
0,West,Jack,Sales,West
1,West,Jill,Sales,West
2,North,Jason,Operations,North
3,East,Jordan,Admin,East
4,,Jackson,Management,East
5,,Adam,Admin,Midwest


In [36]:
# If you want to stack data frames on top of each other (append rows), use concat
# df3 is a copy of df1, made here so there is a second frame to stack
df3 = df1.copy()
df3

Unnamed: 0,Zone,Person,Division
0,West,Jack,Sales
1,South,Jane,Marketing
2,West,Jill,Sales
3,North,Jason,Operations
4,East,Jordan,Admin
5,South,Jenna,Sales


In [40]:
pd.concat([df1, df3])
# The frames have to be passed in together as a list
# You can only merge two data frames at a time, but you can concatenate as many as you want
# Good if you have a bunch of csv files in the same directory
# Note: each frame keeps its own row labels, so the index restarts at 0 for the second frame;
# pass ignore_index=True if you want a fresh 0..n-1 index instead

Unnamed: 0,Zone,Person,Division
0,West,Jack,Sales
1,South,Jane,Marketing
2,West,Jill,Sales
3,North,Jason,Operations
4,East,Jordan,Admin
5,South,Jenna,Sales
0,West,Jack,Sales
1,South,Jane,Marketing
2,West,Jill,Sales
3,North,Jason,Operations


In [43]:
# Load the Superstore orders workbook into a data frame.
# The location is built from the current user's home directory instead of one
# machine-specific absolute path, so the cell also works for anyone who keeps
# the repo at ~/AoifeRepo/aoiferepo.
SUPERSTORE_PATH = (
    Path.home() / "AoifeRepo" / "aoiferepo" / "Lectures" / "Unit2" / "data" / "superstore.xls"
)
df = pd.read_excel(SUPERSTORE_PATH)
# Need to do 2 separate merges

In [42]:
# Preview the first five rows to check the load and see which columns are available
df.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


In [47]:
df.groupby('Region')
# groupby returns its own special type (a DataFrameGroupBy object), not a data frame.
# On its own it only describes the grouping; to make it mean anything you have to apply an aggregate such as .sum().

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x10a121210>

In [48]:
# Total every numeric column within each Region.
# numeric_only=True is spelled out because newer pandas no longer drops non-numeric
# columns automatically, and summing the date columns raises a TypeError there.
# NOTE: the 'Row ID' and 'Postal Code' totals are computed too, but they are identifiers, not quantities.
df.groupby('Region').sum(numeric_only=True)

Unnamed: 0_level_0,Row ID,Postal Code,Sales,Quantity,Discount,Profit
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Central,11685963,151786150,501239.8908,8780,558.34,39706.3625
East,14073919,50171698,678781.24,10618,414.0,91522.78
South,8213295,55875052,391721.905,6209,238.55,46749.4303
West,15971838,293739752,725457.8245,12266,350.2,108418.4489


In [56]:
# If you want to group by multiple columns you have to pass them in as a list.

# numeric_only=True is spelled out because newer pandas no longer drops non-numeric
# columns automatically, and summing the date columns raises a TypeError there.
df.groupby(['Region', 'Category']).sum(numeric_only=True)
# This returns a data frame to us
# All normal rules for data frames apply to it
# It has a multi-level index (Region, then Category)

Unnamed: 0_level_0,Unnamed: 1_level_0,Row ID,Postal Code,Sales,Quantity,Discount,Profit
Region,Category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Central,Furniture,2500638,31498984,163797.1638,1827,143.04,-2871.0494
Central,Office Supplies,7146637,92822095,167026.415,5409,359.4,8879.9799
Central,Technology,2038688,27465071,170416.312,1544,55.9,33697.432
East,Furniture,2991153,10407666,208291.204,2214,92.6,3046.1658
East,Office Supplies,8433550,29791451,205516.055,6462,244.7,41014.5791
East,Technology,2649216,9972581,264973.981,1942,76.7,47462.0351
South,Furniture,1666568,11471303,117298.684,1291,40.35,6771.2061
South,Office Supplies,5079770,34072173,125651.313,3800,166.6,19986.3928
South,Technology,1466957,10331576,148771.908,1118,31.6,19991.8314
West,Furniture,3534967,64818073,252612.7435,2696,92.9,11504.9503


In [55]:
# numeric_only=True is spelled out because newer pandas no longer drops non-numeric
# columns automatically, and summing the date columns raises a TypeError there.
df.groupby(['Region', 'Category']).sum(numeric_only=True).reset_index()
# reset_index moves Region and Category out of the index and back into ordinary columns,
# which returns the result to the format of a normal data frame

Unnamed: 0,Region,Category,Row ID,Postal Code,Sales,Quantity,Discount,Profit
0,Central,Furniture,2500638,31498984,163797.1638,1827,143.04,-2871.0494
1,Central,Office Supplies,7146637,92822095,167026.415,5409,359.4,8879.9799
2,Central,Technology,2038688,27465071,170416.312,1544,55.9,33697.432
3,East,Furniture,2991153,10407666,208291.204,2214,92.6,3046.1658
4,East,Office Supplies,8433550,29791451,205516.055,6462,244.7,41014.5791
5,East,Technology,2649216,9972581,264973.981,1942,76.7,47462.0351
6,South,Furniture,1666568,11471303,117298.684,1291,40.35,6771.2061
7,South,Office Supplies,5079770,34072173,125651.313,3800,166.6,19986.3928
8,South,Technology,1466957,10331576,148771.908,1118,31.6,19991.8314
9,West,Furniture,3534967,64818073,252612.7435,2696,92.9,11504.9503


In [58]:
# Select just the 'Sales' column from the grouped object, then total it
# for each (Region, Category) pair.
(
    df
    .groupby(['Region', 'Category'])['Sales']
    .sum()
)

Region   Category       
Central  Furniture          163797.1638
         Office Supplies    167026.4150
         Technology         170416.3120
East     Furniture          208291.2040
         Office Supplies    205516.0550
         Technology         264973.9810
South    Furniture          117298.6840
         Office Supplies    125651.3130
         Technology         148771.9080
West     Furniture          252612.7435
         Office Supplies    220853.2490
         Technology         251991.8320
Name: Sales, dtype: float64

In [60]:
# agg takes a list of aggregate names, so several statistics can be computed
# in one pass; each name becomes its own column in the result.
(
    df
    .groupby(['Region', 'Category'])['Sales']
    .agg(['mean', 'min', 'max'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,min,max
Region,Category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Central,Furniture,340.534644,1.892,3504.9
Central,Office Supplies,117.458801,0.444,9892.74
Central,Technology,405.753124,1.98,17499.95
East,Furniture,346.574383,2.96,4416.174
East,Office Supplies,120.044425,0.852,4663.736
East,Technology,495.278469,2.97,11199.968
South,Furniture,353.309289,2.784,4297.644
South,Office Supplies,126.282727,1.167,6354.95
South,Technology,507.753952,1.584,22638.48
West,Furniture,357.302325,3.48,3610.848


In [68]:
df.resample('W', on='Order Date')['Sales'].sum()
# Sum of sales for each week in our data set, bucketed by the 'Order Date' column
# 'W' gives weekly bins ending on Sunday (shown as W-SUN in the output)
# Can also do M (month) or D (day) or Q (quarter) or Y (year)
# NOTE(review): newer pandas releases rename the month/quarter/year-end aliases to ME/QE/YE -- confirm against the installed version

Order Date
2014-01-05      324.0440
2014-01-12     4599.5720
2014-01-19     4509.1270
2014-01-26     3842.3880
2014-02-02     1642.3100
                 ...    
2017-12-03    32354.5700
2017-12-10    24006.9580
2017-12-17    10495.9630
2017-12-24    23662.3640
2017-12-31     8977.8318
Freq: W-SUN, Name: Sales, Length: 209, dtype: float64

In [70]:
df.resample('W-Fri', on='Order Date')['Sales'].sum()
# A weekday suffix on 'W' specifies what day the week ends on
# This would be the week ending on Friday (shown as W-FRI in the output)

Order Date
2014-01-03       16.4480
2014-01-10     4897.2280
2014-01-17     4075.6090
2014-01-24     3188.5960
2014-01-31     2059.0140
                 ...    
2017-12-08    26018.0330
2017-12-15    16953.4520
2017-12-22    18388.9940
2017-12-29    16423.8718
2018-01-05      713.7900
Freq: W-FRI, Name: Sales, Length: 210, dtype: float64