# Pandas (continues)

In [1]:
import sys

import numpy as np
import pandas as pd

## Catenating datasets

In [2]:
# axis = 0 catenates vertically and axis = 1 catenates horizontally. 
# pandas dataframe works similarly but row indices and the column names require extra attention.
# note the difference: np.concatenate joins arrays purely by position, whereas pd.concat aligns on row/column labels
## helper for creating dataframes
def makedf(cols, ind):
    """Build a small demo DataFrame whose cells spell out their own position.

    Every cell holds the column label followed by the row label, e.g. ``"A0"``.

    Parameters
    ----------
    cols : iterable
        Column labels; iterating over a string yields one column per character.
    ind : iterable
        Row labels, also used as the index of the result.

    Returns
    -------
    pandas.DataFrame
        One column per element of ``cols`` and one row per element of ``ind``.
    """
    columns = {}
    for col in cols:
        prefix = str(col)
        columns[col] = [prefix + str(row) for row in ind]
    return pd.DataFrame(columns, index=ind)

In [3]:
a=makedf("AB", [0,1])
a

Unnamed: 0,A,B
0,A0,B0
1,A1,B1


In [4]:
b=makedf("AB", [2,3])
b

Unnamed: 0,A,B
2,A2,B2
3,A3,B3


In [5]:
c=makedf("CD", [0,1])
c

Unnamed: 0,C,D
0,C0,D0
1,C1,D1


In [6]:
d=makedf("BC", [2,3])
d

Unnamed: 0,B,C
2,B2,C2
3,B3,C3


In [7]:
## concat a and b -- works as expected
pd.concat([a,b])   # The default axis is 0

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


In [8]:
r=pd.concat([a,a])
r

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
0,A0,B0
1,A1,B1


In [9]:
r.loc[0,"A"]

0    A0
0    A0
Name: A, dtype: object

In [10]:
## Row labels should be unique, so the duplicated index above is not what we are aiming for.
## verify_integrity=True makes concat check for duplicated index labels and raise if it finds any.
try:
    pd.concat([a, a], verify_integrity=True)
except ValueError as err:
    # Show only the message (on stderr) instead of a full traceback.
    print(err, file=sys.stderr)

Indexes have overlapping values: Int64Index([0, 1], dtype='int64')


In [11]:
## automatic renumbering of rows
pd.concat([a,a], ignore_index=True)

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A0,B0
3,A1,B1


In [12]:
## hierarchical indexing -- multiple level indices
r2=pd.concat([a,a], keys=['first', 'second'])
r2

Unnamed: 0,Unnamed: 1,A,B
first,0,A0,B0
first,1,A1,B1
second,0,A0,B0
second,1,A1,B1


In [13]:
r2["A"]["first"][0] # works as numpy

'A0'

In [14]:
## it works similarly with horizontal catenation
pd.concat([a,c], axis=1)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1


In [15]:
## vertically concatenating two DataFrames that don't have the same columns
## performs an outer join by default: the result has every column from either frame (common or not), with NaN where a frame lacks it
pd.concat([a,d], sort=False)    # sort option is used to silence a deprecation message

Unnamed: 0,A,B,C
0,A0,B0,
1,A1,B1,
2,,B2,C2
3,,B3,C3


In [16]:
## here is the same catenation but with inner join
pd.concat([a,d], join="inner")

Unnamed: 0,B
0,B0
1,B1
2,B2
3,B3


In [17]:
## exercise 01 (split date continues)
#url = "https://raw.githubusercontent.com/annassanchez/HY-2021-DataScience/main/week04/Helsingin_pyorailijamaarat.csv"
#kk = pd.read_csv(url, sep = ';', header = 0)

def split_date(df):
    """Split the Finnish ``Päivämäärä`` timestamp column into separate fields.

    Rows and columns that consist only of missing values are discarded first.
    Each remaining ``Päivämäärä`` string is split on whitespace into weekday,
    day, month, year and time; the weekday abbreviation is translated to
    English, the month name to its number, and only the hour part of the time
    is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string column named ``"Päivämäärä"``. The frame
        passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``Weekday`` (object), ``Day``, ``Month``, ``Year``, ``Hour``
        (all int) and ``Päivämäärä`` rebuilt from the translated parts,
        indexed like the non-empty rows of ``df``.

    Raises
    ------
    KeyError
        If ``df`` has no ``"Päivämäärä"`` column.
    ValueError
        If the timestamps do not split into exactly five fields.
    """
    weekday_names = {"ma": "Mon", "ti": "Tue", "ke": "Wed", "to": "Thu",
                     "pe": "Fri", "la": "Sat", "su": "Sun"}
    month_numbers = {"tammi": "1", "helmi": "2", "maalis": "3", "huhti": "4",
                     "touko": "5", "kesä": "6", "heinä": "7", "elo": "8",
                     "syys": "9", "loka": "10", "marras": "11", "joulu": "12"}

    cleaned = df.dropna(axis=0, how='all').dropna(axis=1, how='all')

    # str.split returns a brand-new frame, so the assignments below never touch
    # (a possible view of) the caller's data.
    parts = cleaned['Päivämäärä'].str.split(expand=True)
    parts.columns = ["Weekday", "Day", "Month", "Year", "Hour"]

    parts["Weekday"] = parts["Weekday"].replace(weekday_names)
    parts["Month"] = parts["Month"].replace(month_numbers)
    # "19:00" -> 19
    parts["Hour"] = parts["Hour"].str.split(":", expand=True)[0].map(int)

    # The date column is selected by name, not by position, so the result is
    # correct wherever "Päivämäärä" sits in the input.
    parts['Päivämäärä'] = (parts["Weekday"] + " " + parts["Day"] + " " + parts["Month"]
                           + " " + parts["Year"] + " " + parts["Hour"].map(str))

    return parts.astype({"Weekday": object, "Day": int, "Month": int, "Year": int, "Hour": int})

In [18]:
def split_date_continues(
    url="https://raw.githubusercontent.com/annassanchez/HY-2021-DataScience/main/week04/Helsingin_pyorailijamaarat.csv",
):
    """Load the cyclist counts and replace the date string with split fields.

    Parameters
    ----------
    url : str, optional
        Location of the semicolon-separated CSV file. Defaults to the course
        copy of ``Helsingin_pyorailijamaarat.csv``.

    Returns
    -------
    pandas.DataFrame
        The columns produced by ``split_date`` (without ``Päivämäärä``)
        followed by the remaining columns of the CSV, with all-empty rows and
        columns removed.
    """
    raw = pd.read_csv(url, sep=';', header=0)
    counts = raw.dropna(axis=0, how='all').dropna(axis=1, how='all')

    date_parts = split_date(counts)
    # Remove the date string from both halves by name instead of by position;
    # errors="ignore" also copes with a split_date that does not return it.
    date_parts = date_parts.drop(columns='Päivämäärä', errors='ignore')
    counts = counts.drop(columns='Päivämäärä')

    return pd.concat([date_parts, counts], axis=1)


In [19]:
url = "https://raw.githubusercontent.com/annassanchez/HY-2021-DataScience/main/week04/Helsingin_pyorailijamaarat.csv"
df = pd.read_csv(url, sep = ';', header = 0)
split_date(df)

Unnamed: 0,Weekday,Day,Month,Year,Hour,Päivämäärä
0,Wed,1,1,2014,0,Wed 1 1 2014 0
1,Wed,1,1,2014,1,Wed 1 1 2014 1
2,Wed,1,1,2014,2,Wed 1 1 2014 2
3,Wed,1,1,2014,3,Wed 1 1 2014 3
4,Wed,1,1,2014,4,Wed 1 1 2014 4
...,...,...,...,...,...,...
37123,Tue,27,3,2018,19,Tue 27 3 2018 19
37124,Tue,27,3,2018,20,Tue 27 3 2018 20
37125,Tue,27,3,2018,21,Tue 27 3 2018 21
37126,Tue,27,3,2018,22,Tue 27 3 2018 22


In [20]:
df = split_date_continues()
print("Shape:", df.shape)
print("Column names:\n", df.columns)
print(df.head())

Shape: (37128, 25)
Column names:
 Index(['Weekday', 'Day', 'Month', 'Year', 'Hour', 'Auroransilta',
       'Eteläesplanadi', 'Huopalahti (asema)', 'Kaisaniemi/Eläintarhanlahti',
       'Kaivokatu', 'Kulosaaren silta et.', 'Kulosaaren silta po. ',
       'Kuusisaarentie', 'Käpylä, Pohjoisbaana',
       'Lauttasaaren silta eteläpuoli', 'Merikannontie',
       'Munkkiniemen silta eteläpuoli', 'Munkkiniemi silta pohjoispuoli',
       'Heperian puisto/Ooppera', 'Pitkäsilta itäpuoli',
       'Pitkäsilta länsipuoli', 'Lauttasaaren silta pohjoispuoli',
       'Ratapihantie', 'Viikintie', 'Baana'],
      dtype='object')
  Weekday  Day  Month  ...  Ratapihantie  Viikintie  Baana
0     Wed    1      1  ...           NaN        NaN    8.0
1     Wed    1      1  ...           NaN        NaN    4.0
2     Wed    1      1  ...           NaN        NaN   11.0
3     Wed    1      1  ...           NaN        NaN    3.0
4     Wed    1      1  ...           NaN        NaN    4.0

[5 rows x 25 columns]


In [21]:
## alternative solution
# Finnish weekday abbreviation -> English three-letter name
days = {
    finnish: english
    for finnish, english in zip("ma ti ke to pe la su".split(), "Mon Tue Wed Thu Fri Sat Sun".split())
}
# Finnish month prefix -> month number (1..12)
months = {
    name: number
    for number, name in enumerate(
        "tammi helmi maalis huhti touko kesä heinä elo syys loka marras joulu".split(), start=1
    )
}

def split_date(df):
    """Split the ``Päivämäärä`` strings of ``df`` into typed date fields.

    Each string is split on whitespace into five pieces. The weekday and the
    month are translated through the ``days`` and ``months`` tables, and only
    the part of the time before the first ``":"`` is kept.

    Returns a new DataFrame with the columns ``Weekday`` (object), ``Day``,
    ``Month``, ``Year`` and ``Hour`` (int), indexed like ``df``.
    """
    fields = df["Päivämäärä"].str.split(expand=True)
    fields.columns = ["Weekday", "Day", "Month", "Year", "Hour"]
    fields = fields.assign(
        Hour=fields["Hour"].str.split(":", expand=True).iloc[:, 0],
        Weekday=fields["Weekday"].map(days),
        Month=fields["Month"].map(months),
    )
    target_types = {"Weekday": object, "Day": int, "Month": int, "Year": int, "Hour": int}
    return fields.astype(target_types)
def split_date_continues():
    """Download the cyclist counts and swap the date string for split fields.

    All-empty rows and columns are removed, ``split_date`` is applied, and
    its columns are placed in front of the remaining columns (with the
    original ``Päivämäärä`` column dropped).
    """
    source = "https://raw.githubusercontent.com/annassanchez/HY-2021-DataScience/main/week04/Helsingin_pyorailijamaarat.csv"
    counts = pd.read_csv(source, sep = ';', header = 0)
    counts = counts.dropna(axis=0, how="all")
    counts = counts.dropna(axis=1, how="all")
    date_fields = split_date(counts)
    station_counts = counts.drop("Päivämäärä", axis=1)
    return pd.concat([date_fields, station_counts], axis=1)


df = split_date_continues()
print("Shape:", df.shape)
print("Column names:\n", df.columns)
print(df.head())

Shape: (37128, 25)
Column names:
 Index(['Weekday', 'Day', 'Month', 'Year', 'Hour', 'Auroransilta',
       'Eteläesplanadi', 'Huopalahti (asema)', 'Kaisaniemi/Eläintarhanlahti',
       'Kaivokatu', 'Kulosaaren silta et.', 'Kulosaaren silta po. ',
       'Kuusisaarentie', 'Käpylä, Pohjoisbaana',
       'Lauttasaaren silta eteläpuoli', 'Merikannontie',
       'Munkkiniemen silta eteläpuoli', 'Munkkiniemi silta pohjoispuoli',
       'Heperian puisto/Ooppera', 'Pitkäsilta itäpuoli',
       'Pitkäsilta länsipuoli', 'Lauttasaaren silta pohjoispuoli',
       'Ratapihantie', 'Viikintie', 'Baana'],
      dtype='object')
  Weekday  Day  Month  ...  Ratapihantie  Viikintie  Baana
0     Wed    1      1  ...           NaN        NaN    8.0
1     Wed    1      1  ...           NaN        NaN    4.0
2     Wed    1      1  ...           NaN        NaN   11.0
3     Wed    1      1  ...           NaN        NaN    3.0
4     Wed    1      1  ...           NaN        NaN    4.0

[5 rows x 25 columns]


## Merging dataframes

In [22]:
## original dataframe with the wages and ages
df = pd.DataFrame([[1000, "Jack", 21], [1500, "John", 29]], columns=["Wage", "Name", "Age"])
df

Unnamed: 0,Wage,Name,Age
0,1000,Jack,21
1,1500,John,29


In [23]:
## occupations of persons
df2 = pd.DataFrame({"Name" : ["John", "Jack"], "Occupation": ["Plumber", "Carpenter"]})
df2

Unnamed: 0,Name,Occupation
0,John,Plumber
1,Jack,Carpenter


In [24]:
# merge joins two DataFrames on the column(s) they share (here "Name"); rows are matched by key value, not by position or index
pd.merge(df, df2)

Unnamed: 0,Wage,Name,Age,Occupation
0,1000,Jack,21,Carpenter
1,1500,John,29,Plumber


In [25]:
## sometimes not all the keys appear in both dataframes:
df3 = pd.concat([df2, pd.DataFrame({ "Name" : ["James"], "Occupation":["Painter"]})], ignore_index=True)
df3

Unnamed: 0,Name,Occupation
0,John,Plumber
1,Jack,Carpenter
2,James,Painter


In [26]:
## an inner merge keeps only the keys that appear in both DataFrames, so James is dropped
pd.merge(df, df3)                # By default an inner join is computed

Unnamed: 0,Wage,Name,Age,Occupation
0,1000,Jack,21,Carpenter
1,1500,John,29,Plumber


In [27]:
pd.merge(df, df3, how="outer")   # Outer join

Unnamed: 0,Wage,Name,Age,Occupation
0,1000.0,Jack,21.0,Carpenter
1,1500.0,John,29.0,Plumber
2,,James,,Painter


In [28]:
## one-to-many relationship is also possible on merges
books = pd.DataFrame({"Title" : ["War and Peace", "Good Omens", "Good Omens"] ,
                      "Author" : ["Tolstoi", "Terry Pratchett", "Neil Gaiman"]})
books

Unnamed: 0,Title,Author
0,War and Peace,Tolstoi
1,Good Omens,Terry Pratchett
2,Good Omens,Neil Gaiman


In [29]:
collections = pd.DataFrame([["Oodi", "War and Peace"],
                           ["Oodi", "Good Omens"],
                           ["Pasila", "Good Omens"],
                           ["Kallio", "War and Peace"]], columns=["Library", "Title"])
collections

Unnamed: 0,Library,Title
0,Oodi,War and Peace
1,Oodi,Good Omens
2,Pasila,Good Omens
3,Kallio,War and Peace


In [30]:
libraries_with_books_by = pd.merge(books, collections)
libraries_with_books_by

Unnamed: 0,Title,Author,Library
0,War and Peace,Tolstoi,Oodi
1,War and Peace,Tolstoi,Kallio
2,Good Omens,Terry Pratchett,Oodi
3,Good Omens,Terry Pratchett,Pasila
4,Good Omens,Neil Gaiman,Oodi
5,Good Omens,Neil Gaiman,Pasila


In [31]:
## exercise 02: cycling weather
def cycling_weather():
    """Join the 2017 Kumpula weather observations with the hourly cyclist counts.

    The cyclist data has its all-empty rows and columns removed and its
    ``Päivämäärä`` string split into ``Weekday``, ``Day``, ``Month``, ``Year``
    and ``Hour``; the month name is replaced by its number. ``Weekday`` and
    ``Hour`` are left as they appear in the source string. The two tables are
    then merged on day, month and year.

    Returns
    -------
    pandas.DataFrame
        The merged table without the ``m``, ``d``, ``Time``, ``Time zone``
        and ``Päivämäärä`` columns.
    """
    url_weather = 'https://raw.githubusercontent.com/annassanchez/HY-2021-DataScience/main/week05/kumpula-weather-2017.csv'
    url_cycling = 'https://raw.githubusercontent.com/annassanchez/HY-2021-DataScience/main/week05/Helsingin_pyorailijamaarat.csv'
    month_numbers = {"tammi": "1", "helmi": "2", "maalis": "3",
                     "huhti": "4", "touko": "5", "kesä": "6",
                     "heinä": "7", "elo": "8", "syys": "9",
                     "loka": "10", "marras": "11", "joulu": "12"}

    weather = pd.read_csv(url_weather, sep=',')
    cycling = pd.read_csv(url_cycling, sep=';')
    cycling = cycling.dropna(axis=0, how="all").dropna(axis=1, how="all")

    # Append the five whitespace-separated pieces of the timestamp as new columns.
    cycling[["Weekday", "Day", "Month", "Year", "Hour"]] = cycling['Päivämäärä'].str.split(expand=True)
    cycling["Month"] = cycling["Month"].replace(month_numbers)
    cycling = cycling.astype({"Weekday": object, "Day": int, "Month": int, "Year": int})

    merged = pd.merge(
        weather,
        cycling,
        left_on=['d', 'm', 'Year'],
        right_on=["Day", "Month", "Year"],
    )
    return merged.drop(columns=['m', 'd', 'Time', 'Time zone', 'Päivämäärä'])

In [32]:
cycling_weather()

Unnamed: 0,Year,Precipitation amount (mm),Snow depth (cm),Air temperature (degC),Auroransilta,Eteläesplanadi,Huopalahti (asema),Kaisaniemi/Eläintarhanlahti,Kaivokatu,Kulosaaren silta et.,Kulosaaren silta po.,Kuusisaarentie,"Käpylä, Pohjoisbaana",Lauttasaaren silta eteläpuoli,Merikannontie,Munkkiniemen silta eteläpuoli,Munkkiniemi silta pohjoispuoli,Heperian puisto/Ooppera,Pitkäsilta itäpuoli,Pitkäsilta länsipuoli,Lauttasaaren silta pohjoispuoli,Ratapihantie,Viikintie,Baana,Weekday,Day,Month,Hour
0,2017,-1.0,-1.0,0.6,,11.0,8.0,14.0,,0.0,10.0,2.0,,21.0,8.0,10.0,4.0,22.0,19.0,14.0,15.0,8.0,,13.0,su,1,1,00:00
1,2017,-1.0,-1.0,0.6,,3.0,3.0,8.0,,0.0,8.0,5.0,,7.0,5.0,4.0,3.0,12.0,5.0,12.0,17.0,1.0,,2.0,su,1,1,01:00
2,2017,-1.0,-1.0,0.6,,2.0,5.0,5.0,,0.0,2.0,3.0,,6.0,5.0,4.0,4.0,4.0,5.0,6.0,5.0,5.0,,3.0,su,1,1,02:00
3,2017,-1.0,-1.0,0.6,,2.0,1.0,0.0,,0.0,0.0,2.0,,6.0,0.0,2.0,0.0,6.0,6.0,10.0,0.0,1.0,,7.0,su,1,1,03:00
4,2017,-1.0,-1.0,0.6,,1.0,0.0,2.0,,0.0,1.0,1.0,,1.0,0.0,1.0,1.0,7.0,1.0,6.0,1.0,0.0,,3.0,su,1,1,04:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2017,3.2,-1.0,1.6,2.0,3.0,3.0,4.0,10.0,0.0,8.0,4.0,5.0,10.0,8.0,4.0,3.0,3.0,13.0,10.0,1.0,4.0,5.0,11.0,su,31,12,19:00
8756,2017,3.2,-1.0,1.6,2.0,5.0,3.0,3.0,10.0,0.0,6.0,4.0,0.0,6.0,5.0,6.0,4.0,8.0,9.0,7.0,3.0,4.0,1.0,5.0,su,31,12,20:00
8757,2017,3.2,-1.0,1.6,4.0,7.0,1.0,4.0,11.0,0.0,5.0,0.0,2.0,6.0,4.0,4.0,2.0,7.0,6.0,4.0,2.0,5.0,0.0,8.0,su,31,12,21:00
8758,2017,3.2,-1.0,1.6,1.0,7.0,1.0,4.0,15.0,0.0,3.0,0.0,4.0,4.0,7.0,4.0,3.0,17.0,4.0,8.0,2.0,2.0,3.0,8.0,su,31,12,22:00


In [33]:
## alternative solution
# Lookup tables: Finnish weekday abbreviation -> English name, Finnish month prefix -> 1..12
_fi_days = "ma ti ke to pe la su".split()
_en_days = "Mon Tue Wed Thu Fri Sat Sun".split()
days = {fi: en for fi, en in zip(_fi_days, _en_days)}
_fi_months = "tammi helmi maalis huhti touko kesä heinä elo syys loka marras joulu".split()
months = {name: number for name, number in zip(_fi_months, range(1, 13))}
del _fi_days, _en_days, _fi_months

def split_date(df):
    """Turn the ``Päivämäärä`` strings of ``df`` into separate, typed columns.

    The string is split on whitespace into weekday, day, month, year and
    time. Weekday and month go through the ``days`` / ``months`` tables and
    the time is cut down to the part before the first ``":"``.

    Returns a new DataFrame (same index as ``df``) with ``Weekday`` as object
    and ``Day``, ``Month``, ``Year``, ``Hour`` as int.
    """
    pieces = df["Päivämäärä"].str.split(expand=True)
    pieces.columns = ["Weekday", "Day", "Month", "Year", "Hour"]
    translated = pieces.assign(
        Hour=pieces["Hour"].str.split(":", expand=True).iloc[:, 0],
        Weekday=pieces["Weekday"].map(days),
        Month=pieces["Month"].map(months),
    )
    return translated.astype(
        {"Weekday": object, "Day": int, "Month": int, "Year": int, "Hour": int}
    )
def split_date_continues():
    """Read the local cyclist-count CSV and replace its date string with split fields.

    All-empty rows and columns are removed, ``split_date`` is applied, and its
    columns are put in front of the remaining columns (``Päivämäärä`` dropped).
    """
    counts = pd.read_csv("src/Helsingin_pyorailijamaarat.csv", sep=";")
    counts = counts.dropna(axis=0, how="all")
    counts = counts.dropna(axis=1, how="all")
    date_fields = split_date(counts)
    station_counts = counts.drop("Päivämäärä", axis=1)
    combined = pd.concat([date_fields, station_counts], axis=1)
    return combined
def cycling_weather():
    """Merge the local weather CSV with the split cyclist counts on the date.

    Weather rows are matched to cyclist rows on year, month and day; the
    weather-side ``m``, ``d``, ``Time`` and ``Time zone`` columns are removed
    from the result.
    """
    weather = pd.read_csv("src/kumpula-weather-2017.csv")
    cycling = split_date_continues()
    merged = weather.merge(
        cycling,
        left_on=["Year", "m", "d"],
        right_on=["Year", "Month", "Day"],
    )
    return merged.drop(columns=['m', 'd', 'Time', 'Time zone'])

In [34]:
## exercise 03: top bands
def top_bands():
    """Merge the UK top-40 chart with the band table on the artist name.

    The chart's ``Artist`` column is title-cased before it is compared with
    the ``Band`` column of the band table.

    Returns
    -------
    pandas.DataFrame
        Chart columns followed by band columns, one row per match.
    """
    url_top40 = 'https://raw.githubusercontent.com/annassanchez/HY-2021-DataScience/main/week05/UK-top40-1964-1-2.tsv'
    url_bands = 'https://raw.githubusercontent.com/annassanchez/HY-2021-DataScience/main/week05/bands.tsv'

    chart = pd.read_csv(url_top40, sep='\t')
    chart['Artist'] = chart['Artist'].str.title()
    bands = pd.read_csv(url_bands, sep='\t')

    # NOTE(review): how='right' keeps every row of the band table, even one
    # without a matching chart entry (its chart columns are then NaN) --
    # confirm this is intended rather than an inner join.
    return chart.merge(bands, left_on='Artist', right_on='Band', how='right')

In [35]:
top_bands().shape

(9, 13)

In [36]:
## alternative solution
def top_bands():
    """Inner-merge the local UK top-40 chart with the band table.

    The ``Band`` names are upper-cased before being matched against the
    chart's ``Artist`` column; only rows present in both tables are returned.
    """
    chart = pd.read_csv("src/UK-top40-1964-1-2.tsv", sep="\t")
    band_table = pd.read_csv("src/bands.tsv", sep="\t")
    band_table = band_table.assign(Band=band_table["Band"].str.upper())
    return chart.merge(band_table, left_on="Artist", right_on="Band")

## Aggregates and groupings

In [37]:
wh = pd.read_csv("https://raw.githubusercontent.com/csmastersUH/data_analysis_with_python_2020/master/kumpula-weather-2017.csv")

In [38]:
## rename method -- renames the columns of a DataFrame
wh3 = wh.rename(columns={"m": "Month", "d": "Day", "Precipitation amount (mm)" : "Precipitation",
                         "Snow depth (cm)" : "Snow", "Air temperature (degC)" : "Temperature"})
wh3.head()

Unnamed: 0,Year,Month,Day,Time,Time zone,Precipitation,Snow,Temperature
0,2017,1,1,00:00,UTC,-1.0,-1.0,0.6
1,2017,1,2,00:00,UTC,4.4,-1.0,-3.9
2,2017,1,3,00:00,UTC,6.6,7.0,-6.5
3,2017,1,4,00:00,UTC,-1.0,13.0,-12.8
4,2017,1,5,00:00,UTC,-1.0,10.0,-17.8


In [39]:
## the groupby method splits the DataFrame into groups -- here, one group per month
groups = wh3.groupby("Month")
groups

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fbb96cce910>

In [40]:
len(groups)

12

In [41]:
for key, group in groups:
    print(key, len(group))

1 31
2 28
3 31
4 30
5 31
6 30
7 31
8 31
9 30
10 31
11 30
12 31


In [42]:
groups.get_group(2)                 # Group with index two is February

Unnamed: 0,Year,Month,Day,Time,Time zone,Precipitation,Snow,Temperature
31,2017,2,1,00:00,UTC,1.5,4.0,-0.6
32,2017,2,2,00:00,UTC,0.2,5.0,-0.8
33,2017,2,3,00:00,UTC,-1.0,6.0,-0.2
34,2017,2,4,00:00,UTC,2.7,6.0,0.4
35,2017,2,5,00:00,UTC,-1.0,7.0,-2.5
36,2017,2,6,00:00,UTC,-1.0,7.0,-7.3
37,2017,2,7,00:00,UTC,-1.0,8.0,-12.1
38,2017,2,8,00:00,UTC,-1.0,8.0,-8.8
39,2017,2,9,00:00,UTC,-1.0,8.0,-10.1
40,2017,2,10,00:00,UTC,-1.0,8.0,-8.3


In [43]:
## a groupby object behaves much like a DataFrame, so some operations, such as selecting a column, are allowed:
groups["Temperature"]

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fbb963dd1d0>

In [44]:
## aggregation methods are available as well
## the mean is computed for each group, and the results are automatically combined
groups["Temperature"].mean()

Month
1     -2.316129
2     -2.389286
3      0.983871
4      2.676667
5      9.783871
6     13.726667
7     16.035484
8     16.183871
9     11.826667
10     5.454839
11     3.950000
12     1.741935
Name: Temperature, dtype: float64

In [45]:
## let's try with other aggregation
groups["Precipitation"].sum()

Month
1      26.9
2      21.0
3      29.7
4      26.9
5      -5.9
6      59.3
7      14.2
8      70.1
9      51.2
10    173.5
11    117.2
12    133.6
Name: Precipitation, dtype: float64

In [46]:
## the -1 entries in Precipitation drag the monthly sums down (May even comes out negative above),
## so replace them with 0 -- and do the same for Snow
## NOTE(review): -1 presumably encodes "nothing measurable" in the source data -- confirm
wh4 = wh3.copy()  # work on a copy so wh3 keeps the original values
wh4.loc[wh4.Precipitation == -1, "Precipitation"] = 0
wh4.loc[wh4.Snow == -1, "Snow"] = 0
wh4.head()

Unnamed: 0,Year,Month,Day,Time,Time zone,Precipitation,Snow,Temperature
0,2017,1,1,00:00,UTC,0.0,0.0,0.6
1,2017,1,2,00:00,UTC,4.4,0.0,-3.9
2,2017,1,3,00:00,UTC,6.6,7.0,-6.5
3,2017,1,4,00:00,UTC,0.0,13.0,-12.8
4,2017,1,5,00:00,UTC,0.0,10.0,-17.8


In [47]:
wh4.groupby("Month")["Precipitation"].sum()

Month
1      38.9
2      35.0
3      41.7
4      39.9
5      16.1
6      76.3
7      31.2
8      86.1
9      65.2
10    184.5
11    120.2
12    140.6
Name: Precipitation, dtype: float64

### other ways to operate on groups

In [48]:
## filtering -- some of the groups are filtered out
def myfilter(df):
    """Tell whether a group's total precipitation reaches 150.

    Meant for ``groupby(...).filter``, which needs a single boolean per group.
    """
    total_precipitation = df["Precipitation"].sum()
    return total_precipitation >= 150

wh4.groupby("Month").filter(myfilter)                 # Filter out months with total precipitation less than 150 mm

Unnamed: 0,Year,Month,Day,Time,Time zone,Precipitation,Snow,Temperature
273,2017,10,1,00:00,UTC,0.0,0.0,9.1
274,2017,10,2,00:00,UTC,6.4,0.0,9.2
275,2017,10,3,00:00,UTC,21.5,0.0,8.3
276,2017,10,4,00:00,UTC,12.7,0.0,11.2
277,2017,10,5,00:00,UTC,0.6,0.0,8.8
278,2017,10,6,00:00,UTC,0.7,0.0,7.7
279,2017,10,7,00:00,UTC,11.7,0.0,8.1
280,2017,10,8,00:00,UTC,14.1,0.0,9.3
281,2017,10,9,00:00,UTC,18.3,0.0,8.6
282,2017,10,10,00:00,UTC,24.2,0.0,8.1


In [49]:
## transformation -- each group's df is manipulated in a way that retains its shape
## here every measurement has its own month's mean subtracted; the first three
## columns of wh4 (Year, Month, Day) are carried over unchanged
pd.concat([wh4.iloc[:, 0:3],
           wh4.groupby("Month")[["Precipitation", "Snow", "Temperature"]].transform(lambda x : x - x.mean())],
          axis=1)

Unnamed: 0,Year,Month,Day,Precipitation,Snow,Temperature
0,2017,1,1,-1.254839,-6.903226,2.916129
1,2017,1,2,3.145161,-6.903226,-1.583871
2,2017,1,3,5.345161,0.096774,-4.183871
3,2017,1,4,-1.254839,6.096774,-10.483871
4,2017,1,5,-1.254839,3.096774,-15.483871
...,...,...,...,...,...,...
360,2017,12,27,-3.435484,-1.483871,2.058065
361,2017,12,28,-0.835484,-1.483871,1.058065
362,2017,12,29,3.264516,-1.483871,2.058065
363,2017,12,30,-0.435484,-1.483871,0.758065


In [50]:
## apply -- applies a function and returns a df, series or a scalar
wh4.groupby("Month").apply(lambda df : df.sort_values("Temperature"))

Unnamed: 0_level_0,Unnamed: 1_level_0,Year,Month,Day,Time,Time zone,Precipitation,Snow,Temperature
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,4,2017,1,5,00:00,UTC,0.0,10.0,-17.8
1,5,2017,1,6,00:00,UTC,0.3,10.0,-17.8
1,3,2017,1,4,00:00,UTC,0.0,13.0,-12.8
1,2,2017,1,3,00:00,UTC,6.6,7.0,-6.5
1,15,2017,1,16,00:00,UTC,0.0,8.0,-4.2
...,...,...,...,...,...,...,...,...,...
12,360,2017,12,27,00:00,UTC,1.1,0.0,3.8
12,362,2017,12,29,00:00,UTC,7.8,0.0,3.8
12,342,2017,12,9,00:00,UTC,0.2,0.0,4.2
12,336,2017,12,3,00:00,UTC,7.2,0.0,5.0
