# Pandas (continues)

In [3]:
import sys

import numpy as np
import pandas as pd

## Catenating datasets

In [4]:
# axis = 0 catenates vertically and axis = 1 catenates horizontally.
# pandas DataFrames work similarly, but the row indices and the column names require extra attention.
# That is also the main difference between np.concatenate and pd.concat.
## helper for creating dataframes
def makedf(cols, ind):
    """Create a small demo DataFrame whose cells spell out their own position.

    Args:
        cols: iterable of column labels (a string yields one column per character).
        ind: iterable of row labels, also used as the index of the result.

    Returns:
        DataFrame indexed by ``ind`` in which the cell at (row, column) is the
        column label followed by the row label, e.g. "A0".
    """
    table = {}
    for label in cols:
        prefix = str(label)
        table[label] = [prefix + str(row) for row in ind]
    return pd.DataFrame(table, index=ind)

In [5]:
a=makedf("AB", [0,1])
a

Unnamed: 0,A,B
0,A0,B0
1,A1,B1


In [6]:
b=makedf("AB", [2,3])
b

Unnamed: 0,A,B
2,A2,B2
3,A3,B3


In [7]:
c=makedf("CD", [0,1])
c

Unnamed: 0,C,D
0,C0,D0
1,C1,D1


In [8]:
d=makedf("BC", [2,3])
d

Unnamed: 0,B,C
2,B2,C2
3,B3,C3


In [9]:
## concat a and b -- works as expected
pd.concat([a,b])   # The default axis is 0

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


In [10]:
# Catenating a with itself keeps the original row labels, so 0 and 1 each appear twice in r
r=pd.concat([a,a])
r

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
0,A0,B0
1,A1,B1


In [11]:
# Row label 0 occurs twice in r, so this lookup returns a Series with both matches instead of a single value
r.loc[0,"A"]

0    A0
0    A0
Name: A, dtype: object

In [12]:
## indices should be unique, so duplicated row labels are not what we want.
## here's a tool to check for duplicated indices
try:
    # verify_integrity=True makes concat raise instead of returning repeated row labels
    pd.concat([a, a], verify_integrity=True)
except ValueError as err:
    # The error is the point of this cell: show its message without a traceback
    print(err, file=sys.stderr)

Indexes have overlapping values: Int64Index([0, 1], dtype='int64')


In [13]:
## automatic renumbering of rows
pd.concat([a,a], ignore_index=True)

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A0,B0
3,A1,B1


In [14]:
## hierarchical indexing -- multiple level indices
r2=pd.concat([a,a], keys=['first', 'second'])
r2

Unnamed: 0,Unnamed: 1,A,B
first,0,A0,B0
first,1,A1,B1
second,0,A0,B0
second,1,A1,B1


In [15]:
r2["A"]["first"][0] # works as numpy

'A0'

In [16]:
## it works similarly with horizontal catenation
pd.concat([a,c], axis=1)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1


In [17]:
## if you vertically concatenate two DataFrames that don't have the same columns,
## the default is an outer join: all the columns are kept (the common ones and the different ones) and the gaps are filled with missing values
pd.concat([a,d], sort=False)    # sort option is used to silence a deprecation message

Unnamed: 0,A,B,C
0,A0,B0,
1,A1,B1,
2,,B2,C2
3,,B3,C3


In [18]:
## here is the same catenation but with inner join
pd.concat([a,d], join="inner")

Unnamed: 0,B
0,B0
1,B1
2,B2
3,B3


In [51]:
## exercise 01 (split date continues)
#url = "https://raw.githubusercontent.com/annassanchez/HY-2021-DataScience/main/week04/Helsingin_pyorailijamaarat.csv"
#kk = pd.read_csv(url, sep = ';', header = 0)

def split_date(df):
    """Split the Finnish 'Päivämäärä' timestamp text into separate date fields.

    Rows and columns that are entirely missing are discarded first. Each
    remaining 'Päivämäärä' value is split on whitespace into weekday, day,
    month, year and time of day. Finnish weekday abbreviations are replaced
    by English ones, Finnish month names by their month number, and only the
    hour part of the time is kept.

    Args:
        df: DataFrame containing a text column named 'Päivämäärä'. The column
            may be at any position; the frame passed in is not modified.

    Returns:
        A new DataFrame with the columns Weekday (object), Day, Month, Year
        and Hour (int), followed by 'Päivämäärä' rebuilt as the text
        "Weekday Day Month Year Hour".
    """
    weekday_names = {"ma": "Mon", "ti": "Tue", "ke": "Wed", "to": "Thu",
                     "pe": "Fri", "la": "Sat", "su": "Sun"}
    month_numbers = {"tammi": "1", "helmi": "2", "maalis": "3", "huhti": "4",
                     "touko": "5", "kesä": "6", "heinä": "7", "elo": "8",
                     "syys": "9", "loka": "10", "marras": "11", "joulu": "12"}

    cleaned = df.dropna(axis=0, how="all").dropna(axis=1, how="all")

    # Build the result in a frame of its own rather than adding columns to the
    # dropna() result; the date column is then found by name, not by position.
    parts = cleaned["Päivämäärä"].str.split(expand=True)
    parts.columns = ["Weekday", "Day", "Month", "Year", "Hour"]
    parts["Weekday"] = parts["Weekday"].replace(weekday_names)
    parts["Month"] = parts["Month"].replace(month_numbers)
    # "HH:MM" -> HH as an integer, which also strips the leading zero
    parts["Hour"] = parts["Hour"].str.split(":", expand=True)[0].map(int)
    parts["Päivämäärä"] = (
        parts["Weekday"] + " " + parts["Day"] + " " + parts["Month"]
        + " " + parts["Year"] + " " + parts["Hour"].map(str)
    )
    return parts.astype({"Weekday": object, "Day": int, "Month": int, "Year": int, "Hour": int})

In [63]:
def split_date_continues():
    """Load the Helsinki cyclist counts and replace the date text by its parts.

    Downloads the CSV, removes rows and columns that are entirely empty, and
    puts the fields produced by ``split_date`` in front of the remaining
    columns of the cleaned table.

    Returns:
        DataFrame whose leading columns come from ``split_date`` and whose
        remaining columns are those of the cleaned CSV, with every column
        labelled 'Päivämäärä' removed.
    """
    url = "https://raw.githubusercontent.com/annassanchez/HY-2021-DataScience/main/week04/Helsingin_pyorailijamaarat.csv"
    raw = pd.read_csv(url, sep=';', header=0)
    # Clean once and hand the cleaned frame to split_date so that both frames
    # share the same row labels when they are placed side by side.
    cleaned = raw.dropna(axis=0, how='all').dropna(axis=1, how='all')
    date_parts = split_date(cleaned)
    combined = pd.concat([date_parts, cleaned], axis=1)
    # Remove the date text by name (not by column position); this drops every
    # column carrying that label, whichever of the two frames it came from.
    return combined.drop(columns='Päivämäärä')


In [53]:
url = "https://raw.githubusercontent.com/annassanchez/HY-2021-DataScience/main/week04/Helsingin_pyorailijamaarat.csv"
df = pd.read_csv(url, sep = ';', header = 0)
split_date(df)

Unnamed: 0,Weekday,Day,Month,Year,Hour,Päivämäärä
0,Wed,1,1,2014,0,Wed 1 1 2014 0
1,Wed,1,1,2014,1,Wed 1 1 2014 1
2,Wed,1,1,2014,2,Wed 1 1 2014 2
3,Wed,1,1,2014,3,Wed 1 1 2014 3
4,Wed,1,1,2014,4,Wed 1 1 2014 4
...,...,...,...,...,...,...
37123,Tue,27,3,2018,19,Tue 27 3 2018 19
37124,Tue,27,3,2018,20,Tue 27 3 2018 20
37125,Tue,27,3,2018,21,Tue 27 3 2018 21
37126,Tue,27,3,2018,22,Tue 27 3 2018 22


In [64]:
df = split_date_continues()
print("Shape:", df.shape)
print("Column names:\n", df.columns)
print(df.head())

Shape: (37128, 25)
Column names:
 Index(['Weekday', 'Day', 'Month', 'Year', 'Hour', 'Auroransilta',
       'Eteläesplanadi', 'Huopalahti (asema)', 'Kaisaniemi/Eläintarhanlahti',
       'Kaivokatu', 'Kulosaaren silta et.', 'Kulosaaren silta po. ',
       'Kuusisaarentie', 'Käpylä, Pohjoisbaana',
       'Lauttasaaren silta eteläpuoli', 'Merikannontie',
       'Munkkiniemen silta eteläpuoli', 'Munkkiniemi silta pohjoispuoli',
       'Heperian puisto/Ooppera', 'Pitkäsilta itäpuoli',
       'Pitkäsilta länsipuoli', 'Lauttasaaren silta pohjoispuoli',
       'Ratapihantie', 'Viikintie', 'Baana'],
      dtype='object')
  Weekday  Day  Month  ...  Ratapihantie  Viikintie  Baana
0     Wed    1      1  ...           NaN        NaN    8.0
1     Wed    1      1  ...           NaN        NaN    4.0
2     Wed    1      1  ...           NaN        NaN   11.0
3     Wed    1      1  ...           NaN        NaN    3.0
4     Wed    1      1  ...           NaN        NaN    4.0

[5 rows x 25 columns]


In [66]:
## alternative solution
# Finnish weekday abbreviation -> English abbreviation
days = {
    "ma": "Mon", "ti": "Tue", "ke": "Wed", "to": "Thu",
    "pe": "Fri", "la": "Sat", "su": "Sun",
}
# Finnish month name -> month number (1-12)
months = {
    name: number
    for number, name in enumerate(
        ["tammi", "helmi", "maalis", "huhti", "touko", "kesä",
         "heinä", "elo", "syys", "loka", "marras", "joulu"],
        start=1,
    )
}

def split_date(df):
    """Break the 'Päivämäärä' text column into typed date fields.

    Args:
        df: DataFrame with a text column 'Päivämäärä' whose values consist of
            five whitespace-separated parts: weekday, day, month, year, time.

    Returns:
        A new DataFrame with the columns Weekday (object, English
        abbreviation), Day, Month, Year and Hour (int), indexed like ``df``.
    """
    fields = df["Päivämäärä"].str.split(expand=True)
    fields.columns = ["Weekday", "Day", "Month", "Year", "Hour"]
    translated = fields.assign(
        # keep only the part of the time before the colon
        Hour=fields["Hour"].str.split(":", expand=True).iloc[:, 0],
        Weekday=fields["Weekday"].map(days),
        Month=fields["Month"].map(months),
    )
    target_types = {"Weekday": object, "Day": int, "Month": int, "Year": int, "Hour": int}
    return translated.astype(target_types)
def split_date_continues():
    """Read the cyclist-count CSV and swap its date text for separate date fields.

    Returns:
        DataFrame with the columns returned by ``split_date`` first, followed
        by the CSV's remaining columns once all-empty rows and columns and the
        original 'Päivämäärä' column have been removed.
    """
    source = "https://raw.githubusercontent.com/annassanchez/HY-2021-DataScience/main/week04/Helsingin_pyorailijamaarat.csv"
    raw = pd.read_csv(source, sep=';', header=0)
    counts = raw.dropna(axis=0, how="all")
    counts = counts.dropna(axis=1, how="all")
    date_fields = split_date(counts)
    measurements = counts.drop("Päivämäärä", axis=1)
    return pd.concat([date_fields, measurements], axis=1)


df = split_date_continues()
print("Shape:", df.shape)
print("Column names:\n", df.columns)
print(df.head())

Shape: (37128, 25)
Column names:
 Index(['Weekday', 'Day', 'Month', 'Year', 'Hour', 'Auroransilta',
       'Eteläesplanadi', 'Huopalahti (asema)', 'Kaisaniemi/Eläintarhanlahti',
       'Kaivokatu', 'Kulosaaren silta et.', 'Kulosaaren silta po. ',
       'Kuusisaarentie', 'Käpylä, Pohjoisbaana',
       'Lauttasaaren silta eteläpuoli', 'Merikannontie',
       'Munkkiniemen silta eteläpuoli', 'Munkkiniemi silta pohjoispuoli',
       'Heperian puisto/Ooppera', 'Pitkäsilta itäpuoli',
       'Pitkäsilta länsipuoli', 'Lauttasaaren silta pohjoispuoli',
       'Ratapihantie', 'Viikintie', 'Baana'],
      dtype='object')
  Weekday  Day  Month  ...  Ratapihantie  Viikintie  Baana
0     Wed    1      1  ...           NaN        NaN    8.0
1     Wed    1      1  ...           NaN        NaN    4.0
2     Wed    1      1  ...           NaN        NaN   11.0
3     Wed    1      1  ...           NaN        NaN    3.0
4     Wed    1      1  ...           NaN        NaN    4.0

[5 rows x 25 columns]
