# Pandas

In [1]:
import pandas as pd
import numpy as np

## Creation of dataframes
### Creating DataFrames from a NumPy array

In [2]:
df=pd.DataFrame(np.random.randn(2,3), columns=["First", "Second", "Third"], index=["a", "b"])
df

Unnamed: 0,First,Second,Third
a,0.88096,-1.059179,0.194803
b,-0.68286,-0.949791,-0.876984


In [3]:
## asign columns and rows with the Index object
df.index                            # These are the "row names"

Index(['a', 'b'], dtype='object')

In [4]:
df.columns                          # These are the "column names"

Index(['First', 'Second', 'Third'], dtype='object')

In [5]:
# columns or index argument is left out, when an implicit integer index is used:
df2=pd.DataFrame(np.random.randn(2,3), index=["a", "b"])
df2

Unnamed: 0,0,1,2
a,-1.328372,-0.021633,-2.271863
b,1.095543,1.06522,-1.142138


In [6]:
## the column index is ab object as the range type:
df2.columns

RangeIndex(start=0, stop=3, step=1)

### Create Dataframe from columns

In [7]:
## columns can be specified as a list, an array, or a Series. column names are given with the `columns` parameter -- for series, the `name`attribute
s1 = pd.Series([1,2,3])
s1

0    1
1    2
2    3
dtype: int64

In [8]:
s2 = pd.Series([4,5,6], name="b")
s2

0    4
1    5
2    6
Name: b, dtype: int64

In [9]:
## give the column name explicitly:
pd.DataFrame(s1, columns=["a"])

Unnamed: 0,a
0,1
1,2
2,3


In [10]:
## using the name of a series: 
pd.DataFrame(s2)

Unnamed: 0,b
0,4
1,5
2,6


### Creating dataframes from rows

In [11]:
df=pd.DataFrame([{"Wage" : 1000, "Name" : "Jack", "Age" : 21}, {"Wage" : 1500, "Name" : "John", "Age" : 29}])
df

Unnamed: 0,Wage,Name,Age
0,1000,Jack,21
1,1500,John,29


In [48]:
df = pd.DataFrame([[1000, "Jack", 21], [1500, "John", 29]], columns=["Wage", "Name", "Age"])
df

Unnamed: 0,Wage,Name,Age
0,1000,Jack,21
1,1500,John,29


In [13]:
## exercise 01: cities
def cities():  
    indices = ['Helsinki', 'Espoo', 'Tampere', 'Vantaa', 'Oulu']  
    population = [643272, 279044, 231853, 223027, 201810]
    total_area = [715.48, 528.03, 689.59, 240.35, 3817.52]
    s_population = pd.Series(population, index = indices)
    s_total_area = pd.Series(total_area, index = indices)
    df = pd.DataFrame({"Population" : s_population, "Total area" : s_total_area})
    return df

In [14]:
## another solution:
def cities():
    a=[[643272, 715.48],
       [279044, 528.03],
       [231853, 689.59],
       [223027, 240.35],
       [201810, 3817.52]]
    cols=["Population", "Total area"]
    ind=["Helsinki", "Espoo", "Tampere", "Vantaa", "Oulu"]
    df = pd.DataFrame(a, index=ind, columns=cols)
    return df

In [15]:
cities()

Unnamed: 0,Population,Total area
Helsinki,643272,715.48
Espoo,279044,528.03
Tampere,231853,689.59
Vantaa,223027,240.35
Oulu,201810,3817.52


In [16]:
## exercise 02: power of a series
def powers_of_series(s, k):
    df_power = pd.DataFrame()
    for i in range(1,k+1):
        s_ = pd.Series((s.values) ** i, name = i)
        if i==1:
            df_power = pd.DataFrame(s_)
        if i != 1:
            df_power[i] = s_
    return df_power

In [17]:
s = pd.Series([1,2,3,4], index=list("abcd"))
print(powers_of_series(s, 3))

   1   2   3
0  1   1   1
1  2   4   8
2  3   9  27
3  4  16  64


In [18]:
##actual solution
def powers_of_series(s, k):
    c=[ s**i for i in range(1,k+1) ]
    df = pd.DataFrame(dict(zip(range(1,k+1), c)))
    return df

In [19]:
## exercise 03: municipal imformation
df = pd.read_csv("municipal.tsv", sep = '\t' )
print(f'Shape: {df.shape[0]}, {df.shape[1]}')
print("Columns:")
for column in df.columns:
    print(column)

Shape: 490, 7
Columns:
Region 2018
Population
Population change from the previous year, %
Share of Swedish-speakers of the population, %
Share of foreign citizens of the population, %
Proportion of the unemployed among the labour force, %
Proportion of pensioners of the population, %


In [20]:
df = pd.read_csv("municipal.tsv", sep="\t")
print("Shape: {}, {}".format(*df.shape))
print("Columns:")
for name in df.columns:
    print(name)

Shape: 490, 7
Columns:
Region 2018
Population
Population change from the previous year, %
Share of Swedish-speakers of the population, %
Share of foreign citizens of the population, %
Proportion of the unemployed among the labour force, %
Proportion of pensioners of the population, %


## accessing columns and rows of DataFrame

In [21]:
## for accessing information on the dataframe, the [] notation will not work as it does with arrays
try:
    df[0]
except KeyError:
    import sys
    print("Key error", file=sys.stderr)

Key error


In [24]:
## you need to say the explicit indices for the column contain that integer. this will work:
df["Wage"]

0    1000
1    1500
Name: Wage, dtype: int64

In [25]:
## also fancy indexing words:
df[["Wage", "Name"]]

Unnamed: 0,Wage,Name
0,1000,Jack
1,1500,John


In [26]:
## if one indexes with a slice or boolean mask, then the rows are referred to:
df[0:1]                           # slice

Unnamed: 0,Wage,Name,Age
0,1000,Jack,21


In [27]:
df[df.Wage > 1200]               # boolean mask

Unnamed: 0,Wage,Name,Age
1,1500,John,29


In [28]:
## for getting a single element of the dataframe, you chain the bracket calls
df["Wage"][1]                    # Note order of dimensions

1500

In [29]:
##exercise 04: municipalities of finland
def municipalities_of_finland():
    municipalities = pd.read_csv("municipal.tsv", sep="\t", index_col = 'Region 2018')[1:312]
    print(municipalities.shape)
    return municipalities

In [30]:
municipalities_of_finland()

(311, 6)


Unnamed: 0_level_0,Population,"Population change from the previous year, %","Share of Swedish-speakers of the population, %","Share of foreign citizens of the population, %","Proportion of the unemployed among the labour force, %","Proportion of pensioners of the population, %"
Region 2018,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Akaa,16769,-0.9,0.2,1.6,14.6,26.1
Alajärvi,9831,-0.7,0.1,1.9,13.9,32.0
Alavieska,2610,-1.1,0.2,0.6,10.8,28.4
Alavus,11713,-1.6,0.1,1.1,11.3,31.5
Asikkala,8248,-0.9,0.2,1.6,12.0,35.5
...,...,...,...,...,...,...
Ylivieska,15251,0.3,0.3,1.2,13.3,23.1
Ylöjärvi,32878,0.2,0.3,1.2,11.7,20.3
Ypäjä,2372,-0.4,0.7,1.9,13.2,31.4
Ähtäri,5906,-1.3,0.1,0.9,13.0,35.1


In [31]:
## alternative solution
def municipalities_of_finland():
    df = pd.read_csv("municipal.tsv", sep="\t", index_col=0)
    return df["Akaa":"Äänekoski"]

In [32]:
## exercise 05: swedish and foreigners
def swedish_and_foreigners():
    municipalities = pd.read_csv("municipal.tsv", sep="\t", index_col = 'Region 2018')[1:312]
    swedish = municipalities[(municipalities['Share of Swedish-speakers of the population, %'] > 5) & (municipalities['Share of foreign citizens of the population, %'] > 5)]
    swedish = swedish[['Population', 'Share of Swedish-speakers of the population, %', 'Share of foreign citizens of the population, %']]
    return swedish

In [33]:
swedish_and_foreigners()#.value_counts('Share of Swedish-speakers of the population, %')

Unnamed: 0_level_0,Population,"Share of Swedish-speakers of the population, %","Share of foreign citizens of the population, %"
Region 2018,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Brändö,452,72.6,8.4
Eckerö,948,89.7,11.5
Espoo,279044,7.2,10.5
Finström,2580,89.8,10.5
Föglö,532,84.2,17.3
Geta,495,86.9,13.5
Hammarland,1547,89.7,11.6
Helsinki,643272,5.7,9.5
Jomala,4859,89.1,8.5
Kaskinen,1274,29.9,5.3


In [35]:
## another solution:
def swedish_and_foreigners():
    df = pd.read_csv("municipal.tsv", index_col=0, sep="\t")
    df = df["Akaa":"Äänekoski"]
    m = ((df["Share of Swedish-speakers of the population, %"] > 5.0) &
         (df["Share of foreign citizens of the population, %"] > 5.0))
    df = df[m]
    df = df[["Population", "Share of Swedish-speakers of the population, %",
             "Share of foreign citizens of the population, %"]]
    return df

In [39]:
## exercise 06: growing municipalities
def growing_municipalities(df):
    municipalities = pd.read_csv("municipal.tsv", sep="\t", index_col = 'Region 2018')[1:312]
    growing = municipalities[municipalities['Population change from the previous year, %'] > 0]
    return growing[['Population change from the previous year, %']]

In [41]:
print(f"Proportion of growing municipalities: {0:3.1f}%".format(growing_municipalities(df)))

Proportion of growing municipalities: 0.0%


In [45]:
def growing_municipalities(df):
    m = df[df['Population change from the previous year, %']>0]
    return m.shape[0]/df.shape[0]
    
filename = "municipal.tsv"
df = pd.read_csv(filename,sep='\t')
statement = "Proportion of growing municipalities:" 
proportion = "{:.1f}%".format(growing_municipalities(df)*100)
print(statement,proportion)

Proportion of growing municipalities: 21.6%


In [43]:
## alternative solution

def growing_municipalities(df):
    c="Population change from the previous year, %"
    n = len(df)
    k = sum(df[c] > 0.0)
    return k / n

def main():
    df = pd.read_csv("src/municipal.tsv", index_col=0, sep="\t")
    df = df["Akaa":"Äänekoski"]
    proportion = growing_municipalities(df)
    print(f"Proportion of growing municipalities: {proportion:.1%}")

## Alternative indexing data selection

In [50]:
df.loc[1, "Wage"]

1500

In [49]:
df.iloc[-1,-1] #right lower corner of the DataFrame

29

In [51]:
df.loc[1, ["Name", "Wage"]]

Name    John
Wage    1500
Name: 1, dtype: object

In [57]:
## exercise 07: subsetting with loc
def subsetting_with_loc():
    df = pd.read_csv("municipal.tsv", index_col=0, sep="\t")
    df = df["Akaa":"Äänekoski"]
    df = df.loc[:, ['Population', 'Share of Swedish-speakers of the population, %', 'Share of foreign citizens of the population, %']]
    return df

In [58]:
subsetting_with_loc()

Unnamed: 0_level_0,Population,"Share of Swedish-speakers of the population, %","Share of foreign citizens of the population, %"
Region 2018,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Akaa,16769,0.2,1.6
Alajärvi,9831,0.1,1.9
Alavieska,2610,0.2,0.6
Alavus,11713,0.1,1.1
Asikkala,8248,0.2,1.6
...,...,...,...
Ylivieska,15251,0.3,1.2
Ylöjärvi,32878,0.3,1.2
Ypäjä,2372,0.7,1.9
Ähtäri,5906,0.1,0.9


In [59]:
## alternative solution
def subsetting_with_loc():
    df = pd.read_csv("municipal.tsv", index_col=0, sep="\t")
    df = df.loc["Akaa":"Äänekoski", ["Population", "Share of Swedish-speakers of the population, %", "Share of foreign citizens of the population, %"]]
    return df

In [75]:
## exercise 08: subsetting by positions
def subsetting_by_positions():
    df = pd.read_csv("UK-top40-1964-1-2.tsv", index_col=0, sep="\t")
    df = df.iloc[:10, [1,2]]
    return df

In [76]:
subsetting_by_positions()

Unnamed: 0_level_0,Title,Artist
Pos,Unnamed: 1_level_1,Unnamed: 2_level_1
1,I WANT TO HOLD YOUR HAND,THE BEATLES
2,GLAD ALL OVER,THE DAVE CLARK FIVE
3,SHE LOVES YOU,THE BEATLES
4,YOU WERE MADE FOR ME,FREDDIE AND THE DREAMERS
5,TWENTY FOUR HOURS FROM TULSA,GENE PITNEY
6,I ONLY WANT TO BE WITH YOU,DUSTY SPRINGFIELD
7,DOMINIQUE,THE SINGING NUN
8,MARIA ELENA,LOS INDIOS TABAJARAS
9,SECRET LOVE,KATHY KIRBY
10,DON'T TALK TO HIM,CLIFF RICHARD


In [72]:
## alternative solution
def subsetting_by_positions():
    df = pd.read_csv("UK-top40-1964-1-2.tsv", sep="\t")
    return df.iloc[:10,2:4]

## Summary statistics

In [77]:
wh = pd.read_csv("https://raw.githubusercontent.com/csmastersUH/data_analysis_with_python_2020/master/kumpula-weather-2017.csv")

In [78]:
wh2 = wh.drop(["Year", "m", "d"], axis=1)  # taking averages over these is not very interesting
wh2.mean()

  wh2.mean()


Precipitation amount (mm)    1.966301
Snow depth (cm)              0.966480
Air temperature (degC)       6.527123
dtype: float64

In [79]:
##the describe method gives summary stadistics for each numeric columns in a dataframe format.
wh.describe()

Unnamed: 0,Year,m,d,Precipitation amount (mm),Snow depth (cm),Air temperature (degC)
count,365.0,365.0,365.0,365.0,358.0,365.0
mean,2017.0,6.526027,15.720548,1.966301,0.96648,6.527123
std,0.0,3.452584,8.808321,4.858423,3.717472,7.183934
min,2017.0,1.0,1.0,-1.0,-1.0,-17.8
25%,2017.0,4.0,8.0,-1.0,-1.0,1.2
50%,2017.0,7.0,16.0,0.2,-1.0,4.8
75%,2017.0,10.0,23.0,2.7,0.0,12.9
max,2017.0,12.0,31.0,35.0,15.0,19.6


In [None]:
## exercise 09: snow depth

In [143]:
def snow_depth(): 
    df = pd.read_csv('kumpula-weather-2017.csv', sep = ',')    
    snow = df.loc[:, ['Snow depth (cm)']]
    max_snow = snow.describe()
    return max_snow.iloc[-1,-1]

In [146]:
proportion = snow_depth()
print(f"Max snow depth: {proportion:.1f}")

Proportion of growing municipalities: 15.0


In [None]:
## alternative solution
def snow_depth():
    df = pd.read_csv("src/kumpula-weather-2017.csv")
    return df["Snow depth (cm)"].max()

In [154]:
## exercise 10: average temperature in july
def average_temperature():
    df = pd.read_csv("kumpula-weather-2017.csv")
    july_temp = df.loc[df['m'] == 7,:]
    avg_july_temp = july_temp['Air temperature (degC)'].mean()
    return avg_july_temp

In [155]:
proportion = average_temperature()
print(f"Average temperature in July: {proportion:.1f}")

Max snow depth: 16.0


In [156]:
## alternative solution
def average_temperature():
    df = pd.read_csv("src/kumpula-weather-2017.csv", sep=",")
    m = df["m"] == 7
    return df[m]["Air temperature (degC)"].mean()

In [162]:
## exercise 11: below zero
def below_zero():
    df = pd.read_csv("kumpula-weather-2017.csv", sep=",")
    below_zero_df = df.loc[df['Air temperature (degC)'] < 0]
    below_zero_days = below_zero_df.shape[0]
    return below_zero_days

In [165]:
print(f"Number of days below zero: {below_zero():.0f}")

Number of days below zero: 49


In [166]:
## alternative solution
def below_zero():
    df = pd.read_csv("kumpula-weather-2017.csv")
    return sum(df["Air temperature (degC)"] < 0.0)

## Missing data

In [167]:
#if we check the unique values of the column, we see some nan values
wh["Snow depth (cm)"].unique()

array([-1.,  7., 13., 10., 12.,  9.,  8.,  5.,  6.,  4.,  3., 15., 14.,
        2., nan,  0.])

In [None]:
# the floeat type accepts nan in additionto normal floating point numbers. THis value represents the results from illegal operations -- as 0/0

In [168]:
pd.Series([1,3,2])

0    1
1    3
2    2
dtype: int64

In [169]:
pd.Series([1,3,2, np.nan])

0    1.0
1    3.0
2    2.0
3    NaN
dtype: float64

In [None]:
## for non-numeric types the special value None is used to denote a missing value, the dtype is promoted to object

In [170]:
pd.Series(["jack", "joe", None])

0    jack
1     joe
2    None
dtype: object

In [171]:
##pandas excludes the missing values from summary statistics -- but we have functions to handle null values
wh.isnull()      # returns a boolean mask DataFrame

Unnamed: 0,Year,m,d,Time,Time zone,Precipitation amount (mm),Snow depth (cm),Air temperature (degC)
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
360,False,False,False,False,False,False,False,False
361,False,False,False,False,False,False,False,False
362,False,False,False,False,False,False,False,False
363,False,False,False,False,False,False,False,False


In [172]:
# combining it with the any method, you can mask the null rows
# notnull is inverse from isnull
wh[wh.isnull().any(axis=1)]

Unnamed: 0,Year,m,d,Time,Time zone,Precipitation amount (mm),Snow depth (cm),Air temperature (degC)
74,2017,3,16,00:00,UTC,1.8,,3.4
163,2017,6,13,00:00,UTC,0.6,,12.6
308,2017,11,5,00:00,UTC,0.2,,8.4
309,2017,11,6,00:00,UTC,2.0,,7.5
313,2017,11,10,00:00,UTC,3.6,,7.2
321,2017,11,18,00:00,UTC,11.3,,5.9
328,2017,11,25,00:00,UTC,8.5,,4.2


In [173]:
#  dropna drops columns that contain missing values from the dataframe
# the drop and tthresh parameters allow to specify how many values need to be missing, in order to drop the element
wh.dropna().shape   # Default axis is 0

(358, 8)

In [174]:
#depending on the axis, you drop the ros (0) or columns (1)
wh.dropna(axis=1).shape # Drops the columns containing missing values

(365, 7)

In [175]:
# fillna fills the missing values with constant or interpolated values
# ffill: use the previous value to fill the current value
# bfill: use the next value to fill the current value
wh = wh.fillna(method='ffill')
wh[wh.isnull().any(axis=1)]

Unnamed: 0,Year,m,d,Time,Time zone,Precipitation amount (mm),Snow depth (cm),Air temperature (degC)
