# Pandas

In [1]:
import pandas as pd
import numpy as np

## Creation of dataframes
### Creating DataFrames from a NumPy array

In [2]:
df=pd.DataFrame(np.random.randn(2,3), columns=["First", "Second", "Third"], index=["a", "b"])
df

Unnamed: 0,First,Second,Third
a,-0.349441,0.74243,-1.625606
b,-1.510291,-2.841044,-0.559524


In [3]:
## asign columns and rows with the Index object
df.index                            # These are the "row names"

Index(['a', 'b'], dtype='object')

In [4]:
df.columns                          # These are the "column names"

Index(['First', 'Second', 'Third'], dtype='object')

In [5]:
# columns or index argument is left out, when an implicit integer index is used:
df2=pd.DataFrame(np.random.randn(2,3), index=["a", "b"])
df2

Unnamed: 0,0,1,2
a,-0.744415,-3.110915,-0.722941
b,-0.09633,-0.308629,-0.419322


In [6]:
## the column index is ab object as the range type:
df2.columns

RangeIndex(start=0, stop=3, step=1)

### Create Dataframe from columns

In [7]:
## columns can be specified as a list, an array, or a Series. column names are given with the `columns` parameter -- for series, the `name`attribute
s1 = pd.Series([1,2,3])
s1

0    1
1    2
2    3
dtype: int64

In [8]:
s2 = pd.Series([4,5,6], name="b")
s2

0    4
1    5
2    6
Name: b, dtype: int64

In [9]:
## give the column name explicitly:
pd.DataFrame(s1, columns=["a"])

Unnamed: 0,a
0,1
1,2
2,3


In [10]:
## using the name of a series: 
pd.DataFrame(s2)

Unnamed: 0,b
0,4
1,5
2,6


### Creating dataframes from rows

In [11]:
df=pd.DataFrame([{"Wage" : 1000, "Name" : "Jack", "Age" : 21}, {"Wage" : 1500, "Name" : "John", "Age" : 29}])
df

Unnamed: 0,Wage,Name,Age
0,1000,Jack,21
1,1500,John,29


In [12]:
df = pd.DataFrame([[1000, "Jack", 21], [1500, "John", 29]], columns=["Wage", "Name", "Age"])
df

Unnamed: 0,Wage,Name,Age
0,1000,Jack,21
1,1500,John,29


In [13]:
## exercise 01: cities
def cities():  
    indices = ['Helsinki', 'Espoo', 'Tampere', 'Vantaa', 'Oulu']  
    population = [643272, 279044, 231853, 223027, 201810]
    total_area = [715.48, 528.03, 689.59, 240.35, 3817.52]
    s_population = pd.Series(population, index = indices)
    s_total_area = pd.Series(total_area, index = indices)
    df = pd.DataFrame({"Population" : s_population, "Total area" : s_total_area})
    return df

In [15]:
## another solution:
def cities():
    a=[[643272, 715.48],
       [279044, 528.03],
       [231853, 689.59],
       [223027, 240.35],
       [201810, 3817.52]]
    cols=["Population", "Total area"]
    ind=["Helsinki", "Espoo", "Tampere", "Vantaa", "Oulu"]
    df = pd.DataFrame(a, index=ind, columns=cols)
    return df

In [16]:
cities()

Unnamed: 0,Population,Total area
Helsinki,643272,715.48
Espoo,279044,528.03
Tampere,231853,689.59
Vantaa,223027,240.35
Oulu,201810,3817.52


In [19]:
## exercise 02: power of a series
def powers_of_series(s, k):
    df_power = pd.DataFrame()
    for i in range(1,k+1):
        s_ = pd.Series((s.values) ** i, name = i)
        if i==1:
            df_power = pd.DataFrame(s_)
        if i != 1:
            df_power[i] = s_
    return df_power

In [20]:
s = pd.Series([1,2,3,4], index=list("abcd"))
print(powers_of_series(s, 3))

   1   2   3
0  1   1   1
1  2   4   8
2  3   9  27
3  4  16  64


In [None]:
##actual solution
def powers_of_series(s, k):
    c=[ s**i for i in range(1,k+1) ]
    df = pd.DataFrame(dict(zip(range(1,k+1), c)))
    return df

In [39]:
## exercise 03: municipal imformation
df = pd.read_csv("municipal.tsv", sep = '\t' )
print(f'Shape: {df.shape[0]}, {df.shape[1]}')
print("Columns:")
for column in df.columns:
    print(column)

Shape: 490, 7
Columns:
Region 2018
Population
Population change from the previous year, %
Share of Swedish-speakers of the population, %
Share of foreign citizens of the population, %
Proportion of the unemployed among the labour force, %
Proportion of pensioners of the population, %


In [33]:
df = pd.read_csv("municipal.tsv", sep="\t")
print("Shape: {}, {}".format(*df.shape))
print("Columns:")
for name in df.columns:
    print(name)

Shape: 490, 7
Columns:
Region 2018
Population
Population change from the previous year, %
Share of Swedish-speakers of the population, %
Share of foreign citizens of the population, %
Proportion of the unemployed among the labour force, %
Proportion of pensioners of the population, %
