# Pandas

13/04/18
Summary of first class in Pandas
-  Series
-  Dataframes
-  Slicing and indexing
-  Exploratory commands
-  Sorting and ranking
-  Missing data

### Series

In [1]:
import pandas as pd
import numpy as np

In [2]:
serie = pd.Series([1,2,3,4,5,6,7,8,9,10])
serie

0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9    10
dtype: int64

In [3]:
print(serie.values)
print(serie.index)  #index is automatically created
serie_with_index = pd.Series([1,2,3],index=list("abc")) #index is specified
print(serie_with_index.index)
serie_with_index

[ 1  2  3  4  5  6  7  8  9 10]
RangeIndex(start=0, stop=10, step=1)
Index(['a', 'b', 'c'], dtype='object')


a    1
b    2
c    3
dtype: int64

Some operations we can perform on indexes, these are similar as in np

In [4]:
serie[2] #indexing using the index labels

3

In [5]:
serie % 2 == 0  #will return booleans

0    False
1     True
2    False
3     True
4    False
5     True
6    False
7     True
8    False
9     True
dtype: bool

In [6]:
serie[serie % 2 == 0 ] #will filter the series depending on the outcome of the conditional expression

1     2
3     4
5     6
7     8
9    10
dtype: int64

In [7]:
serie*2  #we can perfom operations on all values

0     2
1     4
2     6
3     8
4    10
5    12
6    14
7    16
8    18
9    20
dtype: int64

In [8]:
print(0 in serie) #will return True because 0 is in the index
print(10 in serie) #will return False because 10 is not in the index
print(10 in serie.values) #will return True becasue 10 is in the values

True
False
True


In [9]:
random_dict = {"afaw": ["fqjjqw","jajsf"], "qfj": "jfjwjjwf", "fj": "jfajsjdasd"} #we can create a series from a dictionary
series_dict = pd.Series(random_dict)
series_dict

afaw    [fqjjqw, jajsf]
fj           jfajsjdasd
qfj            jfjwjjwf
dtype: object

In [None]:
series_dict.name = "random_dictionary"
series_dict.index.name = "random_index_labels"
series_dict

### DataFrames

In [10]:
dfdata = {
    'province' : ['M', 'M', 'M', 'B', 'B'],
    'population': [1.5e6, 2e6, 3e6, 5e5, 1.5e6],
    'year' : [1900, 1950, 2000, 1900, 2000]   
}

df1 = pd.DataFrame(dfdata) #we can also create a dataframe from a dictionary, all arrays must be the same lenght
df1

Unnamed: 0,population,province,year
0,1500000.0,M,1900
1,2000000.0,M,1950
2,3000000.0,M,2000
3,500000.0,B,1900
4,1500000.0,B,2000


In [11]:
print(df1.columns) #the keys from the dict will be the columns by default
print(df1.index) #numeric index by default

Index(['population', 'province', 'year'], dtype='object')
RangeIndex(start=0, stop=5, step=1)


In [12]:
df1.columns = ["pop", "prv", "yr"] #we can use the same column and index functions to replace them, however they are inmutable, cant be changed
df1.index = list("abcde")
df1

Unnamed: 0,pop,prv,yr
a,1500000.0,M,1900
b,2000000.0,M,1950
c,3000000.0,M,2000
d,500000.0,B,1900
e,1500000.0,B,2000


In [14]:
print(df1.items) #dict
print(df1.keys()) #column names
print(df1.values) #values

<bound method DataFrame.iteritems of          pop prv    yr
a  1500000.0   M  1900
b  2000000.0   M  1950
c  3000000.0   M  2000
d   500000.0   B  1900
e  1500000.0   B  2000>
Index(['pop', 'prv', 'yr'], dtype='object')
[[1500000.0 'M' 1900]
 [2000000.0 'M' 1950]
 [3000000.0 'M' 2000]
 [500000.0 'B' 1900]
 [1500000.0 'B' 2000]]


In [15]:
df1["pop"] #to scl

a    1500000.0
b    2000000.0
c    3000000.0
d     500000.0
e    1500000.0
Name: pop, dtype: float64

In [16]:
df1[["pop", "prv"]] #only works with double brackets

Unnamed: 0,pop,prv
a,1500000.0,M
b,2000000.0,M
c,3000000.0,M
d,500000.0,B
e,1500000.0,B


In [17]:
print(df1[1:3]) #uses a numerical positional reference
print(df1.iloc[1:3]) # same as before
print(df1.loc["d":]) #loc uses the index labels
print(df1.iloc[1:3,2:]) #we can also slice rows and columns at the same time

         pop prv    yr
b  2000000.0   M  1950
c  3000000.0   M  2000
         pop prv    yr
b  2000000.0   M  1950
c  3000000.0   M  2000
         pop prv    yr
d   500000.0   B  1900
e  1500000.0   B  2000
     yr
b  1950
c  2000


In [18]:
print(df1["pop"] > 1500000) # conditional expression returning booleans
print(df1 > 1500000)
print(df1[df1["pop"] > 1500000]) # we can then use the expression to return values
print(df1[(df1["pop"] > 1500000) & (df1["yr"] > 1976)]) # can use operators such as & (and), | (or), ~ (opposite)

a    False
b     True
c     True
d    False
e    False
Name: pop, dtype: bool
     pop   prv     yr
a  False  True  False
b   True  True  False
c   True  True  False
d  False  True  False
e  False  True  False
         pop prv    yr
b  2000000.0   M  1950
c  3000000.0   M  2000
         pop prv    yr
c  3000000.0   M  2000


In [19]:
df1["new_col"] = "NEW" #to create a new column
print(df1)
df1["new_col"] = ["Y","N","Y","Y","N"] #to introduce values
print(df1)

         pop prv    yr new_col
a  1500000.0   M  1900     NEW
b  2000000.0   M  1950     NEW
c  3000000.0   M  2000     NEW
d   500000.0   B  1900     NEW
e  1500000.0   B  2000     NEW
         pop prv    yr new_col
a  1500000.0   M  1900       Y
b  2000000.0   M  1950       N
c  3000000.0   M  2000       Y
d   500000.0   B  1900       Y
e  1500000.0   B  2000       N


In [20]:
df1.T  #transposes df

Unnamed: 0,a,b,c,d,e
pop,1.5e+06,2e+06,3e+06,500000,1.5e+06
prv,M,M,M,B,B
yr,1900,1950,2000,1900,2000
new_col,Y,N,Y,Y,N


In [21]:
print(df1.drop(index=["a","b"])) #removes the rows a & b, inplace=False default
print(df1.drop(columns=["pop","yr"]))

         pop prv    yr new_col
c  3000000.0   M  2000       Y
d   500000.0   B  1900       Y
e  1500000.0   B  2000       N
  prv new_col
a   M       Y
b   M       N
c   M       Y
d   B       Y
e   B       N


In [22]:
df2 = df1
df3 = df1.copy()  #we need to use copy to make a real copy instead of copying the reference
print(df2)
df1["pop"] = "???"
print(df2)
print(df3)

         pop prv    yr new_col
a  1500000.0   M  1900       Y
b  2000000.0   M  1950       N
c  3000000.0   M  2000       Y
d   500000.0   B  1900       Y
e  1500000.0   B  2000       N
   pop prv    yr new_col
a  ???   M  1900       Y
b  ???   M  1950       N
c  ???   M  2000       Y
d  ???   B  1900       Y
e  ???   B  2000       N
         pop prv    yr new_col
a  1500000.0   M  1900       Y
b  2000000.0   M  1950       N
c  3000000.0   M  2000       Y
d   500000.0   B  1900       Y
e  1500000.0   B  2000       N


In [23]:
df_random = pd.DataFrame(np.random.randn(4,3) * 17 + 15, columns=list('bde'), index=list('BMPZ'))
df_random

Unnamed: 0,b,d,e
B,15.368067,36.240232,3.875989
M,62.74909,2.174994,12.385257
P,7.72818,6.420763,55.358633
Z,29.175079,0.19443,22.588693


In [24]:
print(df_random.apply(lambda x: x.max() -x.min()))  #takes a func as argument and passes the series as args
print(df_random.apply(lambda x: x.max() -x.min(), axis=1)) #same but for rows
print(df_random.applymap(lambda x: str(x)+" mm")) #iterates through every element of the df, similar to a map

def f(series):
    return pd.Series([series.max(), series.min()], index=['max', 'min'])
print(df_random.apply(f))

b    55.020909
d    36.045802
e    51.482644
dtype: float64
B    32.364243
M    60.574096
P    48.937869
Z    28.980649
dtype: float64
                       b                      d                      e
B  15.368066731534462 mm    36.2402319636121 mm   3.875988523750923 mm
M   62.74908971802465 mm  2.1749941916530986 mm  12.385257236071423 mm
P   7.728180403996898 mm   6.420763338890222 mm   55.35863262572573 mm
Z   29.17507942193945 mm  0.1944303013353359 mm    22.5886934580049 mm
            b          d          e
max  62.74909  36.240232  55.358633
min   7.72818   0.194430   3.875989


In [25]:
df1

Unnamed: 0,pop,prv,yr,new_col
a,???,M,1900,Y
b,???,M,1950,N
c,???,M,2000,Y
d,???,B,1900,Y
e,???,B,2000,N


In [26]:
df1.sort_values("yr") #we can specify a tiebreaker by passing it a list

Unnamed: 0,pop,prv,yr,new_col
a,???,M,1900,Y
d,???,B,1900,Y
b,???,M,1950,N
c,???,M,2000,Y
e,???,B,2000,N


In [27]:
df1["yr"].rank()

a    1.5
b    3.0
c    4.5
d    1.5
e    4.5
Name: yr, dtype: float64

#### Exercise

Write a function that takes a Series and returns the top 10% registers. In this case, earners. Test it with this Series:

```python
salaries = pd.Series([150000, 90000, 120000,30000,10000,5000,40000, 50000, 80000, 35000, 27000,14000, 28000, 22000,25000])
```

In [28]:
def top_salaries(serie):
    return serie.sort_values(ascending=False)[:round(len(serie)*0.1)]
salaries = pd.Series([150000, 90000, 120000,30000,10000,5000,40000, 50000, 80000, 35000, 27000,14000, 28000, 22000,25000])
top_salaries(salaries)

0    150000
2    120000
dtype: int64

#### Exploratory commands
When first exploring a dataframe

In [29]:
print(df1.head())
print(df1.tail())

   pop prv    yr new_col
a  ???   M  1900       Y
b  ???   M  1950       N
c  ???   M  2000       Y
d  ???   B  1900       Y
e  ???   B  2000       N
   pop prv    yr new_col
a  ???   M  1900       Y
b  ???   M  1950       N
c  ???   M  2000       Y
d  ???   B  1900       Y
e  ???   B  2000       N


In [30]:
print(df1.shape)
print(df1.size)

(5, 4)
20


In [31]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, a to e
Data columns (total 4 columns):
pop        5 non-null object
prv        5 non-null object
yr         5 non-null int64
new_col    5 non-null object
dtypes: int64(1), object(3)
memory usage: 360.0+ bytes


In [32]:
df1.count()

pop        5
prv        5
yr         5
new_col    5
dtype: int64

In [33]:
df1.describe()

Unnamed: 0,yr
count,5.0
mean,1950.0
std,50.0
min,1900.0
25%,1900.0
50%,1950.0
75%,2000.0
max,2000.0


In [34]:
df1["prv"].value_counts()

M    3
B    2
Name: prv, dtype: int64

In [35]:
df1["prv"].unique()

array(['M', 'B'], dtype=object)

#### Missing data

In [36]:
string_data = pd.Series(['Ma', 'Lu', 'Ca', 'Va', np.nan])
string_data

0     Ma
1     Lu
2     Ca
3     Va
4    NaN
dtype: object

In [37]:
np.nan == np.nan

False

In [38]:
print(string_data.isnull())
print(~string_data.isnull()) #opposite
print(string_data.notnull())

0    False
1    False
2    False
3    False
4     True
dtype: bool
0     True
1     True
2     True
3     True
4    False
dtype: bool
0     True
1     True
2     True
3     True
4    False
dtype: bool


In [39]:
array = np.random.randn(8,3) * 20 + 100

df4 = pd.DataFrame(array, columns=list('xyz'), index=list('abcdefgh'))
df4.iloc[2:5, 1] = np.nan
df4.iloc[1:3, 2] = np.nan
df4

Unnamed: 0,x,y,z
a,101.653847,76.498424,114.095471
b,95.490456,86.061915,
c,88.302195,,
d,81.153492,,88.84411
e,134.456672,,100.504632
f,38.506375,91.005735,69.070754
g,117.308582,84.853364,91.846294
h,87.023377,102.795503,109.618989


In [40]:
print(df4.isnull().all())
print(df4.isnull().any())

x    False
y    False
z    False
dtype: bool
x    False
y     True
z     True
dtype: bool


In [41]:
print(df4.dropna())
print(df4.dropna(axis=1))  #can use other useful flags like how="any", or thresh

            x           y           z
a  101.653847   76.498424  114.095471
f   38.506375   91.005735   69.070754
g  117.308582   84.853364   91.846294
h   87.023377  102.795503  109.618989
            x
a  101.653847
b   95.490456
c   88.302195
d   81.153492
e  134.456672
f   38.506375
g  117.308582
h   87.023377


In [42]:
print(df4.fillna("0"))
print(df4.fillna({'x' : 100, 'y' : 50, 'z' : 20}))
print(df4.fillna(df4.mean()))
print(df4.fillna(method='ffill')) #a lot of other useful flags

            x        y        z
a  101.653847  76.4984  114.095
b   95.490456  86.0619        0
c   88.302195        0        0
d   81.153492        0  88.8441
e  134.456672        0  100.505
f   38.506375  91.0057  69.0708
g  117.308582  84.8534  91.8463
h   87.023377  102.796  109.619
            x           y           z
a  101.653847   76.498424  114.095471
b   95.490456   86.061915   20.000000
c   88.302195   50.000000   20.000000
d   81.153492   50.000000   88.844110
e  134.456672   50.000000  100.504632
f   38.506375   91.005735   69.070754
g  117.308582   84.853364   91.846294
h   87.023377  102.795503  109.618989
            x           y           z
a  101.653847   76.498424  114.095471
b   95.490456   86.061915   95.663375
c   88.302195   88.242988   95.663375
d   81.153492   88.242988   88.844110
e  134.456672   88.242988  100.504632
f   38.506375   91.005735   69.070754
g  117.308582   84.853364   91.846294
h   87.023377  102.795503  109.618989
            x           y   