# Dataframes

In [30]:
import pandas as pd

# Labelled Series: each person's name is the index label for their age.
ages = pd.Series({'Juan': 20, 'Mary': 30, 'Peter': 25, 'Ana': 60, 'Alex': 27})
ages

Juan     20
Mary     30
Peter    25
Ana      60
Alex     27
dtype: int64

In [31]:
# Second Series carrying the same five name labels, each value one higher.
ages2 = pd.Series(
    data=[21, 31, 26, 61, 28],
    index=['Juan', 'Mary', 'Peter', 'Ana', 'Alex'],
)
ages2

Juan     21
Mary     31
Peter    26
Ana      61
Alex     28
dtype: int64

In [32]:
# Wrap the Series in a one-column DataFrame; the dict key becomes the column name.
d_ages = pd.DataFrame(data=dict(Ages=ages))
d_ages

Unnamed: 0,Ages
Juan,20
Mary,30
Peter,25
Ana,60
Alex,27


In [33]:
# Each dict entry becomes a column; both Series share the same labels, so rows line up.
age_columns = {'Ages': ages, 'Ages + 1': ages2}
d_ages = pd.DataFrame(age_columns)
d_ages

Unnamed: 0,Ages,Ages + 1
Juan,20,21
Mary,30,31
Peter,25,26
Ana,60,61
Alex,27,28


## Dataframes example 2

In [34]:
# Build a 2x3 DataFrame from nested lists, labelling both rows and columns
dataFrame = pd.DataFrame(
    data=[[2, 4, 6], [10, 20, 30]],
    index=["row1", "row2"],
    columns=["col1", "col2", "col3"],
)
print(dataFrame)

      col1  col2  col3
row1     2     4     6
row2    10    20    30


In [35]:
# Pull out the first column as a Series
first_column = dataFrame['col1']
print(first_column)

# Average of that column
print(first_column.mean())

row1     2
row2    10
Name: col1, dtype: int64
6.0


In [36]:
# Build a DataFrame from a dict: each key is a column name, each list its values
datos = dict(col_1=[3, 2, 1, 0], col_2=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(datos)
print(df2)

   col_1 col_2
0      3     a
1      2     b
2      1     c
3      0     d


In [37]:
# shape is a (row_count, column_count) tuple; unpack it into its two parts
size = df2.shape
rows, cols = size

for value in (size, rows, cols):
    print(value)


(4, 2)
4
2


In [38]:
# Build a 5x4 DataFrame of standard-normal samples with labelled rows and columns.
import pandas as pd
import numpy as np

# Seed the generator so "Restart & Run All" reproduces the same table every time.
SEED = 42
rng = np.random.default_rng(SEED)

df = pd.DataFrame(
    rng.standard_normal((5, 4)),
    index=['row1', 'row2', 'row3', 'row4', 'row5'],
    columns=['col1', 'col2', 'col3', 'col4'],
)
df

Unnamed: 0,col1,col2,col3,col4
row1,-0.526862,0.576648,0.498533,3.013247
row2,1.461852,-2.465466,-0.192134,1.446818
row3,0.45079,-0.281241,-0.202617,-1.857869
row4,0.546703,0.35227,0.37628,-1.133154
row5,1.008791,-0.545742,0.313927,-1.797347


In [39]:
# A single column label returns that column as a Series (df.col1 is equivalent)
column_label = 'col1'
df[column_label]

row1   -0.526862
row2    1.461852
row3    0.450790
row4    0.546703
row5    1.008791
Name: col1, dtype: float64

In [40]:
# A list of labels selects several columns and returns a DataFrame
wanted_columns = ['col1', 'col2']
df[wanted_columns]

Unnamed: 0,col1,col2
row1,-0.526862,0.576648
row2,1.461852,-2.465466
row3,0.45079,-0.281241
row4,0.546703,0.35227
row5,1.008791,-0.545742


In [41]:
# Assigning to a new label adds a column; here it holds col3 + col4 for each row
df['other'] = df['col3'].add(df['col4'])
df

Unnamed: 0,col1,col2,col3,col4,other
row1,-0.526862,0.576648,0.498533,3.013247,3.51178
row2,1.461852,-2.465466,-0.192134,1.446818,1.254684
row3,0.45079,-0.281241,-0.202617,-1.857869,-2.060486
row4,0.546703,0.35227,0.37628,-1.133154,-0.756874
row5,1.008791,-0.545742,0.313927,-1.797347,-1.48342


In [42]:
# Remove a column by label.
# inplace=True modifies df itself instead of returning a new DataFrame.
df.drop(columns='other', inplace=True)
df

Unnamed: 0,col1,col2,col3,col4
row1,-0.526862,0.576648,0.498533,3.013247
row2,1.461852,-2.465466,-0.192134,1.446818
row3,0.45079,-0.281241,-0.202617,-1.857869
row4,0.546703,0.35227,0.37628,-1.133154
row5,1.008791,-0.545742,0.313927,-1.797347


In [43]:
# Remove a row by its index label; the result is a new frame, df is not reassigned
df.drop(index='row1')

Unnamed: 0,col1,col2,col3,col4
row2,1.461852,-2.465466,-0.192134,1.446818
row3,0.45079,-0.281241,-0.202617,-1.857869
row4,0.546703,0.35227,0.37628,-1.133154
row5,1.008791,-0.545742,0.313927,-1.797347


In [44]:
# row1 is still present: the previous drop() returned a new frame and left df as it was.
# Pass inplace=True (or reassign the result) to actually remove the row from df.
df

Unnamed: 0,col1,col2,col3,col4
row1,-0.526862,0.576648,0.498533,3.013247
row2,1.461852,-2.465466,-0.192134,1.446818
row3,0.45079,-0.281241,-0.202617,-1.857869
row4,0.546703,0.35227,0.37628,-1.133154
row5,1.008791,-0.545742,0.313927,-1.797347


In [45]:
# loc selects by label: the row named 'row1', across every column
df.loc['row1', :]

col1   -0.526862
col2    0.576648
col3    0.498533
col4    3.013247
Name: row1, dtype: float64

In [46]:
# iloc selects by integer position: position 0 is the first row
df.iloc[0, :]

col1   -0.526862
col2    0.576648
col3    0.498533
col4    3.013247
Name: row1, dtype: float64

In [47]:
# Single cell lookup by row label and column label
df.at['row1', 'col2']

0.5766481117851419

In [48]:
# Lists of labels on both axes select a sub-frame
row_labels = ['row1', 'row2']
column_labels = ['col1', 'col3']
df.loc[row_labels, column_labels]

Unnamed: 0,col1,col3
row1,-0.526862,0.498533
row2,1.461852,-0.192134


In [49]:
# A boolean mask keeps only the rows where col1 is greater than zero
col1_positive = df['col1'] > 0
df[col1_positive]

Unnamed: 0,col1,col2,col3,col4
row2,1.461852,-2.465466,-0.192134,1.446818
row3,0.45079,-0.281241,-0.202617,-1.857869
row4,0.546703,0.35227,0.37628,-1.133154
row5,1.008791,-0.545742,0.313927,-1.797347


In [50]:
# Filter rows and pick columns in a single .loc call instead of chaining two
# [] lookups: one indexing step, and no intermediate filtered frame.
df.loc[df['col1'] > 0, ['col2', 'col4']]

Unnamed: 0,col2,col4
row2,-2.465466,1.446818
row3,-0.281241,-1.857869
row4,0.35227,-1.133154
row5,-0.545742,-1.797347


In [51]:
# Combine boolean masks element-wise: & means "and", | means "or"
col1_negative = df['col1'] < 0
col3_negative = df['col3'] < 0
df[col1_negative & col3_negative]

Unnamed: 0,col1,col2,col3,col4


In [52]:
# Two-level (hierarchical) index: an outer group label paired with an inner number
out_ = ['O1'] * 3 + ['O2'] * 3
in_ = [1, 2, 3] * 2
h_index = pd.MultiIndex.from_arrays([out_, in_])

In [53]:
import numpy as np
import pandas as pn

# Seeded generator so the random table is identical on every rerun.
rng_m = np.random.default_rng(42)

# 6x5 DataFrame of uniform [0, 1) samples, indexed by the two-level h_index.
df_m = pn.DataFrame(
    rng_m.random((6, 5)),
    index=h_index,
    columns=['col1', 'col2', 'col3', 'col4', 'col5'],
)
df_m

Unnamed: 0,Unnamed: 1,col1,col2,col3,col4,col5
O1,1,0.826692,0.010842,0.813647,0.16847,0.294913
O1,2,0.607761,0.934991,0.406262,0.48293,0.88794
O1,3,0.453967,0.671976,0.519942,0.602196,0.05585
O2,1,0.925163,0.319771,0.885154,0.638592,0.025755
O2,2,0.848574,0.162614,0.744478,0.791905,0.059517
O2,3,0.812114,0.476268,0.822858,0.930678,0.705266


In [54]:
# Name the two index levels so they appear as headers in the displayed table
level_names = ['Groups', 'Nums']
df_m.index.names = level_names
df_m

Unnamed: 0_level_0,Unnamed: 1_level_0,col1,col2,col3,col4,col5
Groups,Nums,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
O1,1,0.826692,0.010842,0.813647,0.16847,0.294913
O1,2,0.607761,0.934991,0.406262,0.48293,0.88794
O1,3,0.453967,0.671976,0.519942,0.602196,0.05585
O2,1,0.925163,0.319771,0.885154,0.638592,0.025755
O2,2,0.848574,0.162614,0.744478,0.791905,0.059517
O2,3,0.812114,0.476268,0.822858,0.930678,0.705266


In [55]:
# Indexing by an outer-level label returns the rows belonging to that group
outer_label = 'O1'
df_m.loc[outer_label]

Unnamed: 0_level_0,col1,col2,col3,col4,col5
Nums,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.826692,0.010842,0.813647,0.16847,0.294913
2,0.607761,0.934991,0.406262,0.48293,0.88794
3,0.453967,0.671976,0.519942,0.602196,0.05585


In [56]:
# Frame with one missing value (NaN) in each column.
# Uses the notebook's primary `pd` alias rather than the `pn` alias imported in another cell.
d = {'A': [10, np.nan], 'B': [np.nan, 50]}
df_d = pd.DataFrame(d)
df_d

Unnamed: 0,A,B
0,10.0,
1,,50.0


In [57]:
# Drop every row that contains at least one missing value
df_d.dropna(axis=0, how='any')

Unnamed: 0,A,B


In [58]:
# Replace every missing value with a placeholder string
df_d.fillna(value='filled')

Unnamed: 0,A,B
0,10,filled
1,filled,50
