# Dataframes

In [1]:
import pandas as pd

ages = pd.Series([20, 30, 25, 60, 27], index = ['Juan', 'Mary', 'Peter', 'Ana', 'Alex'])
ages

Juan     20
Mary     30
Peter    25
Ana      60
Alex     27
dtype: int64

In [2]:
ages2 = pd.Series([21, 31, 26, 61, 28], index = ['Juan', 'Mary', 'Peter', 'Ana', 'Alex'])
ages2

Juan     21
Mary     31
Peter    26
Ana      61
Alex     28
dtype: int64

In [3]:
d_ages = pd.DataFrame({'Ages': ages})
d_ages

Unnamed: 0,Ages
Juan,20
Mary,30
Peter,25
Ana,60
Alex,27


In [4]:
d_ages = pd.DataFrame({'Ages': ages, 'Ages_1': ages2})
d_ages

Unnamed: 0,Ages,Ages_1
Juan,20,21
Mary,30,31
Peter,25,26
Ana,60,61
Alex,27,28


In [5]:
d_ages.Ages

Juan     20
Mary     30
Peter    25
Ana      60
Alex     27
Name: Ages, dtype: int64

In [6]:
data = {'Names': ['Juan', 'Peter', 'Mary', 'ana'],
        'Age':[34,32,31,29],
        'City':['Bogotá', 'Cali', 'Medellín', 'Cucuta']
       }

data_fm = pd.DataFrame(data)
display(data_fm)

Unnamed: 0,Names,Age,City
0,Juan,34,Bogotá
1,Peter,32,Cali
2,Mary,31,Medellín
3,ana,29,Cucuta


In [49]:
display(data_fm[data_fm.Age > 30])

Unnamed: 0,Names,Age,City
0,Juan,34,Bogotá
1,Peter,32,Cali
2,Mary,31,Medellín


## Dataframes example 2

In [7]:
# importing pandas
dataFrame = pd.DataFrame([[2,4,6],[10,20,30]], columns=["col1", "col2", "col3"], index=["row1","row2"])
print(dataFrame)

      col1  col2  col3
row1     2     4     6
row2    10    20    30


In [8]:
# Selecting column 1
print(dataFrame.col1)

# Calculating mean
print(dataFrame.col1.mean())

row1     2
row2    10
Name: col1, dtype: int64
6.0


In [9]:
# Using Dictionaries as dataframes
datos = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
df2 = pd.DataFrame.from_dict(datos)
print(df2)  

   col_1 col_2
0      3     a
1      2     b
2      1     c
3      0     d


In [10]:
# Printing shape parameters
size = df2.shape
rows = size[0]
cols = size[1]

print(size)
print(rows)
print(cols)


(4, 2)
4
2


In [11]:
# 
import pandas as pd
import numpy as np

df = pd.DataFrame(np.random.randn(5, 4),['row1', 'row2', 'row3', 'row4', 'row5'], ['col1', 'col2', 'col3', 'col4'])
df

Unnamed: 0,col1,col2,col3,col4
row1,-1.152851,-1.998389,0.190963,-0.539467
row2,-0.90195,0.252632,-0.57985,-0.441831
row3,-0.439383,1.493467,2.787627,1.158999
row4,0.15684,1.808707,-0.116071,1.260174
row5,-1.426372,-0.866083,0.427808,-0.166075


In [12]:
# Selecting a specific value
df.at['row1', 'col1']

-1.1528514271821932

In [13]:
# Selecting a specific value
df.iat[0, 0]

-1.1528514271821932

In [14]:
# Selecting column
df['col1'] # df.col1

row1   -1.152851
row2   -0.901950
row3   -0.439383
row4    0.156840
row5   -1.426372
Name: col1, dtype: float64

In [15]:
# Selecting two columns
df[['col1', 'col2']]

Unnamed: 0,col1,col2
row1,-1.152851,-1.998389
row2,-0.90195,0.252632
row3,-0.439383,1.493467
row4,0.15684,1.808707
row5,-1.426372,-0.866083


In [16]:
# Generating new column
df['other'] = df['col3'] + df['col4']
df

Unnamed: 0,col1,col2,col3,col4,other
row1,-1.152851,-1.998389,0.190963,-0.539467,-0.348504
row2,-0.90195,0.252632,-0.57985,-0.441831,-1.021681
row3,-0.439383,1.493467,2.787627,1.158999,3.946626
row4,0.15684,1.808707,-0.116071,1.260174,1.144103
row5,-1.426372,-0.866083,0.427808,-0.166075,0.261733


In [17]:
# Dropping columns
# Inplace is for save data
df.drop('other', axis = 1, inplace = True)
df

Unnamed: 0,col1,col2,col3,col4
row1,-1.152851,-1.998389,0.190963,-0.539467
row2,-0.90195,0.252632,-0.57985,-0.441831
row3,-0.439383,1.493467,2.787627,1.158999
row4,0.15684,1.808707,-0.116071,1.260174
row5,-1.426372,-0.866083,0.427808,-0.166075


In [18]:
# Dropping Rows
df.drop('row1')

Unnamed: 0,col1,col2,col3,col4
row2,-0.90195,0.252632,-0.57985,-0.441831
row3,-0.439383,1.493467,2.787627,1.158999
row4,0.15684,1.808707,-0.116071,1.260174
row5,-1.426372,-0.866083,0.427808,-0.166075


In [19]:
# row was not deleted
# it needs inplace parameter in True
df

Unnamed: 0,col1,col2,col3,col4
row1,-1.152851,-1.998389,0.190963,-0.539467
row2,-0.90195,0.252632,-0.57985,-0.441831
row3,-0.439383,1.493467,2.787627,1.158999
row4,0.15684,1.808707,-0.116071,1.260174
row5,-1.426372,-0.866083,0.427808,-0.166075


In [20]:
# Using loc method for named index
df.loc['row1']

col1   -1.152851
col2   -1.998389
col3    0.190963
col4   -0.539467
Name: row1, dtype: float64

In [21]:
# using iloc for numeric index
df.iloc[0]

col1   -1.152851
col2   -1.998389
col3    0.190963
col4   -0.539467
Name: row1, dtype: float64

In [22]:
# Selecting specific values
df.loc['row1', 'col2']

-1.998389376386656

In [23]:
#Selecting several values
df.loc[['row1', 'row2'],['col1', 'col3']]

Unnamed: 0,col1,col3
row1,-1.152851,0.190963
row2,-0.90195,-0.57985


In [24]:
# Conditional selecting
df[df['col1'] > 0]

Unnamed: 0,col1,col2,col3,col4
row4,0.15684,1.808707,-0.116071,1.260174


In [25]:
# Another way
df[df.col1 > 0]

Unnamed: 0,col1,col2,col3,col4
row4,0.15684,1.808707,-0.116071,1.260174


In [26]:
# Selecting from conditionals
df[df['col1'] > 0][['col2', 'col4']]

Unnamed: 0,col2,col4
row4,1.808707,1.260174


In [27]:
# Another way
df[df.col1 > 0][['col2', 'col4']]

Unnamed: 0,col2,col4
row4,1.808707,1.260174


In [28]:
# Multiple conditions
# & is for "and" in columns. | is for "or"
df[(df['col1'] < 0) & (df['col3'] < 0)]

Unnamed: 0,col1,col2,col3,col4
row2,-0.90195,0.252632,-0.57985,-0.441831


In [29]:
# Multiple levels for index
out_ = ['O1', 'O1', 'O1', 'O2', 'O2', 'O2']
in_ = [1, 2, 3, 1, 2, 3]
h_index = list(zip(out_, in_))
h_index = pd.MultiIndex.from_tuples(h_index)

In [30]:
import numpy as np
import pandas as pn

df_m = pn.DataFrame(np.random.rand(6,5), h_index, ['col1', 'col2', 'col3', 'col4', 'col5'])
df_m

Unnamed: 0,Unnamed: 1,col1,col2,col3,col4,col5
O1,1,0.57073,0.656034,0.702342,0.590967,0.390004
O1,2,0.196783,0.638698,0.100675,0.561564,0.633367
O1,3,0.823598,0.984892,0.652394,0.915303,0.085305
O2,1,0.092367,0.94591,0.335691,0.926701,0.356134
O2,2,0.817137,0.677543,0.145638,0.860829,0.026204
O2,3,0.27908,0.921798,0.368321,0.128749,0.833221


In [31]:
# Adding names to indexes
df_m.index.names = ['Groups','Nums']
df_m

Unnamed: 0_level_0,Unnamed: 1_level_0,col1,col2,col3,col4,col5
Groups,Nums,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
O1,1,0.57073,0.656034,0.702342,0.590967,0.390004
O1,2,0.196783,0.638698,0.100675,0.561564,0.633367
O1,3,0.823598,0.984892,0.652394,0.915303,0.085305
O2,1,0.092367,0.94591,0.335691,0.926701,0.356134
O2,2,0.817137,0.677543,0.145638,0.860829,0.026204
O2,3,0.27908,0.921798,0.368321,0.128749,0.833221


In [32]:
# Selecting a sub dataframe
df_m.loc['O1']

Unnamed: 0_level_0,col1,col2,col3,col4,col5
Nums,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.57073,0.656034,0.702342,0.590967,0.390004
2,0.196783,0.638698,0.100675,0.561564,0.633367
3,0.823598,0.984892,0.652394,0.915303,0.085305


In [33]:
# Using missing values
d = {'A':[10, np.nan], 'B':[np.nan, 50] }
df_d = pn.DataFrame(d)
df_d

Unnamed: 0,A,B
0,10.0,
1,,50.0


In [34]:
df_d.dropna()

Unnamed: 0,A,B


In [35]:
df_d.fillna('filled')

Unnamed: 0,A,B
0,10,filled
1,filled,50


## Modifying Dataframes

In [36]:
score1 = pd.Series([200, 300, 250, 600, 270], index = ['Juan', 'Mary', 'Peter', 'Ana', 'Alex'])
score2 = pd.Series([230, 320, 280, 570, 370], index = ['Juan', 'Mary', 'Peter', 'Ana', 'Alex'])
score3 = pd.Series([130, 370, 480, 370, 490], index = ['Juan', 'Mary', 'Peter', 'Ana', 'Alex'])

d_scores = pd.DataFrame({'Score1': score1, 'Score2': score1, 'Score3': score1})
d_scores

Unnamed: 0,Score1,Score2,Score3
Juan,200,200,200
Mary,300,300,300
Peter,250,250,250
Ana,600,600,600
Alex,270,270,270


In [37]:
# Changing column names
d_scores_n = d_scores.rename(columns = {'Score1': 'First_Year'})
d_scores_n

Unnamed: 0,First_Year,Score2,Score3
Juan,200,200,200
Mary,300,300,300
Peter,250,250,250
Ana,600,600,600
Alex,270,270,270


In [38]:
# Adding new column
d_scores_n['Score4'] = ([400, 340, 560, 450, 540]) 
d_scores_n

Unnamed: 0,First_Year,Score2,Score3,Score4
Juan,200,200,200,400
Mary,300,300,300,340
Peter,250,250,250,560
Ana,600,600,600,450
Alex,270,270,270,540


In [39]:
# Adding a new Row
d_scores_n.loc['Stuart'] = ([340, 260, 480, 340]) 
d_scores_n

Unnamed: 0,First_Year,Score2,Score3,Score4
Juan,200,200,200,400
Mary,300,300,300,340
Peter,250,250,250,560
Ana,600,600,600,450
Alex,270,270,270,540
Stuart,340,260,480,340


In [40]:
# Changing a value
d_scores_n.loc['Stuart', 'First_Year'] = 300
d_scores_n

Unnamed: 0,First_Year,Score2,Score3,Score4
Juan,200,200,200,400
Mary,300,300,300,340
Peter,250,250,250,560
Ana,600,600,600,450
Alex,270,270,270,540
Stuart,300,260,480,340


In [41]:
# Using insert method
d_scores_n.insert(1, 'Score1', [100, 200, 300, 200, 150, 240])
d_scores_n

Unnamed: 0,First_Year,Score1,Score2,Score3,Score4
Juan,200,100,200,200,400
Mary,300,200,300,300,340
Peter,250,300,250,250,560
Ana,600,200,600,600,450
Alex,270,150,270,270,540
Stuart,300,240,260,480,340


In [42]:
# Changing column names
d_scores_n.columns = ['First', 'Second', 'Third', 'Fourth', 'Fifth']
d_scores_n

Unnamed: 0,First,Second,Third,Fourth,Fifth
Juan,200,100,200,200,400
Mary,300,200,300,300,340
Peter,250,300,250,250,560
Ana,600,200,600,600,450
Alex,270,150,270,270,540
Stuart,300,240,260,480,340


In [43]:
# Creating a new dataframe
First = pd.Series([200, 300], index = ['Fred', 'Dany'])
Second = pd.Series([230, 320], index = ['Fred', 'Dany'])
Third = pd.Series([300, 400], index = ['Fred', 'Dany'])
Fourth = pd.Series([430, 420], index = ['Fred', 'Dany'])
Fifth = pd.Series([500, 600], index = ['Fred', 'Dany'])

new_scores = pd.DataFrame({'First': First, 'Second': Second, 'Third': Third, 'Fourth': Fourth, 'Fifth': Fifth})
new_scores

Unnamed: 0,First,Second,Third,Fourth,Fifth
Fred,200,230,300,430,500
Dany,300,320,400,420,600


In [44]:
# Appending Dataframes
total = d_scores_n.append(new_scores)
total

Unnamed: 0,First,Second,Third,Fourth,Fifth
Juan,200,100,200,200,400
Mary,300,200,300,300,340
Peter,250,300,250,250,560
Ana,600,200,600,600,450
Alex,270,150,270,270,540
Stuart,300,240,260,480,340
Fred,200,230,300,430,500
Dany,300,320,400,420,600


## Reindexing

In [45]:
d_scores

Unnamed: 0,Score1,Score2,Score3
Juan,200,200,200
Mary,300,300,300
Peter,250,250,250
Ana,600,600,600
Alex,270,270,270


In [46]:
# Resetting index
i_scores = d_scores.reset_index()
i_scores

Unnamed: 0,index,Score1,Score2,Score3
0,Juan,200,200,200
1,Mary,300,300,300
2,Peter,250,250,250
3,Ana,600,600,600
4,Alex,270,270,270


In [47]:
# Changing index column
i_scores.set_index('Score2')

Unnamed: 0_level_0,index,Score1,Score3
Score2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
200,Juan,200,200
300,Mary,300,300
250,Peter,250,250
600,Ana,600,600
270,Alex,270,270


In [48]:
# Multiple Index
i_scores.set_index(['Score1', 'Score2'])

Unnamed: 0_level_0,Unnamed: 1_level_0,index,Score3
Score1,Score2,Unnamed: 2_level_1,Unnamed: 3_level_1
200,200,Juan,200
300,300,Mary,300
250,250,Peter,250
600,600,Ana,600
270,270,Alex,270
