# Dataframes

In [1]:
import pandas as pd

ages = pd.Series([20, 30, 25, 60, 27], index = ['Juan', 'Mary', 'Peter', 'Ana', 'Alex'])
ages

Juan     20
Mary     30
Peter    25
Ana      60
Alex     27
dtype: int64

In [2]:
ages2 = pd.Series([21, 31, 26, 61, 28], index = ['Juan', 'Mary', 'Peter', 'Ana', 'Alex'])
ages2

Juan     21
Mary     31
Peter    26
Ana      61
Alex     28
dtype: int64

In [3]:
d_ages = pd.DataFrame({'Ages': ages})
d_ages

Unnamed: 0,Ages
Juan,20
Mary,30
Peter,25
Ana,60
Alex,27


In [4]:
d_ages = pd.DataFrame({'Ages': ages, 'Ages_1': ages2})
d_ages

Unnamed: 0,Ages,Ages_1
Juan,20,21
Mary,30,31
Peter,25,26
Ana,60,61
Alex,27,28


In [5]:
d_ages.Ages

Juan     20
Mary     30
Peter    25
Ana      60
Alex     27
Name: Ages, dtype: int64

## Dataframes example 2

In [6]:
# importing pandas
dataFrame = pd.DataFrame([[2,4,6],[10,20,30]], columns=["col1", "col2", "col3"], index=["row1","row2"])
print(dataFrame)

      col1  col2  col3
row1     2     4     6
row2    10    20    30


In [7]:
# Selecting column 1
print(dataFrame.col1)

# Calculating mean
print(dataFrame.col1.mean())

row1     2
row2    10
Name: col1, dtype: int64
6.0


In [8]:
# Using Dictionaries as dataframes
datos = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
df2 = pd.DataFrame.from_dict(datos)
print(df2)  

   col_1 col_2
0      3     a
1      2     b
2      1     c
3      0     d


In [9]:
# Printing shape parameters
size = df2.shape
rows = size[0]
cols = size[1]

print(size)
print(rows)
print(cols)


(4, 2)
4
2


In [10]:
# 
import pandas as pd
import numpy as np

df = pd.DataFrame(np.random.randn(5, 4),['row1', 'row2', 'row3', 'row4', 'row5'], ['col1', 'col2', 'col3', 'col4'])
df

Unnamed: 0,col1,col2,col3,col4
row1,1.067625,0.348785,-2.19953,1.05017
row2,0.056525,0.36664,-0.843097,-1.623633
row3,0.051514,-0.176979,0.700818,-2.142903
row4,-0.496451,1.276041,-0.047157,-0.843715
row5,1.941917,-1.767063,-0.336761,1.145178


In [11]:
# Selecting a specific value
df.at['row1', 'col1']

1.067625251474228

In [12]:
# Selecting a specific value
df.iat[0, 0]

1.067625251474228

In [13]:
# Selecting column
df['col1'] # df.col1

row1    1.067625
row2    0.056525
row3    0.051514
row4   -0.496451
row5    1.941917
Name: col1, dtype: float64

In [14]:
# Selecting two columns
df[['col1', 'col2']]

Unnamed: 0,col1,col2
row1,1.067625,0.348785
row2,0.056525,0.36664
row3,0.051514,-0.176979
row4,-0.496451,1.276041
row5,1.941917,-1.767063


In [15]:
# Generating new column
df['other'] = df['col3'] + df['col4']
df

Unnamed: 0,col1,col2,col3,col4,other
row1,1.067625,0.348785,-2.19953,1.05017,-1.14936
row2,0.056525,0.36664,-0.843097,-1.623633,-2.46673
row3,0.051514,-0.176979,0.700818,-2.142903,-1.442086
row4,-0.496451,1.276041,-0.047157,-0.843715,-0.890872
row5,1.941917,-1.767063,-0.336761,1.145178,0.808417


In [16]:
# Dropping columns
# Inplace is for save data
df.drop('other', axis = 1, inplace = True)
df

Unnamed: 0,col1,col2,col3,col4
row1,1.067625,0.348785,-2.19953,1.05017
row2,0.056525,0.36664,-0.843097,-1.623633
row3,0.051514,-0.176979,0.700818,-2.142903
row4,-0.496451,1.276041,-0.047157,-0.843715
row5,1.941917,-1.767063,-0.336761,1.145178


In [17]:
# Dropping Rows
df.drop('row1')

Unnamed: 0,col1,col2,col3,col4
row2,0.056525,0.36664,-0.843097,-1.623633
row3,0.051514,-0.176979,0.700818,-2.142903
row4,-0.496451,1.276041,-0.047157,-0.843715
row5,1.941917,-1.767063,-0.336761,1.145178


In [18]:
# row was not deleted
# it needs inplace parameter in True
df

Unnamed: 0,col1,col2,col3,col4
row1,1.067625,0.348785,-2.19953,1.05017
row2,0.056525,0.36664,-0.843097,-1.623633
row3,0.051514,-0.176979,0.700818,-2.142903
row4,-0.496451,1.276041,-0.047157,-0.843715
row5,1.941917,-1.767063,-0.336761,1.145178


In [19]:
# Using loc method for named index
df.loc['row1']

col1    1.067625
col2    0.348785
col3   -2.199530
col4    1.050170
Name: row1, dtype: float64

In [20]:
# using iloc for numeric index
df.iloc[0]

col1    1.067625
col2    0.348785
col3   -2.199530
col4    1.050170
Name: row1, dtype: float64

In [21]:
# Selecting specific values
df.loc['row1', 'col2']

0.3487853839262323

In [22]:
#Selecting several values
df.loc[['row1', 'row2'],['col1', 'col3']]

Unnamed: 0,col1,col3
row1,1.067625,-2.19953
row2,0.056525,-0.843097


In [23]:
# Conditional selecting
df[df['col1'] > 0]

Unnamed: 0,col1,col2,col3,col4
row1,1.067625,0.348785,-2.19953,1.05017
row2,0.056525,0.36664,-0.843097,-1.623633
row3,0.051514,-0.176979,0.700818,-2.142903
row5,1.941917,-1.767063,-0.336761,1.145178


In [24]:
# Another way
df[df.col1 > 0]

Unnamed: 0,col1,col2,col3,col4
row1,1.067625,0.348785,-2.19953,1.05017
row2,0.056525,0.36664,-0.843097,-1.623633
row3,0.051514,-0.176979,0.700818,-2.142903
row5,1.941917,-1.767063,-0.336761,1.145178


In [25]:
# Selecting from conditionals
df[df['col1'] > 0][['col2', 'col4']]

Unnamed: 0,col2,col4
row1,0.348785,1.05017
row2,0.36664,-1.623633
row3,-0.176979,-2.142903
row5,-1.767063,1.145178


In [26]:
# Another way
df[df.col1 > 0][['col2', 'col4']]

Unnamed: 0,col2,col4
row1,0.348785,1.05017
row2,0.36664,-1.623633
row3,-0.176979,-2.142903
row5,-1.767063,1.145178


In [27]:
# Multiple conditions
# & is for "and" in columns. | is for "or"
df[(df['col1'] < 0) & (df['col3'] < 0)]

Unnamed: 0,col1,col2,col3,col4
row4,-0.496451,1.276041,-0.047157,-0.843715


In [28]:
# Multiple levels for index
out_ = ['O1', 'O1', 'O1', 'O2', 'O2', 'O2']
in_ = [1, 2, 3, 1, 2, 3]
h_index = list(zip(out_, in_))
h_index = pd.MultiIndex.from_tuples(h_index)

In [29]:
import numpy as np
import pandas as pn

df_m = pn.DataFrame(np.random.rand(6,5), h_index, ['col1', 'col2', 'col3', 'col4', 'col5'])
df_m

Unnamed: 0,Unnamed: 1,col1,col2,col3,col4,col5
O1,1,0.344794,0.981136,0.417849,0.618606,0.404393
O1,2,0.962319,0.871739,0.657054,0.546215,0.003927
O1,3,0.126379,0.04578,0.438514,0.166817,0.286659
O2,1,0.702238,0.731196,0.389092,0.096492,0.85529
O2,2,0.432766,0.370186,0.931875,0.672306,0.618859
O2,3,0.250013,0.794912,0.780254,0.087169,0.44258


In [30]:
# Adding names to indexes
df_m.index.names = ['Groups','Nums']
df_m

Unnamed: 0_level_0,Unnamed: 1_level_0,col1,col2,col3,col4,col5
Groups,Nums,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
O1,1,0.344794,0.981136,0.417849,0.618606,0.404393
O1,2,0.962319,0.871739,0.657054,0.546215,0.003927
O1,3,0.126379,0.04578,0.438514,0.166817,0.286659
O2,1,0.702238,0.731196,0.389092,0.096492,0.85529
O2,2,0.432766,0.370186,0.931875,0.672306,0.618859
O2,3,0.250013,0.794912,0.780254,0.087169,0.44258


In [31]:
# Selecting a sub dataframe
df_m.loc['O1']

Unnamed: 0_level_0,col1,col2,col3,col4,col5
Nums,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.344794,0.981136,0.417849,0.618606,0.404393
2,0.962319,0.871739,0.657054,0.546215,0.003927
3,0.126379,0.04578,0.438514,0.166817,0.286659


In [32]:
# Using missing values
d = {'A':[10, np.nan], 'B':[np.nan, 50] }
df_d = pn.DataFrame(d)
df_d

Unnamed: 0,A,B
0,10.0,
1,,50.0


In [33]:
df_d.dropna()

Unnamed: 0,A,B


In [34]:
df_d.fillna('filled')

Unnamed: 0,A,B
0,10,filled
1,filled,50


## Modifying Dataframes

In [35]:
score1 = pd.Series([200, 300, 250, 600, 270], index = ['Juan', 'Mary', 'Peter', 'Ana', 'Alex'])
score2 = pd.Series([230, 320, 280, 570, 370], index = ['Juan', 'Mary', 'Peter', 'Ana', 'Alex'])
score3 = pd.Series([130, 370, 480, 370, 490], index = ['Juan', 'Mary', 'Peter', 'Ana', 'Alex'])

d_scores = pd.DataFrame({'Score1': score1, 'Score2': score1, 'Score3': score1})
d_scores

Unnamed: 0,Score1,Score2,Score3
Juan,200,200,200
Mary,300,300,300
Peter,250,250,250
Ana,600,600,600
Alex,270,270,270


In [36]:
# Changing column names
d_scores_n = d_scores.rename(columns = {'Score1': 'First_Year'})
d_scores_n

Unnamed: 0,First_Year,Score2,Score3
Juan,200,200,200
Mary,300,300,300
Peter,250,250,250
Ana,600,600,600
Alex,270,270,270


In [37]:
# Adding new column
d_scores_n['Score4'] = ([400, 340, 560, 450, 540]) 
d_scores_n

Unnamed: 0,First_Year,Score2,Score3,Score4
Juan,200,200,200,400
Mary,300,300,300,340
Peter,250,250,250,560
Ana,600,600,600,450
Alex,270,270,270,540
