In [2]:
import numpy as np
import pandas as pd

In [3]:
from numpy.random import randn

# Seed NumPy's legacy global generator so every run draws the same numbers.
np.random.seed(101)

# 5 rows x 4 columns of standard-normal samples.
matrix_shape = (5, 4)
rand_mat = randn(*matrix_shape)

In [4]:
# display the seeded 5x4 random matrix
rand_mat

array([[ 2.70684984,  0.62813271,  0.90796945,  0.50382575],
       [ 0.65111795, -0.31931804, -0.84807698,  0.60596535],
       [-2.01816824,  0.74012206,  0.52881349, -0.58900053],
       [ 0.18869531, -0.75887206, -0.93323722,  0.95505651],
       [ 0.19079432,  1.97875732,  2.60596728,  0.68350889]])

In [5]:
# Wrap the 2-D array in a DataFrame, using letters as row and column labels.
row_labels = 'A B C D E'.split()
column_labels = 'W X Y Z'.split()
df = pd.DataFrame(rand_mat, index=row_labels, columns=column_labels)
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


#### Selecting data from a DataFrame

In [6]:
# select a single column by its label (the result is a Series)
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [7]:
# a single column is returned as a pandas Series
type(df['W'])

pandas.core.series.Series

In [8]:
# select two columns by passing a list of labels (the result is a DataFrame)
df[['W','Y']]

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077
C,-2.018168,0.528813
D,0.188695,-0.933237
E,0.190794,2.605967


In [9]:
# a list of column labels returns a DataFrame, not a Series
type(df[['W','Y']])

pandas.core.frame.DataFrame

In [10]:
# create a new column as the element-wise sum of two existing columns
df['NEW'] = df['W'] + df['Y']
df

Unnamed: 0,W,X,Y,Z,NEW
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [11]:
# drop a row by its label; this returns a new DataFrame and leaves df unchanged
df.drop('A')

Unnamed: 0,W,X,Y,Z,NEW
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [12]:
# drop a column: axis=1 targets columns; this also returns a new DataFrame
df.drop('W', axis=1)

Unnamed: 0,X,Y,Z,NEW
A,0.628133,0.907969,0.503826,3.614819
B,-0.319318,-0.848077,0.605965,-0.196959
C,0.740122,0.528813,-0.589001,-1.489355
D,-0.758872,-0.933237,0.955057,-0.744542
E,1.978757,2.605967,0.683509,2.796762


In [13]:
# df is unchanged: drop() without inplace=True returned new DataFrames
df

Unnamed: 0,W,X,Y,Z,NEW
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [14]:
# to make changes permanent, pass inplace=True (df itself is modified).
# Re-running this cell raises KeyError because row 'A' no longer exists.
df.drop('A', inplace=True)

In [15]:
# row 'A' is now gone from df itself
df

Unnamed: 0,W,X,Y,Z,NEW
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [16]:
# selecting rows
# by label: .loc returns the row as a Series indexed by the column names
df.loc['C']

W     -2.018168
X      0.740122
Y      0.528813
Z     -0.589001
NEW   -1.489355
Name: C, dtype: float64

In [17]:
# by integer position (0-based): position 1 is row 'C' because row 'A'
# was dropped in place earlier
df.iloc[1]

W     -2.018168
X      0.740122
Y      0.528813
Z     -0.589001
NEW   -1.489355
Name: C, dtype: float64

In [18]:
# two rows by label: a list of labels returns a DataFrame
df.loc[['B', 'E']]

Unnamed: 0,W,X,Y,Z,NEW
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [19]:
# get a subset of the DataFrame by row labels and column labels
# (similar to taking a subset of a NumPy 2-D array)
df.loc[['C','D'],['W','Y']]

Unnamed: 0,W,Y
C,-2.018168,0.528813
D,0.188695,-0.933237


# Part 2: conditional selection

In [21]:
# generate a boolean DataFrame whose entries are the result of the
# element-wise comparison
df > 0

Unnamed: 0,W,X,Y,Z,NEW
B,True,False,False,True,False
C,False,True,True,False,False
D,True,False,False,True,False
E,True,True,True,True,True


In [22]:
# keep the boolean mask so it can be used to index df
df_bool = df > 0

In [24]:
# apply the boolean mask to the original df: entries where the mask is
# False are shown as missing values (NaN)
df[df_bool]

Unnamed: 0,W,X,Y,Z,NEW
B,0.651118,,,0.605965,
C,,0.740122,0.528813,,
D,0.188695,,,0.955057,
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [26]:
# the same comparison on a single column gives a boolean Series
df['W'] > 0

B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [27]:
# keep only the rows whose value in column 'W' is positive
# (this returns a new DataFrame; df itself is not modified)
df[df['W'] > 0]

Unnamed: 0,W,X,Y,Z,NEW
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [30]:
# multiple conditions: build one boolean Series per condition
cond1 = df['W'] > 0  # rows where W is positive
cond2 = df['Y'] > 1  # rows where Y is greater than 1

In [31]:
# combine the boolean Series element-wise with & (AND) and keep the rows
# that satisfy both conditions
df[cond1 & cond2]

Unnamed: 0,W,X,Y,Z,NEW
E,0.190794,1.978757,2.605967,0.683509,2.796762


### Resetting and setting the index

In [32]:
# current state of df before changing its index
df

Unnamed: 0,W,X,Y,Z,NEW
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [33]:
# move the current row labels into a column named 'index' and number the
# rows 0..n-1; this returns a new DataFrame
df.reset_index()

Unnamed: 0,index,W,X,Y,Z,NEW
0,B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
1,C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
2,D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
3,E,0.190794,1.978757,2.605967,0.683509,2.796762


In [44]:
# set a column as the index, step 1: build one new label per row
new_col = 'AS DF GR BV'.split()

In [45]:
# step 2: add the labels as a regular column (one entry per row of df)
df['NEWW'] = new_col

In [46]:
# NOTE(review): the saved output below already shows NEWW as the index name as
# well as a column, and the execution counts jump from 33 to 44 — this cell
# looks like it was last run after the in-place set_index further down.
# Restart the kernel and run all cells to refresh the output.
df

Unnamed: 0_level_0,W,X,Y,Z,NEW,NEWW
NEWW,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AS,0.651118,-0.319318,-0.848077,0.605965,-0.196959,AS
DF,-2.018168,0.740122,0.528813,-0.589001,-1.489355,DF
GR,0.188695,-0.758872,-0.933237,0.955057,-0.744542,GR
BV,0.190794,1.978757,2.605967,0.683509,2.796762,BV


In [47]:
# step 3: use the NEWW column as the index; this returns a new DataFrame
# and does not modify df
df.set_index('NEWW')

Unnamed: 0_level_0,W,X,Y,Z,NEW
NEWW,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AS,0.651118,-0.319318,-0.848077,0.605965,-0.196959
DF,-2.018168,0.740122,0.528813,-0.589001,-1.489355
GR,0.188695,-0.758872,-0.933237,0.955057,-0.744542
BV,0.190794,1.978757,2.605967,0.683509,2.796762


In [48]:
# df still has NEWW as a regular column: set_index() without inplace=True
# returned a new DataFrame
df

Unnamed: 0_level_0,W,X,Y,Z,NEW,NEWW
NEWW,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AS,0.651118,-0.319318,-0.848077,0.605965,-0.196959,AS
DF,-2.018168,0.740122,0.528813,-0.589001,-1.489355,DF
GR,0.188695,-0.758872,-0.933237,0.955057,-0.744542,GR
BV,0.190794,1.978757,2.605967,0.683509,2.796762,BV


In [49]:
# inplace=True modifies df itself: NEWW becomes the index and is removed from
# the columns. Re-running this cell raises KeyError because the NEWW column
# no longer exists.
df.set_index('NEWW', inplace=True)
df

Unnamed: 0_level_0,W,X,Y,Z,NEW
NEWW,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AS,0.651118,-0.319318,-0.848077,0.605965,-0.196959
DF,-2.018168,0.740122,0.528813,-0.589001,-1.489355
GR,0.188695,-0.758872,-0.933237,0.955057,-0.744542
BV,0.190794,1.978757,2.605967,0.683509,2.796762


In [52]:
# summary of the DataFrame: index entries, per-column non-null counts and
# dtypes, and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, AS to BV
Data columns (total 5 columns):
W      4 non-null float64
X      4 non-null float64
Y      4 non-null float64
Z      4 non-null float64
NEW    4 non-null float64
dtypes: float64(5)
memory usage: 192.0+ bytes


In [53]:
# data type of each column
df.dtypes

W      float64
X      float64
Y      float64
Z      float64
NEW    float64
dtype: object

In [55]:
# summary statistics per numeric column (count, mean, std, min, quartiles, max);
# several of these (e.g. 'mean', 'std', 'min', 'max') can also be requested
# individually via agg()
df.describe()

Unnamed: 0,W,X,Y,Z,NEW
count,4.0,4.0,4.0,4.0,4.0
mean,-0.24689,0.410172,0.338367,0.413883,0.091476
std,1.200715,1.220402,1.653573,0.685137,1.879691
min,-2.018168,-0.758872,-0.933237,-0.589001,-1.489355
25%,-0.363021,-0.429207,-0.869367,0.307224,-0.930745
50%,0.189745,0.210402,-0.159632,0.644737,-0.47075
75%,0.305875,1.049781,1.048102,0.751396,0.551471
max,0.651118,1.978757,2.605967,0.955057,2.796762


In [56]:
# boolean Series: True where column 'W' is positive
df['W'] > 0

NEWW
AS     True
DF    False
GR     True
BV     True
Name: W, dtype: bool

In [57]:
# keep the boolean Series so it can be summarised in the following cells
ser_w = df['W'] > 0

In [58]:
# count how many times each distinct value (True / False) occurs
ser_w.value_counts()

True     3
False    1
Name: W, dtype: int64

In [59]:
# count the True values: Series.sum() is vectorized and treats True as 1,
# whereas the built-in sum() iterates over the Series element by element
ser_w.sum()

3

In [60]:
# total number of rows: len() counts every entry of the boolean Series,
# True or False, so the stored ser_w is reused instead of recomputing it
len(ser_w)

4