# Selecting Entries

In [1]:
import numpy as np
from pandas import Series, DataFrame
import pandas as pd

In [2]:
# Selecting Entries in a Series:
# Build a three-element Series labelled A, B, C. The values are doubled
# (0, 2, 4) so they cannot be mistaken for the positions 0, 1, 2 later on.
ser1 = Series(data=np.arange(3), index=['A', 'B', 'C']).mul(2)
ser1

A    0
B    2
C    4
dtype: int32

In [3]:
# You can grab an entry by index label: ser1['B'] returns 2,
# or by position: ser1.iloc[1] returns 2 (plain ser1[1] also works on older pandas, but integer-position lookup on a labelled index is deprecated in recent versions),
# or by a positional slice: ser1[0:2] returns rows A:0 and B:2,
# or by a list of index labels: ser1[['A','B']] returns rows A:0 and B:2.
# You can select entries with a Boolean condition: ser1[ser1>3] returns row C:4.
# You can assign through a Boolean condition: ser1[ser1>3] = 10 changes C to 10.

In [4]:
# Selecting Entries in a DataFrame:
# A 5x5 frame holding the integers 0..24, with city names as row labels
# and the letters A-E as column labels.
dframe = DataFrame(
    data=np.arange(25).reshape(5, 5),
    index=['NYC', 'LA', 'SF', 'DC', 'Chi'],
    columns=['A', 'B', 'C', 'D', 'E'],
)


In [5]:
# Display the 5x5 frame built in the previous cell.
dframe

Unnamed: 0,A,B,C,D,E
NYC,0,1,2,3,4
LA,5,6,7,8,9
SF,10,11,12,13,14
DC,15,16,17,18,19
Chi,20,21,22,23,24


In [6]:
# Select one column by its label. The result is a Series of column B's values
# indexed by the row labels (shorthand for the same selection: dframe['B']).
dframe.loc[:, 'B']


NYC     1
LA      6
SF     11
DC     16
Chi    21
Name: B, dtype: int32

In [7]:
# Pass a list of labels to select several columns at once; the result is a
# DataFrame (shorthand for the same selection: dframe[['B','E']]).
dframe.loc[:, ['B', 'E']]


Unnamed: 0,B,E
NYC,1,4
LA,6,9
SF,11,14
DC,16,19
Chi,21,24


In [8]:
# Show dframe again for reference; the selections above returned new objects and left it unchanged.
dframe

Unnamed: 0,A,B,C,D,E
NYC,0,1,2,3,4
LA,5,6,7,8,9
SF,10,11,12,13,14
DC,15,16,17,18,19
Chi,20,21,22,23,24


In [9]:
# Filter rows with a Boolean mask: keep only the rows whose E value exceeds 8
# (shorthand for the same selection: dframe[dframe['E']>8]).
dframe.loc[dframe['E'].gt(8)]


Unnamed: 0,A,B,C,D,E
LA,5,6,7,8,9
SF,10,11,12,13,14
DC,15,16,17,18,19
Chi,20,21,22,23,24


In [10]:
# Grab a single cell by row label and column label in one .loc lookup.
# This replaces the chained form dframe['B']['LA'], which performs two separate
# lookups and is unsafe as a pattern once it is used for assignment.
dframe.loc['LA', 'B']


6

In [11]:
# Compare every cell against 10. The result is a DataFrame of Booleans with the
# same row and column labels (operator shorthand: dframe>10).
dframe.gt(10)

Unnamed: 0,A,B,C,D,E
NYC,False,False,False,False,False
LA,False,False,False,False,False
SF,False,True,True,True,True
DC,True,True,True,True,True
Chi,True,True,True,True,True


# Data Alignment

In [12]:
# Data Alignment
# Two Series whose indexes only partly overlap: ser2 carries an extra label 'D'.
ser1 = Series(data=[0, 1, 2], index=list('ABC'))
ser2 = Series(data=[3, 4, 5, 6], index=list('ABCD'))  # list('ABCD') is a shortcut for ['A','B','C','D']
ser1

A    0
B    1
C    2
dtype: int64

In [13]:
# Element-wise addition aligned on index labels (same as ser1 + ser2).
# Label 'D' exists only in ser2, so its result is NaN and the dtype becomes float64.
ser1.add(ser2)

A    3.0
B    5.0
C    7.0
D    NaN
dtype: float64

In [14]:
# 'D' is missing from ser1, so plain addition leaves a null there.
# DataFrames behave the same way: any unmatched field gets a null.
# Pass fill_value to .add to substitute a value for the missing side.
# With fill_value=0 the missing entry is treated as 0 before adding, and the
# operation is symmetric: ser2.add(ser1, fill_value=0) returns the same thing.
# For DataFrames, .add with fill_value still yields null where neither frame has
# a value (at the intersection where new rows from one DataFrame meet new
# columns from another).
ser1.add(other=ser2, fill_value=0)


A    3.0
B    5.0
C    7.0
D    6.0
dtype: float64

In [15]:
# Operations Between a Series and a DataFrame
dframe1 = DataFrame(np.arange(9).reshape(3,3),columns=list('ADC'), index=['NYC','SF','LA'])
# Take the first row by position, so ser1 holds the 'NYC' row as a Series
# indexed by the column labels.
# .iloc replaces the old .ix indexer, which was deprecated in pandas 0.20 and
# removed in pandas 1.0 (calling .ix there raises AttributeError).
ser1 = dframe1.iloc[0]


In [16]:
# Display dframe1: three city rows with columns A, D, C.
dframe1

Unnamed: 0,A,D,C
NYC,0,1,2
SF,3,4,5
LA,6,7,8


In [17]:
# ser1 is the first row of dframe1 (label 'NYC'), shown as a Series indexed by the column labels.
ser1

A    0
D    1
C    2
Name: NYC, dtype: int32

In [18]:
# Subtract ser1 (the 'NYC' row) from every row of dframe1, matching on column
# labels (same as dframe1 - ser1). The 'NYC' row therefore becomes all zeros.
dframe1.sub(ser1, axis='columns')

Unnamed: 0,A,D,C
NYC,0,0,0
SF,3,3,3
LA,6,6,6


In [30]:
from numpy.random import randn
# 5x5 frame of standard-normal draws. No seed is set, so the values (and the
# outputs of the cells that follow) change on every run.
dframe = DataFrame(
    data=randn(5, 5),
    index=['A', 'B', 'D', 'E', 'F'],
    columns=['col1', 'col2', 'col3', 'col4', 'col5'],
)

In [31]:
# Count how often each distinct value appears in column col1.
# Results are ordered from the most frequent value to the least frequent.
dframe.loc[:, 'col1'].value_counts(sort=True, ascending=False)


 1.686253    1
 0.816522    1
 1.498512    1
-0.014961    1
-1.129562    1
Name: col1, dtype: int64

# Summary Statistics in DataFrame

In [21]:
# Two rows with one missing value each; np.nan marks the nulls.
arr = np.array([
    [1, 2, np.nan],
    [np.nan, 3, 4],
])
dframe1 = DataFrame(data=arr, index=['A', 'B'], columns=['One', 'Two', 'Three'])
dframe1

Unnamed: 0,One,Two,Three
A,1.0,2.0,
B,,3.0,4.0


In [22]:
# Column totals: sum down the rows of each column; nulls are skipped.
dframe1.sum(axis='index')


One      1.0
Two      5.0
Three    4.0
dtype: float64

In [23]:
# Row totals: sum across the columns of each row; nulls are skipped.
dframe1.sum(axis='columns')

A    3.0
B    7.0
dtype: float64

In [24]:
# Smallest non-null value in each column.
dframe1.min(axis='index')

One      1.0
Two      2.0
Three    4.0
dtype: float64

In [25]:
# For each column, find the row label that holds the minimum value.
dframe1.idxmin(axis='index')

One      A
Two      A
Three    B
dtype: object

In [26]:
# Display dframe1 again for comparison with the cumulative sums in the next cell.
dframe1

Unnamed: 0,One,Two,Three
A,1.0,2.0,
B,,3.0,4.0


In [27]:
# Running total down each column; null cells stay null in the result.
dframe1.cumsum(axis='index')

Unnamed: 0,One,Two,Three
A,1.0,2.0,
B,,5.0,4.0


In [28]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
dframe1.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,One,Two,Three
count,1.0,2.0,1.0
mean,1.0,2.5,4.0
std,,0.707107,
min,1.0,2.0,4.0
25%,1.0,2.25,4.0
50%,1.0,2.5,4.0
75%,1.0,2.75,4.0
max,1.0,3.0,4.0
