### Pandas

In [1]:
#importing pandas and checking the version installed
import pandas 
pandas.__version__

'1.0.1'

### Pandas objects

In [2]:
import numpy as np
import pandas as pd

### Series

In [3]:
#index is implicitly provided
s1=pd.Series([600,297,287,378])
s1

0    600
1    297
2    287
3    378
dtype: int64

In [4]:
s1.values

array([600, 297, 287, 378], dtype=int64)

In [5]:
s1.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
#explicitly providing index
Bacteria=pd.Series([120,293,7242,3782],index=['Actinobacteria', 'Bacteroidetes','Firmicutes', 'Proteobacteria'])
Bacteria

Actinobacteria     120
Bacteroidetes      293
Firmicutes        7242
Proteobacteria    3782
dtype: int64

In [7]:
#accessing the value
Bacteria['Bacteroidetes']

293

In [8]:
#positional indexing
Bacteria[3]

3782

In [9]:
Bacteria.name='count'
Bacteria.index.name='phylum'
Bacteria

phylum
Actinobacteria     120
Bacteroidetes      293
Firmicutes        7242
Proteobacteria    3782
Name: count, dtype: int64

In [10]:
#appplying functions on series values
np.log(Bacteria)

phylum
Actinobacteria    4.787492
Bacteroidetes     5.680173
Firmicutes        8.887653
Proteobacteria    8.238008
Name: count, dtype: float64

In [11]:
#filtering according to values 
Bacteria[Bacteria<=5000]

phylum
Actinobacteria     120
Bacteroidetes      293
Proteobacteria    3782
Name: count, dtype: int64

In [12]:
#series from dictionary
dict={'1MJ17CS725':25,'1MJ17CS751':51,'1MJ17CS731':31,'1MJ17CS735':35}
print(dict)
pd.Series(dict)

{'1MJ17CS725': 25, '1MJ17CS751': 51, '1MJ17CS731': 31, '1MJ17CS735': 35}


1MJ17CS725    25
1MJ17CS751    51
1MJ17CS731    31
1MJ17CS735    35
dtype: int64

### DataFrame:Bi-Dimensional Series with two or more indices

In [13]:
#dataframe created from dictionaries
fam={"name":["bhavana","yamini","priya","suhas","brinda"],"year_of_birth":[1999,1998,1995,1999,2007],"Age":[21,22,25,21,13]}
print(fam)
fam=pd.DataFrame(fam)
fam

{'name': ['bhavana', 'yamini', 'priya', 'suhas', 'brinda'], 'year_of_birth': [1999, 1998, 1995, 1999, 2007], 'Age': [21, 22, 25, 21, 13]}


Unnamed: 0,name,year_of_birth,Age
0,bhavana,1999,21
1,yamini,1998,22
2,priya,1995,25
3,suhas,1999,21
4,brinda,2007,13


In [14]:
#changing order of columns
family=pd.DataFrame(fam,columns=["name","Age","year_of_birth"])
family

Unnamed: 0,name,Age,year_of_birth
0,bhavana,21,1999
1,yamini,22,1998
2,priya,25,1995
3,suhas,21,1999
4,brinda,13,2007


In [15]:
#assigning new columns to dataframe
family["curr_year"]=family.year_of_birth+family.Age
family

Unnamed: 0,name,Age,year_of_birth,curr_year
0,bhavana,21,1999,2020
1,yamini,22,1998,2020
2,priya,25,1995,2020
3,suhas,21,1999,2020
4,brinda,13,2007,2020


In [16]:
family["eldest"]=pd.Series(range(1,6,1),index=[2,1,3,0,4])
family

Unnamed: 0,name,Age,year_of_birth,curr_year,eldest
0,bhavana,21,1999,2020,4
1,yamini,22,1998,2020,2
2,priya,25,1995,2020,1
3,suhas,21,1999,2020,3
4,brinda,13,2007,2020,5


In [17]:
#converting dataframe into the dictionary(values of dict are also dicts)
family.to_dict()

{'name': {0: 'bhavana', 1: 'yamini', 2: 'priya', 3: 'suhas', 4: 'brinda'},
 'Age': {0: 21, 1: 22, 2: 25, 3: 21, 4: 13},
 'year_of_birth': {0: 1999, 1: 1998, 2: 1995, 3: 1999, 4: 2007},
 'curr_year': {0: 2020, 1: 2020, 2: 2020, 3: 2020, 4: 2020},
 'eldest': {0: 4, 1: 2, 2: 1, 3: 3, 4: 5}}

In [18]:
#passing a dicts where the values are dicts
pd.DataFrame(family.to_dict())

Unnamed: 0,name,Age,year_of_birth,curr_year,eldest
0,bhavana,21,1999,2020,4
1,yamini,22,1998,2020,2
2,priya,25,1995,2020,1
3,suhas,21,1999,2020,3
4,brinda,13,2007,2020,5


In [19]:
#list to dicts
square_table=[{'a':i,'a^2':i**2}for i in range(8)]
print(square_table)
pd.DataFrame(square_table)

[{'a': 0, 'a^2': 0}, {'a': 1, 'a^2': 1}, {'a': 2, 'a^2': 4}, {'a': 3, 'a^2': 9}, {'a': 4, 'a^2': 16}, {'a': 5, 'a^2': 25}, {'a': 6, 'a^2': 36}, {'a': 7, 'a^2': 49}]


Unnamed: 0,a,a^2
0,0,0
1,1,1
2,2,4
3,3,9
4,4,16
5,5,25
6,6,36
7,7,49


In [20]:
#missing values in the dataframe is filled with NAN("not a number") automatically by pandas
f=pd.DataFrame([{'Name':'Bhavana','Friend':'Sreeja'},{'Name':'Millee','Phone':'255264'}],index=[1,2])
print(f)
print(f.to_dict())

      Name  Friend   Phone
1  Bhavana  Sreeja     NaN
2   Millee     NaN  255264
{'Name': {1: 'Bhavana', 2: 'Millee'}, 'Friend': {1: 'Sreeja', 2: nan}, 'Phone': {1: nan, 2: '255264'}}


In [21]:
#two-dimensional numpy array to dataframe
pd.DataFrame(np.random.randint(0,10),columns=['a1','a2','a3'],index=['b1','b2','b3'])

Unnamed: 0,a1,a2,a3
b1,1,1,1
b2,1,1,1
b3,1,1,1


### The pandas Index Object

In [22]:
indexes=pd.Index([725,751,731,735,749])
indexes

Int64Index([725, 751, 731, 735, 749], dtype='int64')

``index acts as immutable array``


In [23]:
#accessing values in index object using pyhton indexing
indexes[2]

731

In [24]:
#slicing over the index object
indexes[0:4:2]

Int64Index([725, 731], dtype='int64')

In [25]:
indexes[::]

Int64Index([725, 751, 731, 735, 749], dtype='int64')

In [26]:
#attributes associated with index object
print(indexes.shape,indexes.ndim,indexes.size,indexes.dtype)

(5,) 1 5 int64


In [27]:
#index object is immutable
#throws error
#indexes[2]=0

### Operating on data in pandas

#### Ufuncs:Index Preservation

In [28]:
fsc=np.random.RandomState(10)
series=pd.Series(fsc.randint(0,10,4))
series

0    9
1    4
2    0
3    1
dtype: int32

In [29]:
hgc=pd.DataFrame(fsc.randint(0,10,(5,4)),columns=['a','b','c','d'])
hgc

Unnamed: 0,a,b,c,d
0,9,0,1,8
1,9,0,8,6
2,4,3,0,4
3,6,8,1,8
4,4,1,3,6


In [30]:
np.exp(series)

0    8103.083928
1      54.598150
2       1.000000
3       2.718282
dtype: float64

In [31]:
np.cos(hgc*np.pi/180)

Unnamed: 0,a,b,c,d
0,0.987688,1.0,0.999848,0.990268
1,0.987688,1.0,0.990268,0.994522
2,0.997564,0.99863,1.0,0.997564
3,0.994522,0.990268,0.999848,0.990268
4,0.997564,0.999848,0.99863,0.994522


### Universal functions:index allignment

In [32]:
#creating series named area and population
area=pd.Series({'Alaska':192867,'Texas':732916,'California':418290},name='area')
population=pd.Series({'California':11681289,'Texas':72027857,'NewYork':19283709},name='population')
print(area)
population

Alaska        192867
Texas         732916
California    418290
Name: area, dtype: int64


California    11681289
Texas         72027857
NewYork       19283709
Name: population, dtype: int64

In [33]:
#computing populatpion density
population/area

Alaska              NaN
California    27.926293
NewYork             NaN
Texas         98.275733
dtype: float64

In [34]:
#union(all indices from both series)
area.index|population.index

Index(['Alaska', 'California', 'NewYork', 'Texas'], dtype='object')

In [35]:
a=pd.Series([7,3,8],index=[1,2,3])
b=pd.Series([4,2,6],index=[2,3,6])
print(a)
print(b)
b
a+b

1    7
2    3
3    8
dtype: int64
2    4
3    2
6    6
dtype: int64


1     NaN
2     7.0
3    10.0
6     NaN
dtype: float64

In [36]:
#filling NAN values to explicitly using add()
a.add(b,fill_value=0)

1     7.0
2     7.0
3    10.0
6     6.0
dtype: float64

### data wrangling

#### merge operations

In [37]:
df1= pd.DataFrame( {"Province": ["FL", "FL", "NH", "NH", "ZH"],
        "Year": [2013, 2014, 2013, 2014, 2014],
        "Literacy": [0.2, 0.1, 0.5, 0.3, 0.5]})
df1['nonsense'] = df1.Year / df1.Literacy
df1['Serie_aligned'] = pd.Series(range(5), index=[0,1,2, 3, 4])

In [38]:
df1

Unnamed: 0,Province,Year,Literacy,nonsense,Serie_aligned
0,FL,2013,0.2,10065.0,0
1,FL,2014,0.1,20140.0,1
2,NH,2013,0.5,4026.0,2
3,NH,2014,0.3,6713.333333,3
4,ZH,2014,0.5,4028.0,4


In [39]:
df2 = pd.DataFrame({"Province": ["FL", "NH", "ZH"], "Population": ["100000", "200000", "300000"]})
df2

Unnamed: 0,Province,Population
0,FL,100000
1,NH,200000
2,ZH,300000


In [40]:
#if there are overlapping names then it uses those for the merge
df1.merge(df2)

Unnamed: 0,Province,Year,Literacy,nonsense,Serie_aligned,Population
0,FL,2013,0.2,10065.0,0,100000
1,FL,2014,0.1,20140.0,1,100000
2,NH,2013,0.5,4026.0,2,200000
3,NH,2014,0.3,6713.333333,3,200000
4,ZH,2014,0.5,4028.0,4,300000


In [41]:
#if the column names are different then we have to specify them explicitely
df3 = pd.DataFrame({"province": ["FL", "NH"], "Population": ["100000", "200000"]})
df3
df1.merge(df3, right_on='province', left_on='Province')#merge does inner join by default

Unnamed: 0,Province,Year,Literacy,nonsense,Serie_aligned,province,Population
0,FL,2013,0.2,10065.0,0,FL,100000
1,FL,2014,0.1,20140.0,1,FL,100000
2,NH,2013,0.5,4026.0,2,NH,200000
3,NH,2014,0.3,6713.333333,3,NH,200000


In [42]:
#if other types of joins are needed,we can explicitly give that
df4 = pd.DataFrame({"Province": ["FL", "NH", "UT"], "Population": ["100000", "200000", "50000"]})
df1.merge(df4, how='outer')#here,we do outer merge)

Unnamed: 0,Province,Year,Literacy,nonsense,Serie_aligned,Population
0,FL,2013.0,0.2,10065.0,0.0,100000.0
1,FL,2014.0,0.1,20140.0,1.0,100000.0
2,NH,2013.0,0.5,4026.0,2.0,200000.0
3,NH,2014.0,0.3,6713.333333,3.0,200000.0
4,ZH,2014.0,0.5,4028.0,4.0,
5,UT,,,,,50000.0


#### combining data with overlap

In [43]:
a= pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],index=['f', 'e', 'd', 'c', 'b', 'a'])
b= pd.Series(np.arange(len(a), dtype=np.float64),index=['f', 'e', 'd', 'c', 'b', 'a'])

In [44]:
a

f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64

In [45]:
b

f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    5.0
dtype: float64

In [46]:
pd.Series(np.where(pd.isnull(a),b,a),index=a.index)

f    0.0
e    2.5
d    2.0
c    3.5
b    4.5
a    5.0
dtype: float64

In [47]:
a.combine_first(b)

f    0.0
e    2.5
d    2.0
c    3.5
b    4.5
a    5.0
dtype: float64