In [15]:
import numpy as np
import pandas as pd

In [16]:
# 2014 trend values, listed in the same order as the language labels below.
scores_2014 = [100, 99.3, 95.5, 93.5, 92.4, 84.8, 84.5, 78.9, 74.3, 72.8]
languages_2014 = ['SQL', 'HTML', 'SAS', 'Python', 'Tableau',
                  'PHP', 'JavaScript', 'Ruby', 'R', 'Matlab']

# Pair each value with its language label to form a labelled Series.
trend2014 = pd.Series(data=scores_2014, index=languages_2014)

In [17]:
# 2015 trend values keyed by language name.
scores_2015 = {
    'SQL': 100,
    'HTML': 99.9,
    'SAS': 99.4,
    'Python': 96.5,
    'Tableau': 91.3,
    'R': 84.8,
    'PHP': 84.5,
    'JavaScript': 83.0,
    'Ruby': 76.2,
    'Matlab': 72.4,
}

# Passing a dict to pd.Series uses the keys as index labels and the values as data.
trend2015 = pd.Series(scores_2015)

In [18]:
# One column per year; the two Series are matched up by their language labels,
# so the differing label order in trend2014 and trend2015 does not matter.
yearly_trends = {'2014': trend2014, '2015': trend2015}
trend_in_two_years = pd.DataFrame(data=yearly_trends)
trend_in_two_years

Unnamed: 0,2014,2015
HTML,99.3,99.9
JavaScript,84.5,83.0
Matlab,72.8,72.4
PHP,84.8,84.5
Python,93.5,96.5
R,74.3,84.8
Ruby,78.9,76.2
SAS,95.5,99.4
SQL,100.0,100.0
Tableau,92.4,91.3


In [20]:
# Reorder the rows from the highest to the lowest 2015 value.
trend_in_two_years = trend_in_two_years.sort_values(by='2015', ascending=False)
trend_in_two_years

Unnamed: 0,2014,2015
SQL,100.0,100.0
HTML,99.3,99.9
SAS,95.5,99.4
Python,93.5,96.5
Tableau,92.4,91.3
R,74.3,84.8
PHP,84.8,84.5
JavaScript,84.5,83.0
Ruby,78.9,76.2
Matlab,72.8,72.4


##### Since pandas is built on top of NumPy, there is a NumPy array inside every DataFrame. We can extract it through the `values` attribute.

In [7]:
# The frame's data as a plain NumPy array; the row and column labels are not part of it.
trend_in_two_years.values

array([[ 100. ,  100. ],
       [  99.3,   99.9],
       [  95.5,   99.4],
       [  93.5,   96.5],
       [  92.4,   91.3],
       [  74.3,   84.8],
       [  84.8,   84.5],
       [  84.5,   83. ],
       [  78.9,   76.2],
       [  72.8,   72.4]])

In [8]:
# Row labels of the frame (here, the language names).
trend_in_two_years.index

Index(['SQL', 'HTML', 'SAS', 'Python', 'Tableau', 'R', 'PHP', 'JavaScript',
       'Ruby', 'Matlab'],
      dtype='object')

In [10]:
# Column labels of the frame (here, the two year strings).
trend_in_two_years.columns

Index(['2014', '2015'], dtype='object')

##### Indexing the DataFrame with square brackets and a column label returns that column

In [11]:
# Bracket indexing with a column label returns that column as a Series.
trend_in_two_years['2014']

SQL           100.0
HTML           99.3
SAS            95.5
Python         93.5
Tableau        92.4
R              74.3
PHP            84.8
JavaScript     84.5
Ruby           78.9
Matlab         72.8
Name: 2014, dtype: float64

In [12]:
# Same bracket lookup for the other column; the result is again a Series.
trend_in_two_years['2015']

SQL           100.0
HTML           99.9
SAS            99.4
Python         96.5
Tableau        91.3
R              84.8
PHP            84.5
JavaScript     83.0
Ruby           76.2
Matlab         72.4
Name: 2015, dtype: float64

### Select a subset of rows from the DataFrame

##### `iloc` selects rows by integer position (the end of a slice is excluded)

In [20]:
# Positional slice: rows at positions 1, 2 and 3 (the end position 4 is excluded).
trend_in_two_years.iloc[1:4]

Unnamed: 0,2014,2015
HTML,99.3,99.9
SAS,95.5,99.4
Python,93.5,96.5


##### `loc` selects rows by index label (both ends of a label slice are included)

In [24]:
# Label slice: every row from 'R' through 'Ruby' in the current row order,
# with both endpoints included.
trend_in_two_years.loc['R':'Ruby']

Unnamed: 0,2014,2015
R,74.3,84.8
PHP,84.8,84.5
JavaScript,84.5,83.0
Ruby,78.9,76.2


In [30]:
# Add an 'avg' column holding the mean of the two yearly values for each language.
two_year_total = trend_in_two_years['2014'] + trend_in_two_years['2015']
trend_in_two_years['avg'] = two_year_total / 2
trend_in_two_years

Unnamed: 0,2014,2015,avg
SQL,100.0,100.0,100.0
HTML,99.3,99.9,99.6
SAS,95.5,99.4,97.45
Python,93.5,96.5,95.0
Tableau,92.4,91.3,91.85
R,74.3,84.8,79.55
PHP,84.8,84.5,84.65
JavaScript,84.5,83.0,83.75
Ruby,78.9,76.2,77.55
Matlab,72.8,72.4,72.6


##### Use a list of Python dicts to make a DataFrame

Do not forget the round parentheses when calling `pd.DataFrame(...)`.

In [13]:
# One dict per president: each dict becomes a row and its keys become the column names.
president_records = [
    {'Name': 'Donald Trump', 'Inaugration': 2017, 'birthyear': 1946},
    {'Name': 'Barack Obama', 'Inaugration': 2009, 'birthyear': 1961},
    {'Name': 'George W. Bush', 'Inaugration': 2001, 'birthyear': 1946},
    {'Name': 'Clinton', 'Inaugration': 1993, 'birthyear': 1946},
]
presidents = pd.DataFrame(president_records)
presidents

Unnamed: 0,Inaugration,Name,birthyear
0,2017,Donald Trump,1946
1,2009,Barack Obama,1961
2,2001,George W. Bush,1946
3,1993,Clinton,1946


In [48]:
# Promote the 'Name' column to the row index so rows can be looked up by name.
# set_index returns a new frame; `presidents` itself is left unchanged.
presidents_index = presidents.set_index(keys='Name')
presidents_index

Unnamed: 0_level_0,Inaugration,birthyear
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Donald Trump,2017,1946
Barack Obama,2009,1961
George W. Bush,2001,1946
Clinton,1993,1946


##### We need the `.loc` indexer to select rows by label

In [49]:
# .loc with a single row label returns that row as a Series whose index is the column names.
presidents_index.loc['Clinton']

Inaugration    1993
birthyear      1946
Name: Clinton, dtype: int64

In [50]:
# Pick the row and the column in one .loc call instead of chaining two lookups
# (.loc[row][col]), which first builds an intermediate row Series.
presidents_index.loc['Clinton', 'Inaugration']

1993

In [51]:
# The other way round: take the column first, then look up the row label in that Series.
presidents_index['Inaugration'].loc['Clinton']

1993

### Simplest form of Join using Pandas merge

In [52]:
# One dict per son/father pair: each dict becomes a row of the lookup table.
father_records = [
    {'son': 'Donald Trump', 'father': 'Fred Trump'},
    {'son': 'Barack Obama', 'father': 'Barack Obama, Sr.'},
    {'son': 'George W. Bush', 'father': 'George H. W. Bush'},
    {'son': 'George H. W. Bush', 'father': 'Prescott Bush'},
]
presidents_fathers = pd.DataFrame(father_records)
presidents_fathers

Unnamed: 0,father,son
0,Fred Trump,Donald Trump
1,"Barack Obama, Sr.",Barack Obama
2,George H. W. Bush,George W. Bush
3,Prescott Bush,George H. W. Bush


In [53]:
# Inner join (the default for merge): keep only the presidents whose 'Name'
# matches a 'son' entry in presidents_fathers.
presidents.merge(presidents_fathers, how='inner', left_on='Name', right_on='son')

Unnamed: 0,Inaugration,Name,birthyear,father,son
0,2017,Donald Trump,1946,Fred Trump,Donald Trump
1,2009,Barack Obama,1961,"Barack Obama, Sr.",Barack Obama
2,2001,George W. Bush,1946,George H. W. Bush,George W. Bush


In [55]:
# Left join: keep every president, leaving 'father' missing where no 'son' entry matches.
presidents_with_fathers = presidents.merge(
    presidents_fathers, how='left', left_on='Name', right_on='son'
)
# 'son' only repeats 'Name' for the matched rows, so drop it from the displayed result.
presidents_with_fathers.drop('son', axis=1)

Unnamed: 0,Inaugration,Name,birthyear,father
0,2017,Donald Trump,1946,Fred Trump
1,2009,Barack Obama,1961,"Barack Obama, Sr."
2,2001,George W. Bush,1946,George H. W. Bush
3,1993,Clinton,1946,


##### Build a sample DataFrame from randomly chosen values

In [56]:
# Draw the letters from a seeded generator so the cell produces the same
# frame on every run (an unseeded np.random.choice changes on each execution).
rng = np.random.RandomState(42)

# 3x3 grid of letters sampled with replacement, labelled with rows 1-3 and columns A-C.
random_letters = pd.DataFrame(rng.choice(['a', 'b', 'c', 'd'], (3, 3)),
                              index=[1, 2, 3], columns=['A', 'B', 'C'])
random_letters

Unnamed: 0,A,B,C
1,c,c,b
2,a,b,a
3,b,a,b
