### Getting Started with Pandas

In [2]:
# Import pandas and NumPy; the cells below use them to construct DataFrames

import pandas as pd
import numpy as np

In [3]:
# Column-oriented input: each key names a column and maps to that column's values.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}

# The three lists have equal length, so they line up row by row in the DataFrame.
frame = pd.DataFrame(data)


In [5]:
# Plain-text rendering of the whole frame via print()
print(frame)


   pop   state  year
0  1.5    Ohio  2000
1  1.7    Ohio  2001
2  3.6    Ohio  2002
3  2.4  Nevada  2001
4  2.9  Nevada  2002
5  3.2  Nevada  2003


In [7]:
# head() returns only the first rows (5 here) - useful for peeking at a large dataset
frame.head(n=5)

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [8]:
# Passing a sequence as columns= arranges the DataFrame's columns in that order
pd.DataFrame(
    data,
    columns=['year', 'state', 'pop'],
)


Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [10]:
# Retrieve a single column of the DataFrame by its name

frame['state']


0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

In [11]:
# Another common input shape is a dict of dicts: one inner dict per outer key
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}


In [12]:
# Build a DataFrame from the nested dict: the outer keys (states) end up as
# columns and the inner keys (years) as the row index
frame3 = pd.DataFrame(pop)


In [13]:
# Display frame3; `pop` has no 2000 entry for Nevada, so that cell is missing
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [14]:
# Transpose the DataFrame: .T swaps the row index and the columns

frame3.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [19]:
# Example DataFrame with row labels (the index) on the left and column names on top.
# NOTE: this rebinds `data`, which an earlier cell used for a plain dict.
data = pd.DataFrame(
    np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)


In [20]:
# Display the 4x4 example frame
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [23]:
# drop() with a list of row labels: the result omits those rows

data.drop(['Colorado', 'Ohio'])


Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [24]:
# drop() with axis='columns': the label now names a column to remove

data.drop('three', axis='columns')


Unnamed: 0,one,two,four
Ohio,0,1,3
Colorado,4,5,7
Utah,8,9,11
New York,12,13,15


In [25]:
# A list of labels drops several columns at once
data.drop(['three', 'four'], axis='columns')


Unnamed: 0,one,two
Ohio,0,1
Colorado,4,5
Utah,8,9
New York,12,13


In [26]:
# Float Series holding 0.0 through 3.0, labelled 'a' through 'd'
obj = pd.Series(
    np.arange(4.0),
    index=['a', 'b', 'c', 'd'],
)


In [27]:
# Display the Series with its index labels
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [28]:
# Look up a single value in obj by its index label 'b'

obj['b']


1.0

In [29]:
# Integer slice: selects positions 1 and 2 (end-exclusive), i.e. labels 'b' and 'c'
obj[1:3]

b    1.0
c    2.0
dtype: float64

In [31]:
# A list of column names inside [] selects those columns, in the order given
data[['two', 'three']]


Unnamed: 0,two,three
Ohio,1,2
Colorado,5,6
Utah,9,10
New York,13,14


In [32]:
# Label-based selection with loc: the row and the columns are picked by label

data.loc['Colorado', ['two', 'three']] # Select a single row and multiple columns


two      5
three    6
Name: Colorado, dtype: int64

In [35]:
# Integer-position selection with iloc: row 2, then columns 3, 0 and 1 in that order
data.iloc[2, [3, 0, 1]]


four    11
one      8
two      9
Name: Utah, dtype: int64

In [36]:
# A single integer selects the whole row at that position (position 2 is 'Utah')
data.iloc[2]


one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

In [4]:
# Sorting and Ranking
# sort_index() orders a Series by its index labels
objectz = pd.Series(
    range(4),
    index=['d', 'a', 'b', 'c'],
)
objectz.sort_index()


a    1
b    2
c    3
d    0
dtype: int64

In [5]:
# sort_values() orders a Series by its values rather than by its index
obje = pd.Series([4, 7, -3, 3])
obje.sort_values()


2   -3
3    3
0    4
1    7
dtype: int64

In [3]:
# rank() assigns ranks from 1 through n; tied values each receive the mean of the
# ranks they span (the 'average' method, spelled out explicitly here)
whatup = pd.Series([7, -5, 7, 4, 2, 0, 4])
whatup.rank(method='average')


0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [None]:
# Summarizing and descriptive statistics
# data.sum() returns column sums
