In [5]:
import numpy as np
import pandas as pd

#  Getting Started with pandas

o pandas contains data structures and data manipulation tools designed to make data cleaning and analysis fast and easy

o pandas is often used with numerical computing tools like NumPy and SciPy, analytical libraries like statsmodels and scikit-learn, and data visualization libraries like matplotlib. 

o pandas adopts NumPy’s style of array-based computing

o the biggest difference is that pandas is designed for working with tabular or heterogeneous data whereas NumPy is 
  designed for homogeneous numerical array data.

    
### pandas Data Structures

o Series - one-dimensional array-like object containing a sequence of values and an associated array of data labels (index) 

o DataFrame - rectangular table of data and contains an ordered collection of columns, each of which can be a different
              value type (numeric, string, boolean, etc.)
           

In [33]:
import numpy as np
import pandas as pd
# from pandas import Series, DataFrame

# Build a Series from a plain Python list. No index is supplied, so the
# printed output shows the default integer labels 0..3.
obj1 = pd.Series(data=[4, 7, -5, 3])
print(obj1)
# The labels are integers here, so label 1 is also the second element.
print(obj1.loc[1])


0    4
1    7
2   -5
3    3
dtype: int64
7


In [35]:
# A Series whose labels are strings instead of the default integers.
obj2 = pd.Series(data=[4, 7, -5, 3], index=list('dbac'))
print(obj2)
# Read a single value by its label ...
print(obj2.loc['a'])
# ... and overwrite another value in place, again addressed by label.
obj2.loc['d'] = 100
print(obj2)
# Positional access on a string-labelled Series is spelled obj2.iloc[2];
# that lookup is left out of this cell.

d    4
b    7
a   -5
c    3
dtype: int64
-5
d    100
b      7
a     -5
c      3
dtype: int64


In [36]:
# Series support NumPy-style operations, and the index labels stay attached
# to the values in every result.
# NOTE: relies on obj1/obj2 from the two cells above; the reindexing cell
# further down rebinds both names, so run this cell in document order.
print(obj2.loc[obj2 > 0])   # boolean-mask filtering
print(2 * obj2)             # scalar arithmetic
print(np.exp(obj1))         # NumPy ufunc applied element-wise

d    100
b      7
c      3
dtype: int64
d    200
b     14
a    -10
c      6
dtype: int64
0      54.598150
1    1096.633158
2       0.006738
3      20.085537
dtype: float64


In [10]:
# A dict maps naturally onto a Series: the keys become the index labels and
# the values become the data.
sdata = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)
obj3 = pd.Series(data=sdata)
print(obj3)

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64


In [11]:
# Build a Series from the sdata dict while imposing an explicit index.
# As the output shows, 'California' (no matching key) comes out as NaN and
# 'Utah' (not requested) is left out.
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(data=sdata, index=states)
print(obj4)

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64


In [12]:
# Attach descriptive metadata: a name for the index and a name for the
# Series itself. Both appear in the printed representation.
obj4.index.name = 'State'
obj4.name = 'Population'
print(obj4)

State
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: Population, dtype: float64


In [24]:
# Build a DataFrame from a dict of equal-length lists: each key becomes a
# column and the rows receive the default integer index.
data = {
    'state': ['Ohio'] * 3 + ['Nevada'] * 3,
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame1 = pd.DataFrame(data=data)
print(frame1)

    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
5  Nevada  2003  3.2


In [31]:
# Passing columns= fixes the column order of the resulting DataFrame.
frame2 = pd.DataFrame(data=data, columns=['year', 'state', 'pop'])
print(frame2)
# A single column comes back as a Series, either by dict-style key ...
print(frame2['state'])
# ... or by attribute access.
print(frame2.year)

   year   state  pop
0  2000    Ohio  1.5
1  2001    Ohio  1.7
2  2002    Ohio  3.6
3  2001  Nevada  2.4
4  2002  Nevada  2.9
5  2003  Nevada  3.2
0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object
0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64


Unnamed: 0,year,state,pop
0,,Ohio,1.5
1,,Ohio,1.7
2,,Ohio,3.6
3,,Nevada,2.4
4,,Nevada,2.9
5,,Nevada,3.2


In [37]:
# Request a column ('debt') that does not exist in the source dict; the
# output shows it filled with NaN. Rows get string labels via index=.
frame3 = pd.DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)
print(frame3)
print(frame3.loc['three'])   # a single row, returned as a Series

# Assigning a scalar fills the entire column.
frame3['debt'] = 0
print(frame3)

# Assigning a Series aligns on the row labels: only 'two', 'four' and 'five'
# receive values, the remaining rows become NaN.
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame3['debt'] = val
print(frame3)

# Assigning to a name that is not yet a column creates a new column.
frame3['eastern'] = frame3['state'] == 'Ohio'
print(frame3)

# del removes a column in place.
del frame3['eastern']
print(frame3)

       year   state  pop debt
one    2000    Ohio  1.5  NaN
two    2001    Ohio  1.7  NaN
three  2002    Ohio  3.6  NaN
four   2001  Nevada  2.4  NaN
five   2002  Nevada  2.9  NaN
six    2003  Nevada  3.2  NaN
year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object
       year   state  pop  debt
one    2000    Ohio  1.5     0
two    2001    Ohio  1.7     0
three  2002    Ohio  3.6     0
four   2001  Nevada  2.4     0
five   2002  Nevada  2.9     0
six    2003  Nevada  3.2     0
       year   state  pop  debt
one    2000    Ohio  1.5   NaN
two    2001    Ohio  1.7  -1.2
three  2002    Ohio  3.6   NaN
four   2001  Nevada  2.4  -1.5
five   2002  Nevada  2.9  -1.7
six    2003  Nevada  3.2   NaN
       year   state  pop  debt  eastern
one    2000    Ohio  1.5   NaN     True
two    2001    Ohio  1.7  -1.2     True
three  2002    Ohio  3.6   NaN     True
four   2001  Nevada  2.4  -1.5    False
five   2002  Nevada  2.9  -1.7    False
six    2003  Nevada  3.2   NaN   

### DataFrame constructor

o When passing nested dictionaries to the DataFrame, pandas will interpret the outer dict keys as the columns and the 
  inner keys as the row indices.
  
o Table 5-1. Possible data inputs to DataFrame constructor

In [38]:
# Nested dict: outer keys become the columns, inner keys the row labels.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
frame4 = pd.DataFrame(data=pop)
print(frame4)
# Transposing swaps rows and columns.
print(frame4.transpose())

      Nevada  Ohio
2001     2.4   1.7
2002     2.9   3.6
2000     NaN   1.5
        2001  2002  2000
Nevada   2.4   2.9   NaN
Ohio     1.7   3.6   1.5


### Index Objects

o Index objects hold the axis labels and other metadata (like the axis name or names)

o Index objects are immutable

o Table 5-2. Some Index methods and properties


In [6]:
# Grab the Index object that labels a small Series.
obj = pd.Series(range(3), index=['a', 'b', 'c'])
index = obj.index
print(index)

# Index objects are immutable: item assignment raises TypeError. Catch the
# error and print its message so the cell demonstrates the behaviour and
# still runs to completion under Restart & Run All.
try:
    index[1] = 'd'
except TypeError as err:
    print('TypeError:', err)


Index(['a', 'b', 'c'], dtype='object')


TypeError: Index does not support mutable operations

### Essential Functionality

o Reindexing - create a new object with the data conformed to a new index

o Table 5-3. reindex function arguments

In [9]:
# reindex returns a new Series arranged according to the requested labels;
# a label with no existing value ('e') shows up as NaN in the output.
obj1 = pd.Series(data=[4.5, 7.2, -5.3, 3.6], index=list('dbac'))
print(obj1)
obj2 = obj1.reindex(index=list('abcde'))
print(obj2)

# The same applies to integer labels: positions 1, 3 and 5 have no source
# value and are printed as NaN.
obj3 = pd.Series(data=['blue', 'purple', 'yellow'], index=[0, 2, 4])
print(obj3)
obj3 = obj3.reindex(index=range(6))
# Alternative kept for reference: obj3.reindex(range(6), method='ffill')
print(obj3)

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64
a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64
0      blue
2    purple
4    yellow
dtype: object
0      blue
1       NaN
2    purple
3       NaN
4    yellow
5       NaN
dtype: object


o Dropping Entries from an Axis - the drop method removes one or more entries from an axis of a Series or DataFrame

o The drop method returns a new object by default; passing inplace=True instead modifies the original object in place.

In [51]:
# drop on a Series: by default a new object is returned ...
obj = pd.Series(data=np.arange(5.), index=list('abcde'))
print(obj)
new_obj = obj.drop(labels='c')
print(new_obj)
# ... whereas inplace=True removes the entry from obj itself.
obj.drop(labels='c', inplace=True)
print(obj)

# drop on a DataFrame works along either axis and returns a new frame.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
print(data)
print(data.drop(index=['Colorado', 'Ohio']))   # rows removed in the returned copy
print(data.drop(columns=['two', 'four']))      # columns removed in the returned copy


a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64
a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
          one  two  three  four
Utah        8    9     10    11
New York   12   13     14    15
          one  three
Ohio        0      2
Colorado    4      6
Utah        8     10
New York   12     14


o Indexing, Selection, and Filtering

    - Series indexing works like NumPy array indexing, except that the Series's index labels can be used instead of only integers.
    
    - Slicing with labels behaves differently than normal Python slicing in that the endpoint is inclusive
    
    - Indexing and slicing can also be applied to DataFrame columns
    
    - For a DataFrame with label-indexed rows, the special indexing operators loc (by label) and iloc (by integer position) retrieve a subset of rows and columns
    
o Table 5-4. Indexing options with DataFrame

In [10]:
# Series slicing: by position the end point is excluded, by label it is
# included (compare the two printed slices).
obj = pd.Series(data=np.arange(4.), index=list('abcd'))
print(obj)
print(obj.iloc[1:3])
print(obj.loc['a':'c'])
# Assigning through a label slice overwrites every selected element.
obj.loc['b':'c'] = 5
print(obj)

# DataFrame selection: [] picks a column, loc selects by label and iloc by
# integer position.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
print(data)
print(data['two'])
data['two'] = 10   # scalar assignment overwrites the whole column
print(data)
print(data.loc['Colorado', ['two', 'three']])
print(data.iloc[[1, 2], [3, 0, 1]])


a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64
b    1.0
c    2.0
dtype: float64
a    0.0
b    1.0
c    2.0
dtype: float64
a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32
          one  two  three  four
Ohio        0   10      2     3
Colorado    4   10      6     7
Utah        8   10     10    11
New York   12   10     14    15
two      10
three     6
Name: Colorado, dtype: int64
          four  one  two
Colorado     7    4   10
Utah        11    8   10


### Other essential functionality

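o Integer Indexes - when the index itself holds integers, obj[key] is treated as a label lookup rather than a position (so obj[-1] raises a KeyError on a default 0..n-1 index); use loc for labels and iloc for positions to avoid the ambiguity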

o Arithmetic and Data Alignment - when adding together objects, if any index pairs are not the same, the respective index
  in the result will be the union of the index pairs (Table 5-5. Flexible Series and DataFrame arithmetic methods)

o Function Application and Mapping - NumPy ufuncs (element-wise array methods) also work with pandas objects

o Sorting and Ranking - sorting is done lexicographically by row or column index using the sort_index method which returns
  a new, sorted object.  Ranking assigns ranks from one through the number of valid data points in an array.

o Axis Indexes with Duplicate Labels - pandas allows duplicate labels on an axis, but many pandas functions (like reindex) require that the labels be unique

In [None]:
# --- Arithmetic and data alignment ---------------------------------------
# Adding two Series aligns them on their labels; a label present in only
# one operand yields NaN in the result.
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
print(s1)
print(s2)
print(s1 + s2)

# DataFrames align on both the row index and the columns.
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'), index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(df1)
print(df2)
print(df1 + df2)

# --- Function application -------------------------------------------------
# Draw from a locally seeded generator (0 is an arbitrary fixed seed) so the
# cell prints the same numbers on every run without touching NumPy's global
# random state.
frame = pd.DataFrame(np.random.RandomState(0).randn(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(frame)
print(np.abs(frame))   # NumPy ufuncs work element-wise on pandas objects

# --- Sorting ----------------------------------------------------------------
# sort_index returns a NEW sorted object, so its result must be captured;
# calling it and discarding the return value has no visible effect.
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj_sorted = obj.sort_index()
print(obj_sorted)
print(obj)             # the original Series keeps its order

frame = pd.DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'], columns=['d', 'a', 'b', 'c'])
print(frame)
xframe = frame.sort_index(axis=1)   # sort by column labels
print(xframe)
print(frame)                        # original frame is unchanged
yframe = xframe.sort_index()        # then sort by row labels
print(yframe)

# --- Ranking ----------------------------------------------------------------
# rank() returns a rank for each value; the ties in this data are the two 7s
# and the two 4s.
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
print(obj.rank())


### Summarizing and Computing Descriptive Statistics

o pandas objects are equipped with a set of common mathematical and statistical methods

o Table 5-8. Descriptive and summary statistics