In [None]:
import pandas as pd

# Intro to Pandas Structures
- ##Series in Pandas

In [None]:
"""Series are like 1-d arrays but they contain a data value (i.e. an index) or
 a fixed and sorted dictionary which maps index values to data values"""

obj = pd.Series([1, 3, -1, 6])
print(obj)

In [3]:
values = obj.values
index_values = obj.index  # like the range function
values, index_values

In [4]:
obj2 = pd.Series(data=[1, -5, 9, 0, -3], index=["a", "b", "d", "p", "q"])
obj2

a    1
b   -5
d    9
p    0
q   -3
dtype: int64

In [5]:
obj2[obj2 > 0]

In [6]:
obj2.index

Index(['a', 'b', 'd', 'p', 'q'], dtype='object')

In [7]:
#  check for missing data
obj2.isnull()

In [8]:
# both the series and the index have a name attribute
obj2.name = "numbers"
obj2.index.name = "letters"
obj2

letters
a    1
b   -5
d    9
p    0
q   -3
Name: numbers, dtype: int64

In [9]:
# obj has default indexing (enumerated)
# lets change that
obj.index = ["john", "aaron", "alex", "mike"]
obj

- # Dataframes in Pandas

In [10]:
"""Data frames represented a table of data"""
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'], 
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

frame = pd.DataFrame(data)
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002
5,3.2,Nevada,2003


In [11]:
frame.head(3)  # first have three rows 

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002


In [12]:
#  column order can be specified
frame = pd.DataFrame(data, columns=["year", "state", "pop"])
frame

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [13]:
#  adding passing a column not in the dictionary will results in NaN data values
#  we include a "debt" column not in the data
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four','five', 'six'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [14]:
#  a column in the DataFrame can be retrieved as a Series (two methods)
frame2["year"]


In [15]:
frame2.state

In [16]:
# row retrieval using special attribute called "loc"
frame2.loc["three"]  # prints all values, w/ column location, of row index "three"  

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [17]:
"""Column modifications"""
frame2["debt"] = 16.5
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [18]:
import numpy as np
frame2["debt"] = np.arange(6.)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


In [19]:
val = pd.Series(data=[-1.7, 6.2, -3], index=["two", "four", "five"])
frame2["debt"] = val
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.7
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,6.2
five,2002,Nevada,2.9,-3.0
six,2003,Nevada,3.2,


In [26]:
# Assigning a column that doesn't exist creates it a new one
frame2["eastern"] = (frame2.state == "Ohio")  # eastern column creates boolean data values
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.7,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,6.2,False
five,2002,Nevada,2.9,-3.0,False
six,2003,Nevada,3.2,,False


In [27]:
# deleting columns w/ built-in function "del"
del frame2["eastern"]  # error occurs when deleting a column that doesn't exist

In [29]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [31]:
"""Nested dict of dicts"""
pop = {'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame3 = pd.DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [41]:
# transpose of dataframe
frame3.T  # similar to numpy array

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [47]:
# dicts of Series 
pd.DataFrame(pop, index=[2001, 2002, 2003])  # pop's 2000 not included since index assignment is explicit 


Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [66]:
pdata = {"Ohio": frame3["Ohio"][:-1], "Nevada": frame3["Nevada"][:2]}
pd.DataFrame(pdata)

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7


In [69]:
frame3.index.name = ["Year"]
frame3.columns.name = ["State"]
frame3

[State],Nevada,Ohio
[Year],Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [70]:
frame3.values

array([[ nan,  1.5],
       [ 2.4,  1.7],
       [ 2.9,  3.6]])

In [71]:
frame2.values  # if values are of different types the display will be in regards to the columns

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.7],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, 6.2],
       [2002, 'Nevada', 2.9, -3.0],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

- # Index Objects

In [74]:
"""Index labels are immutable (unchanging over time) including from the user"""
obj = pd.Series(data=[1, 3, 4], index=["a", "b", "c"])
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [79]:
index[1:]

Index(['b', 'c'], dtype='object')

In [81]:
index[1] = "w"  # type error

In [83]:
labels = pd.Index(np.arange(3))
labels

Int64Index([0, 1, 2], dtype='int64')

In [84]:
obj2 = pd.Series([1.5, -2.5, 0], index=labels)
obj2

0    1.5
1   -2.5
2    0.0
dtype: float64

In [93]:
check = obj2.index is labels  # compares each value is all equal then True 
print(check)

True


In [94]:
"Ohio" in frame3.columns

True

In [113]:
2000 in frame3.index

True

# Essential Functionality
- ## Reindexing