# Data Manipulation with Pandas

In [1]:
import numpy as np
import pandas as pd

### Pandas Series Object

In [2]:
# Build a Series from a plain Python list (an array works the same way).
# No index is supplied, so pandas assigns the default integer labels 0..3
# that appear in the output below.

data = pd.Series([0.25, 0.50, 0.75, 1.00])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [3]:
# The underlying data is exposed through the `.values` attribute,
# which returns it as a NumPy array (see the `array([...])` output)

data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [4]:
# The series has a built-in, array-like index object, available through
# its `.index` attribute

data.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
# Individual elements can be accessed with square brackets, just like with an array.
# With the default integer index, the label 1 used here is also position 1.

data[1]

0.5

In [6]:
# Slicing works as well; as with a NumPy array the stop position (3) is excluded,
# so only the entries labelled 1 and 2 are returned

data[1:3]

1    0.50
2    0.75
dtype: float64

In [7]:
# Unlike a NumPy array, whose integer index is implicit, a Series carries an
# explicitly defined index. It defaults to a range of integers starting at 0,
# but any sequence of labels can be supplied instead -- the labels do not have
# to be integers and are not required to share a single type.

data = pd.Series([.25, .5, .75, 1.0], index=['This', 'Is', 'An', 'Index'])  # one label per value
print(data, '\n')
# Positional access goes through `.iloc`: writing `data[0]` on a non-integer
# index relies on a positional fallback that pandas has deprecated.
print(data.iloc[0], '\n')
# Label-based access using the new string index
print(data['This'])

This     0.25
Is       0.50
An       0.75
Index    1.00
dtype: float64 

0.25 

0.25


### Is a Series a Specialized Dictionary?

In [8]:
# For practical purposes, yes: a Series behaves like a dictionary
# whose keys and values are both typed.

population_dict = {
    'California': 38_332_521,
    'Texas': 26_448_193,
    'New York': 19_651_127,
    'Florida': 19_552_860,
    'Illinois': 12_882_135,
}
print(population_dict, '\n')

# The dictionary keys become the index labels of the Series
population = pd.Series(population_dict)

# Two sorted views of the same data: ascending by value, alphabetical by label
population_byval = population.sort_values()
population_byind = population.sort_index()

print(population_byind, '\n')
print(population_byval, '\n')

{'California': 38332521, 'Texas': 26448193, 'New York': 19651127, 'Florida': 19552860, 'Illinois': 12882135} 

California    38332521
Florida       19552860
Illinois      12882135
New York      19651127
Texas         26448193
dtype: int64 

Illinois      12882135
Florida       19552860
New York      19651127
Texas         26448193
California    38332521
dtype: int64 



In [9]:
# Dictionary-style lookup: a label returns the value stored under it
population_byind['California']

38332521

In [10]:
# Unlike a dictionary, a Series also supports slicing by label;
# note that the end label ('Illinois') is included in the result
population_byind['California':'Illinois']

California    38332521
Florida       19552860
Illinois      12882135
dtype: int64

### Constructing a Series object

In [11]:
# A Series can be constructed from a simple list.
# With no index given, the labels default to 0, 1, ..., len(data) - 1

pd.Series([2, 4, 6])

0    2
1    4
2    6
dtype: int64

In [12]:
# A scalar passed as the data is repeated once for every label in the index

pd.Series(11, index=np.linspace(120, 210, num=10))

120.0    11
130.0    11
140.0    11
150.0    11
160.0    11
170.0    11
180.0    11
190.0    11
200.0    11
210.0    11
dtype: int64

In [13]:
# A dictionary can also supply the data: its keys become the index labels

my_dict = {2: 'a', 1: 'b', 3: 'c'}

In [14]:
# The dictionary keys become the index, in the dictionary's insertion order
pd.Series(my_dict)

2    a
1    b
3    c
dtype: object

In [15]:
# Passing an explicit index selects only those keys from the dictionary,
# so the Series does not have to include every entry.
# The result follows the order of the index argument.

pd.Series(my_dict, index=[3,2])

3    c
2    a
dtype: object

### The DataFrame

In [16]:
# A DataFrame can be thought of as a collection of pandas Series objects.
# To build one, start with a second Series: the land area of each state
# whose population is already stored in `population`.

area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}

area = pd.Series(area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [17]:
# Let's join them into a DataFrame using a dictionary definition:
# each key becomes a column name and each Series becomes that column

states = pd.DataFrame({'population': population, 'area': area})

states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [18]:
states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [19]:
states.columns

Index(['population', 'area'], dtype='object')

In [20]:
# Selecting a single column by name returns it as a Series
states['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

### Constructing a DataFrame

In [21]:
# A DataFrame can consist of a single Series;
# `columns` supplies the name of its one column

pd.DataFrame(population, columns=['population'])

Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


In [22]:
# A DataFrame can also be built from a list of dictionaries (one per row):
# the dictionary keys become the column headers and the dictionary values
# become the entries of the corresponding columns.

data = [dict(a=i ** 2, b=2 * i) for i in range(5)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,4,4
3,9,6
4,16,8


In [23]:
# As shown earlier, a dictionary that maps column names to pandas Series
# also produces a DataFrame

pd.DataFrame(dict(population=population, area=area))

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [24]:
# A two-dimensional NumPy array supplies the cell values; the row and
# column labels are passed separately through `index` and `columns`

pd.DataFrame(
    np.random.rand(5, 7),
    index=['How', 'many', 'licks', 'tootsie', 'pop'],
    columns=['Winnie', 'the', 'pooh', 'and', 'tigger', 'too', '!'],
)

Unnamed: 0,Winnie,the,pooh,and,tigger,too,!
How,0.302404,0.850161,0.608741,0.464985,0.919431,0.419418,0.390365
many,0.576111,0.037423,0.105754,0.43862,0.82536,0.964587,0.713844
licks,0.944498,0.013253,0.683098,0.44104,0.633215,0.992818,0.380087
tootsie,0.076731,0.612791,0.185839,0.356749,0.876345,0.662993,0.83841
pop,0.795292,0.620496,0.484991,0.262956,0.795167,0.71725,0.272113


In [25]:
# A Structured Array!

# Three zero-filled records, each with an 8-byte integer field 'A'
# and an 8-byte float field 'B'
A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])
A

array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])

In [26]:
# The field names of the structured array become the column names
pd.DataFrame(A)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


### The Index Object

In [27]:
# The index of a pandas DataFrame is its own kind of object,
# like an immutable array or an ordered multi-set

ind = pd.Index([2, 3, 5, 7, 11])
ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

In [28]:
# It often operates like an array: for example, standard indexing
# retrieves a single element by position
ind[1]

3

In [29]:
# Slicing returns a new Index holding the selected elements
ind[3:]

Int64Index([7, 11], dtype='int64')

In [30]:
# A step can be given as well: every third element, starting from the first
ind[::3]

Int64Index([2, 7], dtype='int64')

In [31]:
# The Index also exposes the familiar NumPy array attributes:
# size, shape, number of dimensions, and dtype

print('Index Size :', ind.size, '\n')
print('Index Shape:', ind.shape, '\n')
print('Index Dim  :', ind.ndim, '\n')
print('Index dtype:', ind.dtype, '\n')

Index Size : 5 

Index Shape: (5,) 

Index Dim  : 1 

Index dtype: int64 



In [32]:
# The Index is immutable: it cannot be modified through item assignment.
# The TypeError is the point of this demonstration, so it is caught and
# printed rather than left to abort a "Restart & Run All" of the notebook.

try:
    ind[1] = 5
except TypeError as err:
    print('TypeError:', err)

TypeError: Index does not support mutable operations

In [33]:
# Index as an ordered set
# The Index object follows many of the conventions of Python's built-in "set"
# data structure, which makes unions, intersections, differences and other
# combinations familiar to compute

indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])

# Labels present in both indices
indA.intersection(indB)

Int64Index([3, 5, 7], dtype='int64')

In [34]:
# Labels present in either index
indA.union(indB)

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [35]:
# Labels of indA that do not appear in indB
indA.difference(indB)

Int64Index([1, 9], dtype='int64')

In [36]:
# Labels of indB that do not appear in indA
indB.difference(indA)

Int64Index([2, 11], dtype='int64')