# Pandas Object

# Pandas Series Object

In [1]:
import numpy as np
import pandas as pd

In [2]:
# A Series built from a bare list gets the default integer index 0..n-1.
data = pd.Series(data=[0.25, 0.50, 0.75, 1.00])

In [3]:
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [4]:
type(data)

pandas.core.series.Series

In [6]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [8]:
data[3]

1.0

In [10]:
data[1:3]

1    0.50
2    0.75
dtype: float64

# Series as Generalized NumPy Array

In [11]:
# Same values as before, but labelled with an explicit string index.
data = pd.Series(
    data=[0.25, 0.50, 0.75, 1.00],
    index=list('abcd'),
)

In [12]:
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [13]:
data['d']

1.0

In [14]:
# Slicing by label includes the end label: 'd' is part of the result.
data['b':'d']

b    0.50
c    0.75
d    1.00
dtype: float64

# Series as Specialized Dictionary

In [15]:
# State name -> population figure, used to build a Series below.
population_dict = {
    'texas': 3123,
    'new york': 32434,
    'california': 452312,
    'Florida': 4353619,
}

In [17]:
population_dict

{'texas': 3123, 'new york': 32434, 'california': 452312, 'Florida': 4353619}

In [18]:
# Dict keys become the index labels; dict values become the data.
population = pd.Series(data=population_dict)

In [19]:
population

texas            3123
new york        32434
california     452312
Florida       4353619
dtype: int64

In [20]:
type(population)

pandas.core.series.Series

In [21]:
population['california']

452312

In [22]:
# Unlike a plain dict, a Series supports slicing by key; the slice follows the
# stored index order and includes the end label 'california'.
population['texas':'california']

texas           3123
new york       32434
california    452312
dtype: int64

In [23]:
# State name -> area figure, keyed identically to population_dict.
area_dict = {
    'texas': 312783,
    'new york': 3234,
    'california': 4312,
    'Florida': 43519,
}

In [24]:
area_dict

{'texas': 312783, 'new york': 3234, 'california': 4312, 'Florida': 43519}

In [25]:
# Build the area Series; its index labels are the dict keys.
area = pd.Series(data=area_dict)

In [26]:
area

texas         312783
new york        3234
california      4312
Florida        43519
dtype: int64

# DataFrame Object

In [27]:
# Each Series becomes one column; rows are aligned on the shared state index.
states = pd.DataFrame(
    data={
        'Population': population,
        'Area': area,
    }
)

In [28]:
states

Unnamed: 0,Population,Area
texas,3123,312783
new york,32434,3234
california,452312,4312
Florida,4353619,43519


In [29]:
type(states)

pandas.core.frame.DataFrame

In [30]:
states.index

Index(['texas', 'new york', 'california', 'Florida'], dtype='object')

In [31]:
states.columns

Index(['Population', 'Area'], dtype='object')

In [33]:
states['Area']

texas         312783
new york        3234
california      4312
Florida        43519
Name: Area, dtype: int64

# Statistical Functions

In [35]:
states.count()

Population    4
Area          4
dtype: int64

In [36]:
states.sum()

Population    4841488
Area           363848
dtype: int64

In [37]:
states.mean()

Population    1210372.0
Area            90962.0
dtype: float64

In [38]:
# No value repeats in either column, so every value counts as a mode: the
# result lists each column's values in ascending order under a fresh 0..3 index.
states.mode()

Unnamed: 0,Population,Area
0,3123,3234
1,32434,4312
2,452312,43519
3,4353619,312783


In [39]:
states.std()

Population    2.105520e+06
Area          1.490635e+05
dtype: float64

In [40]:
states.min()

Population    3123
Area          3234
dtype: int64

In [41]:
states.max()

Population    4353619
Area           312783
dtype: int64

In [43]:
states.abs()

Unnamed: 0,Population,Area
texas,3123,312783
new york,32434,3234
california,452312,4312
Florida,4353619,43519


In [44]:
states.describe()

Unnamed: 0,Population,Area
count,4.0,4.0
mean,1210372.0,90962.0
std,2105520.0,149063.542127
min,3123.0,3234.0
25%,25106.25,4042.5
50%,242373.0,23915.5
75%,1427639.0,110835.0
max,4353619.0,312783.0


# Saving to CSV

In [47]:
# Write the frame to disk. The state-name index is saved as the first CSV
# column with an empty header (it is read back below as 'Unnamed: 0').
states.to_csv('states.csv')

In [48]:
states

Unnamed: 0,Population,Area
texas,3123,312783
new york,32434,3234
california,452312,4312
Florida,4353619,43519


# Reading from CSV

In [49]:
# Read the file back with default settings: the saved index is not restored;
# it arrives as an ordinary column named 'Unnamed: 0' beside a new 0..3 index.
states_1 = pd.read_csv('states.csv')

In [51]:
states_1

Unnamed: 0.1,Unnamed: 0,Population,Area
0,texas,3123,312783
1,new york,32434,3234
2,california,452312,4312
3,Florida,4353619,43519


In [52]:
states_1.columns

Index(['Unnamed: 0', 'Population', 'Area'], dtype='object')

# Rename Column

In [53]:
# rename() returns a new DataFrame and leaves its input untouched, so assign
# the result back; otherwise states_1 keeps the 'Unnamed: 0' column name.
states_1 = states_1.rename(columns={'Unnamed: 0': 'States'})
states_1

Unnamed: 0,States,Population,Area
0,texas,3123,312783
1,new york,32434,3234
2,california,452312,4312
3,Florida,4353619,43519


# Pandas Index Object

In [54]:
# A standalone Index holding a few integers.
ind = pd.Index(data=[2, 3, 5, 7, 8])

In [55]:
ind

Int64Index([2, 3, 5, 7, 8], dtype='int64')

In [56]:
ind[1]

3

In [58]:
ind[::2]

Int64Index([2, 5, 8], dtype='int64')

# Index as Ordered Set

In [60]:
# Two overlapping integer indexes used to demonstrate set-style operations.
indx = pd.Index(data=[1, 3, 5, 7, 9])
indy = pd.Index(data=[2, 3, 5, 7, 11])

In [62]:
# Intersection: labels present in both indexes.
# The `&` operator on Index objects is deprecated as a set operation (newer
# pandas treats it as an element-wise logical op), so use the explicit method.
indx.intersection(indy)

  indx & indy


Int64Index([3, 5, 7], dtype='int64')

In [63]:
# Union: labels present in either index.
# The `|` operator on Index objects is deprecated as a set operation (newer
# pandas treats it as an element-wise logical op), so use the explicit method.
indx.union(indy)

  indx | indy


Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [64]:
# Symmetric difference: labels present in exactly one of the two indexes.
# The `^` operator on Index objects is deprecated as a set operation (newer
# pandas treats it as an element-wise logical op), so use the explicit method.
indx.symmetric_difference(indy)

  indx ^ indy


Int64Index([1, 2, 9, 11], dtype='int64')

# Indexers: loc, iloc

In [67]:
# Integer labels that deliberately differ from the positions 0..3.
data = pd.Series(data=list('abcd'), index=[1, 3, 5, 7])

In [68]:
data

1    a
3    b
5    c
7    d
dtype: object

In [69]:
# Plain [] with a single integer is looked up as a label here: label 1 -> 'a'.
data[1]

'a'

In [70]:
# Plain [] with an integer slice is positional here: positions 1 and 2, i.e.
# labels 3 and 5. The loc/iloc indexers below make that choice explicit.
data[1:3]

3    b
5    c
dtype: object

In [72]:
data.loc[1]

'a'

In [74]:
data.iloc[1]

'b'

# Handling Missing Data

In [75]:
# A None entry forces NumPy to fall back to dtype=object.
var = np.array([1, 3, None, 4])

In [76]:
var

array([1, 3, None, 4], dtype=object)

In [77]:
# np.nan is a float, so the whole array is stored as float64.
var = np.array([1, 3, np.nan, 4])

In [78]:
var

array([ 1.,  3., nan,  4.])

In [79]:
var.dtype

dtype('float64')

In [80]:
# NaN propagates through the aggregation, so the sum is itself nan.
var.sum()

nan

In [81]:
var.min()

nan

# NaN & None in Pandas

In [83]:
# In numeric data pandas treats None like np.nan: both are stored as NaN (float64).
x = pd.Series(data=[1, np.nan, 5, None])

In [84]:
x

0    1.0
1    NaN
2    5.0
3    NaN
dtype: float64

# Operations on Null Values

In [86]:
x.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [87]:
x.isnull().sum()

2

In [88]:
x.dropna()

0    1.0
2    5.0
dtype: float64

In [96]:
x.fillna(0)

0    1.0
1    0.0
2    5.0
3    0.0
dtype: float64

In [94]:
# Forward fill: each NaN takes the last valid value that precedes it.
# fillna(method=...) is deprecated in recent pandas; ffill() is the direct
# equivalent and returns a new Series.
x.ffill()

0    1.0
1    1.0
2    5.0
3    5.0
dtype: float64

In [95]:
# Backward fill: each NaN takes the next valid value that follows it; a
# trailing NaN has nothing after it to copy and therefore stays NaN.
# fillna(method=...) is deprecated in recent pandas; bfill() is the direct
# equivalent and returns a new Series.
x.bfill()

0    1.0
1    5.0
2    5.0
3    NaN
dtype: float64

# Concatenation

In [98]:
# Two Series with disjoint integer labels (1-3 and 4-6).
ser1 = pd.Series(data=list('ABC'), index=[1, 2, 3])
ser2 = pd.Series(data=list('DEF'), index=[4, 5, 6])

In [99]:
ser1

1    A
2    B
3    C
dtype: object

In [100]:
ser2

4    D
5    E
6    F
dtype: object

In [101]:
# Stack the two Series end to end (row-wise); the original labels are kept.
ser3 = pd.concat([ser1, ser2], axis=0)

In [102]:
ser3

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

# Merge

In [103]:
# Load the three tables used in the merge walk-through; the CSV paths are
# relative, so the files are looked up in the current working directory.
# NOTE: `area` was bound to a Series earlier in this notebook; from this cell
# on it refers to the DataFrame read here.
pop = pd.read_csv('state-population.csv')
area = pd.read_csv('state-areas.csv')
abbrevs = pd.read_csv('state-abbrevs.csv')

In [104]:
pop.head()

Unnamed: 0,state/region,ages,year,population
0,AL,under18,2012,1117489.0
1,AL,total,2012,4817528.0
2,AL,under18,2010,1130966.0
3,AL,total,2010,4785570.0
4,AL,under18,2011,1125763.0


In [106]:
area.head()

Unnamed: 0,state,area (sq. mi)
0,Alabama,52423
1,Alaska,656425
2,Arizona,114006
3,Arkansas,53182
4,California,163707


In [107]:
abbrevs.head()

Unnamed: 0,state,abbreviation
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


In [122]:
# Outer join of the population rows with the abbreviation table: the codes in
# pop['state/region'] are matched against abbrevs['abbreviation'], and rows
# without a partner on either side are kept.
merged = pd.merge(
    left=pop,
    right=abbrevs,
    how='outer',
    left_on='state/region',
    right_on='abbreviation',
)

In [124]:
merged.head()

Unnamed: 0,state/region,ages,year,population,state,abbreviation
0,AL,under18,2012,1117489.0,Alabama,AL
1,AL,total,2012,4817528.0,Alabama,AL
2,AL,under18,2010,1130966.0,Alabama,AL
3,AL,total,2010,4785570.0,Alabama,AL
4,AL,under18,2011,1125763.0,Alabama,AL


In [125]:
merged.isnull().sum()

state/region     0
ages             0
year             0
population      20
state           96
abbreviation    96
dtype: int64

In [127]:
# Which region codes have no state name after the merge (state is null)?
merged.loc[merged['state'].isnull(),'state/region'].unique()

array(['PR', 'USA'], dtype=object)

In [130]:
# Fill in the full name for the 'PR' and 'USA' rows. Using a single
# .loc[row_mask, column] assignment writes into `merged` itself rather than
# into a temporary copy.
merged.loc[merged['state/region']== 'PR', 'state']='Puerto Rico'
merged.loc[merged['state/region']== 'USA', 'state']='United States'

In [131]:
merged['state']

0             Alabama
1             Alabama
2             Alabama
3             Alabama
4             Alabama
            ...      
2539    United States
2540    United States
2541    United States
2542    United States
2543    United States
Name: state, Length: 2544, dtype: object

In [138]:
# The abbreviation column is no longer needed once the state names are filled
# in, so drop it. errors='ignore' keeps the cell re-runnable: without it a
# second execution raises KeyError because the column is already gone.
merged = merged.drop(columns='abbreviation', errors='ignore')

In [139]:
area.head()

Unnamed: 0,state,area (sq. mi)
0,Alabama,52423
1,Alaska,656425
2,Arizona,114006
3,Arkansas,53182
4,California,163707


In [140]:
# Left join: keep every row of `merged` and attach the area wherever the
# 'state' name has a match in `area`.
final = merged.merge(right=area, how='left', on='state')

In [141]:
final.head()

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
0,AL,under18,2012,1117489.0,Alabama,52423.0
1,AL,total,2012,4817528.0,Alabama,52423.0
2,AL,under18,2010,1130966.0,Alabama,52423.0
3,AL,total,2010,4785570.0,Alabama,52423.0
4,AL,under18,2011,1125763.0,Alabama,52423.0


In [142]:
final.isnull().sum()

state/region      0
ages              0
year              0
population       20
state             0
area (sq. mi)    48
dtype: int64

In [143]:
# Drop every row that still has a missing value. Reassigning the result
# (instead of inplace=True) makes the rebinding of `final` explicit and keeps
# the statement usable in a method chain.
final = final.dropna()

In [144]:
final.isnull().sum()

state/region     0
ages             0
year             0
population       0
state            0
area (sq. mi)    0
dtype: int64

In [145]:
final.shape

(2476, 6)

In [146]:
final.head()

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
0,AL,under18,2012,1117489.0,Alabama,52423.0
1,AL,total,2012,4817528.0,Alabama,52423.0
2,AL,under18,2010,1130966.0,Alabama,52423.0
3,AL,total,2010,4785570.0,Alabama,52423.0
4,AL,under18,2011,1125763.0,Alabama,52423.0


In [148]:
# Keep only the rows for year 2010 whose ages value is 'total'.
data2010=final.query("year==2010 & ages=='total'")

In [149]:
data2010

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
3,AL,total,2010,4785570.0,Alabama,52423.0
91,AK,total,2010,713868.0,Alaska,656425.0
101,AZ,total,2010,6408790.0,Arizona,114006.0
189,AR,total,2010,2922280.0,Arkansas,53182.0
197,CA,total,2010,37333601.0,California,163707.0
283,CO,total,2010,5048196.0,Colorado,104100.0
293,CT,total,2010,3579210.0,Connecticut,5544.0
379,DE,total,2010,899711.0,Delaware,1954.0
389,DC,total,2010,605125.0,District of Columbia,68.0
475,FL,total,2010,18846054.0,Florida,65758.0
