The pandas library in Python makes working with data — like importing, cleaning, and organizing it — easier. It's hard to imagine doing data science in Python without it.
The most widely used data structures in pandas: `Series` and `DataFrames`.


In [23]:
import pandas as pd 
import numpy as np


In [24]:
# Series Object in Pandas
Series_example = pd.Series([-0.5, 0.75, 1.0, -2])
Series_example
Series_example.values
Series_example.index
print(Series_example[1])
Series_example[1:3]
Series_example2 = pd.Series([-0.5, 0.75, 1.0, -2], index=['a', 'b', 'c', 'd'])
Series_example2
Series_example2['b']

0.75


0.75

In [25]:
# Dictionaries: 
population_dict = { 'France': 65429495,
                    'Germany': 82408706,
                    'Russia': 143910127,
                    'Japan': 126922333 }
population = pd.Series(population_dict)
population
population['Russia']


143910127

In [38]:
# Data structures: DataFrames object
area_dict = {'Albania': 28748,
             'France': 643801,
             'Germany': 357386,
             'Japan': 377972,
             'Russia': 17125200}
area = pd.Series(area_dict)
area

countries = pd.DataFrame({'Population': population, 'Area': area})
countries

countries['Capital'] = ['Tirana', 'Paris', 'Berlin', 'Tokyo', 'Moscow']
countries

countries = countries[['Capital', 'Area', 'Population']]
countries

countries['Population Density'] = countries['Population'] / countries['Area']
countries

countries['Area']

countries.loc['Japan']

countries.loc['Japan']['Area']

countries['Debt-to-GDP Ratio'] = np.nan
countries

debt = pd.Series([0.19, 2.36], index=['Russia', 'Japan'])
countries['Debt-to-GDP Ratio'] = debt
countries

del countries['Capital']
countries

countries.T

pd.DataFrame(np.random.rand(3, 2),
             columns=['random', 'example'],
             index=['a', 'b', 'c'])

Unnamed: 0,random,example
a,0.903782,0.925866
b,0.223582,0.369454
c,0.334603,0.646476


In [60]:
# Manipulate data in pandas: Index objects

series_example = pd.Series([-0.5, 0.75, 1.0, -2], index=['a', 'b', 'c', 'd'])
ind = series_example.index
ind

ind[1]

ind[::2]

# ind[1] = 0 Type error

ind_odd = pd.Index([1, 3, 5, 7, 9])
ind_prime = pd.Index([2, 3, 5, 7, 11])
ind_odd & ind_prime

ind_odd | ind_prime

ind_odd ^ ind_prime

series_example2 = pd.Series([-0.5, 0.75, 1.0, -2], index=['a', 'b', 'c', 'd'])
series_example2

series_example2['b']

'a' in series_example2

series_example2.keys()

list(series_example2.items())

series_example2['e'] = 1.25
series_example2

series_example2['a':'c']

series_example2[0:2]

series_example2[(series_example2 > -1) & (series_example2 < 0.8)] # masking

series_example2[['a', 'e']] # fancy indexing

# The loc attribute allows indexing and slicing that always references the explicit index:

series_example2.loc['a']
series_example2.loc['a':'c']

# The iloc attribute enables indexing and slicing by using the implicit, Python-style index:

series_example2.iloc[0]
series_example2.iloc[0:2]


  ind_odd & ind_prime
  ind_odd | ind_prime
  ind_odd ^ ind_prime


a   -0.50
b    0.75
dtype: float64

In [73]:
# Select data in DataFrames objects 

area = pd.Series({'Albania': 28748,
                  'France': 643801,
                  'Germany': 357386,
                  'Japan': 377972,
                  'Russia': 17125200})
population = pd.Series ({'Albania': 2937590,
                         'France': 65429495,
                         'Germany': 82408706,
                         'Russia': 143910127,
                         'Japan': 126922333})
countries = pd.DataFrame({'Area': area, 'Population': population})
countries

countries['Area']

countries['Population Density'] = countries['Population'] / countries['Area']
countries

countries.values

countries.T

countries.iloc[:3, :2]

countries.loc[:'Germany', :'Population']

countries['France':'Japan']

countries[1:3]





Unnamed: 0,Area,Population,Population Density
France,643801,65429495,101.629999
Germany,357386,82408706,230.587393


In [77]:
# Operating on data in pandas
rng = np.random.RandomState(42)
ser_example = pd.Series(rng.randint(0, 10, 4))
ser_example

df_example = pd.DataFrame(rng.randint(0, 10, (3, 4)),
                  columns=['A', 'B', 'C', 'D'])
df_example

np.exp(ser_example)

np.cos(df_example * np.pi / 4)

Unnamed: 0,A,B,C,D
0,-1.83697e-16,0.7071068,6.123234000000001e-17,-1.83697e-16
1,0.7071068,-1.0,-0.7071068,0.7071068
2,0.7071068,6.123234000000001e-17,-0.7071068,-1.0


In [93]:
# Index alignment with Series objects

area = pd.Series({'Russia': 17075400, 'Canada':  9984670,
                  'USA': 9826675, 'China': 9598094, 
                  'Brazil': 8514877}, name='area')
population = pd.Series({'China': 1409517397, 'India': 1339180127,
                        'USA': 324459463, 'Indonesia': 322179605, 
                        'Brazil': 207652865}, name='population')

population / area

series1 = pd.Series([2, 4, 6], index=[0, 1, 2])
series2 = pd.Series([3, 5, 7], index=[1, 2, 3])
series1 + series2

series1.add(series2, fill_value=0)

df1 = pd.DataFrame(rng.randint(0, 20, (2, 2)),
                   columns=list('AB'))
df1

df2 = pd.DataFrame(rng.randint(0, 10, (3, 3)),
                   columns=list('BAC'))
df2

df1 + df2

fill = df1.stack().mean()
df1.add(df2, fill_value=fill)

df3 = pd.DataFrame(rng.randint(10, size=(3, 4)), columns=list('WXYZ'))
df3

df3 - df3.iloc[0]

df3.subtract(df3['X'], axis=0)

halfrow = df3.iloc[0, ::2]
halfrow

df3 - halfrow

Unnamed: 0,W,X,Y,Z
0,0.0,,0.0,
1,-1.0,,2.0,
2,1.0,,-1.0,
