# Pandas basics: Intro

pandas is built around two core objects:

1. pd.Series
2. pd.DataFrame

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Build a Series from a plain list. No index is given, so pandas labels the
# elements 0, 1, 2. Mixing a string with integers yields dtype `object`.
ps = pd.Series(data=['s', 1, 3])

In [3]:
# Display the Series: index labels on the left, values on the right.
ps

0    s
1    1
2    3
dtype: object

In [4]:
# Confirm the Python type of the object we just built.
type(ps)

pandas.core.series.Series

In [5]:
# The values of a Series are exposed as a NumPy array.
type(ps.to_numpy())

numpy.ndarray

In [6]:
# The index holds the labels; here it is the default 0..2 range.
ps.index

RangeIndex(start=0, stop=3, step=1)

In [7]:
# A Series with explicit string labels: `data` holds the values and `index`
# supplies one label per value, in the same order.
ex_series = pd.Series(
    data=[
        'Mozarella Cheese',
        'Wiener Schnitzel',
        'Schwarzwalder Kirschtorte',
        'Lemonade',
        'Whiskey',
    ],
    index=[
        'appetizer',
        'main course',
        'dessert',
        'beverage',
        'alcohol',
    ],
)

In [8]:
# Display the labelled Series.
ex_series

appetizer               Mozarella Cheese
main course             Wiener Schnitzel
dessert        Schwarzwalder Kirschtorte
beverage                        Lemonade
alcohol                          Whiskey
dtype: object

## `loc` and `iloc` to access elements
1. `loc`: access via ***labels*** (the values stored in the index)
2. `iloc`: access via ***integer position*** (0-based)

In [9]:
# loc: select by index label. Passing a list of labels returns a Series.
ex_series.loc[['appetizer', 'main course']]

appetizer      Mozarella Cheese
main course    Wiener Schnitzel
dtype: object

In [10]:
# iloc: select by integer position. Positions 0 and 1 are the first two rows.
ex_series.iloc[[0, 1]]

appetizer      Mozarella Cheese
main course    Wiener Schnitzel
dtype: object

In [11]:
# iloc also accepts a slice of positions; the stop position is excluded,
# so this returns the elements at positions 0 and 1.
ex_series.iloc[:2]

appetizer      Mozarella Cheese
main course    Wiener Schnitzel
dtype: object

In [12]:
# A single label (not a list) returns the stored value itself, not a Series.
ex_series.loc['appetizer']

'Mozarella Cheese'

## DataFrames
A DataFrame is a 2-dimensional, size-mutable, tabular data structure whose columns can hold different data types. It can be thought of as a collection of `pd.Series` that share the same index.

In [13]:
# Raw data for the first column: a plain dict mapping city -> population.
dc_city_pop = {'Tokyo': 2000, 'Delhi': 30303, 'Shanghai': 5356}

In [14]:
# A dict becomes a Series whose index is the dict keys (the city names).
pc_city_pop = pd.Series(data=dc_city_pop)

In [15]:
# First element by integer position (the value stored for 'Tokyo').
pc_city_pop.iloc[0]

2000

In [16]:
# Raw data for the second column: a plain dict mapping city -> country.
dc_city_countries = {'Tokyo': 'Japan', 'Delhi': 'India', 'Shanghai': 'China'}

In [17]:
# Display the dict before converting it to a Series.
dc_city_countries

{'Tokyo': 'Japan', 'Delhi': 'India', 'Shanghai': 'China'}

In [18]:
# Same conversion as for the populations: dict keys become the index.
pc_city_countries = pd.Series(data=dc_city_countries)

In [19]:
# Display the country Series (string values, so dtype is object).
pc_city_countries


Tokyo       Japan
Delhi       India
Shanghai    China
dtype: object

In [20]:
# Display the population Series (integer values, so dtype is int64).
pc_city_pop

Tokyo        2000
Delhi       30303
Shanghai     5356
dtype: int64

In [21]:
# Place the two Series side by side as columns. Both are indexed by city,
# so each city ends up on one row.
df_cities = pd.concat(objs=[pc_city_countries, pc_city_pop], axis='columns')

In [22]:
# Concatenating Series along the column axis produces a DataFrame.
type(df_cities)

pandas.core.frame.DataFrame

In [23]:
# The source Series had no names, so the columns are labelled 0 and 1.
df_cities

Unnamed: 0,0,1
Tokyo,Japan,2000
Delhi,India,30303
Shanghai,China,5356


In [24]:
# Replace the default column labels (0, 1) with descriptive names.
df_cities.columns = ['Country', 'Population']

In [25]:
# Display the frame with its new column names.
df_cities

Unnamed: 0,Country,Population
Tokyo,Japan,2000
Delhi,India,30303
Shanghai,China,5356


In [26]:
# iloc with a slice selects rows by integer position; the stop is excluded,
# so 1:2 returns only the second row (still as a DataFrame).
df_cities.iloc[1:2]

Unnamed: 0,Country,Population
Delhi,India,30303


In [27]:
# loc with a list of row labels returns those rows as a DataFrame.
df_cities.loc[['Tokyo', 'Delhi']]

Unnamed: 0,Country,Population
Tokyo,Japan,2000
Delhi,India,30303


In [28]:
# iloc with two arguments: row position first, then column position.
df_cities.iloc[1, 1]

30303

In [29]:
# loc with two arguments: row label first, then column label.
df_cities.loc['Shanghai', 'Population']

5356

In [30]:
# Boolean indexing: keep only the rows where the condition is True.
# The comparison is strict, so a population of exactly 2000 is excluded.
df_cities[df_cities['Population'] > 2000]

Unnamed: 0,Country,Population
Delhi,India,30303
Shanghai,China,5356


In [31]:
# Boolean indexing works with equality tests on string columns as well.
df_cities[df_cities['Country'] == 'India']

Unnamed: 0,Country,Population
Delhi,India,30303


In [32]:
# isin() builds a boolean mask that is True where the country appears in the list.
df_cities[df_cities['Country'].isin(['China', 'India'])]

Unnamed: 0,Country,Population
Delhi,India,30303
Shanghai,China,5356


#### We can use the `~` operator to negate a boolean condition

In [33]:
# `~` inverts the mask, keeping the rows whose country is NOT in the list.
df_cities[~df_cities['Country'].isin(['China', 'India'])]

Unnamed: 0,Country,Population
Tokyo,Japan,2000


In [34]:
# Move the city labels out of the index into a regular column and replace the
# index with 0..n-1. drop=False keeps the old labels as that column; drop=True
# would discard them.
df_cities.reset_index(inplace=True, drop=False)

In [59]:
# reset_index() stored the old, unnamed index in a column called 'index';
# give that column a meaningful name. rename() silently ignores keys that do
# not match any column, so the key must be exactly 'index'.
df_cities.rename(columns={'index': 'City'}, inplace=True)

In [60]:
# Display the frame: the city names are now an ordinary column.
df_cities

Unnamed: 0,City,Country,Population
0,Tokyo,Japan,2000
1,Delhi,India,30303
2,Shanghai,China,5356


In [61]:
# Metadata: shape is a (number of rows, number of columns) tuple.
df_cities.shape

(3, 3)

In [63]:
# shape[0] is the row count and shape[1] the column count.
print(
    f'The number of rows is: {df_cities.shape[0]} \n'
    f'The number of columns is: {df_cities.shape[1]}'
)

The number of rows is: 3 
The number of columns is: 3


In [66]:
# Print a summary of the frame: index type, column names, non-null counts,
# dtypes and memory usage.
df_cities.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   City        3 non-null      object
 1   Country     3 non-null      object
 2   Population  3 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 96.0+ bytes


In [70]:
# Count missing values per column: isna() flags each missing cell and
# sum() totals those flags column by column.
df_cities.isna().sum()

City          0
Country       0
Population    0
dtype: int64