# Getting started with Pandas

Pandas is commonly used for:

- Data import / export
- Data selection and filtering
- Data wrangling
- Descriptive statistics
- Data cleaning
- Data aggregation

Pandas has two main data structures:

- `Series` - used to represent one-dimensional (1D) data
- `DataFrame` - used to represent two-dimensional (2D) data

In [1]:
import pandas as pd

### Pandas Series

In [3]:
# Build a one-dimensional Series of sales counts; `name` labels the Series.
ps = pd.Series(
    data=[7, 4, 5, 2, 3, 6, 9, 5, 4, 1, 2, 5],
    name="Number of Sales",
)
ps  # last expression in the cell, so the Series is displayed

0     7
1     4
2     5
3     2
4     3
5     6
6     9
7     5
8     4
9     1
10    2
11    5
Name: Number of Sales, dtype: int64

In [4]:
ps.size  # total number of elements in the Series

12

In [5]:
ps.name  # the label passed as name= when the Series was created

'Number of Sales'

In [6]:
ps.index  # row labels; none were supplied, so pandas uses a default RangeIndex

RangeIndex(start=0, stop=12, step=1)

### Pandas Dataframe

In [7]:
# Column-oriented input: every key becomes a column and every list holds
# that column's values (all lists must have the same length).
data = {
    "Name": [
        "Anshu", "John", "Marco", "Rahul", "Dhanesh",
        "Lankesh", "Divya", "Laxmi", "Swetha",
    ],
    "Age": [45, 25, 48, 56, 22, 35, 36, 41, 25],
    "Gender": ["M", "M", "M", "M", "M", "M", "F", "F", "F"],
    "City": [
        "Delhi", "Mumbai", "Chennai", "Chennai", "Mumbai",
        "Chennai", "Delhi", "Bangalore", "Bangalore",
    ],
}
df = pd.DataFrame(data)
df  # display the resulting 2D table

Unnamed: 0,Name,Age,Gender,City
0,Anshu,45,M,Delhi
1,John,25,M,Mumbai
2,Marco,48,M,Chennai
3,Rahul,56,M,Chennai
4,Dhanesh,22,M,Mumbai
5,Lankesh,35,M,Chennai
6,Divya,36,F,Delhi
7,Laxmi,41,F,Bangalore
8,Swetha,25,F,Bangalore


In [8]:
df.shape  # (number of rows, number of columns)

(9, 4)

In [9]:
df.columns  # the column labels, returned as an Index

Index(['Name', 'Age', 'Gender', 'City'], dtype='object')

## Data Selection and Filtering with a Pandas DataFrame

In [10]:
# Show the full DataFrame again as a reference for the selections that follow.
df

Unnamed: 0,Name,Age,Gender,City
0,Anshu,45,M,Delhi
1,John,25,M,Mumbai
2,Marco,48,M,Chennai
3,Rahul,56,M,Chennai
4,Dhanesh,22,M,Mumbai
5,Lankesh,35,M,Chennai
6,Divya,36,F,Delhi
7,Laxmi,41,F,Bangalore
8,Swetha,25,F,Bangalore


In [11]:
df['Name']  # select a single column by its label; the result is a Series

0      Anshu
1       John
2      Marco
3      Rahul
4    Dhanesh
5    Lankesh
6      Divya
7      Laxmi
8     Swetha
Name: Name, dtype: object

In [13]:
# Attribute-style access gives the same column, but it is not recommended:
# it fails when the column name contains a space (or is otherwise not a valid
# Python identifier) or clashes with an existing DataFrame attribute/method.
df.Name

0      Anshu
1       John
2      Marco
3      Rahul
4    Dhanesh
5    Lankesh
6      Divya
7      Laxmi
8     Swetha
Name: Name, dtype: object

In [14]:
# Passing a list of labels selects several columns at once.
cols = ['Name','Gender']
df[cols]  
# Equivalent one-liner: df[['Name','Gender']]

Unnamed: 0,Name,Gender
0,Anshu,M
1,John,M
2,Marco,M
3,Rahul,M
4,Dhanesh,M
5,Lankesh,M
6,Divya,F
7,Laxmi,F
8,Swetha,F


In [15]:
type(df['Name'])  # a single column label returns a Series

pandas.core.series.Series

In [16]:
type(df[cols])  # a list of column labels returns a DataFrame

pandas.core.frame.DataFrame

In [15]:
# NOTE(review): this cell repeats the earlier type(df['Name']) check, and the
# repeated execution count suggests out-of-order execution; consider removing it.
type(df['Name'])

pandas.core.series.Series

In [17]:
df.head()  # by default, returns the top 5 rows

Unnamed: 0,Name,Age,Gender,City
0,Anshu,45,M,Delhi
1,John,25,M,Mumbai
2,Marco,48,M,Chennai
3,Rahul,56,M,Chennai
4,Dhanesh,22,M,Mumbai


In [18]:
df.head(2)  # pass a number to get only that many rows from the top (here 2)

Unnamed: 0,Name,Age,Gender,City
0,Anshu,45,M,Delhi
1,John,25,M,Mumbai


In [19]:
df.tail()  # by default, returns the bottom 5 rows

Unnamed: 0,Name,Age,Gender,City
4,Dhanesh,22,M,Mumbai
5,Lankesh,35,M,Chennai
6,Divya,36,F,Delhi
7,Laxmi,41,F,Bangalore
8,Swetha,25,F,Bangalore


In [20]:
df.tail(2)  # pass a number to get only that many rows from the bottom (here 2)

Unnamed: 0,Name,Age,Gender,City
7,Laxmi,41,F,Bangalore
8,Swetha,25,F,Bangalore


In [21]:
# Slicing with [] selects rows by position; the end of the slice is exclusive.
df[0:2]

Unnamed: 0,Name,Age,Gender,City
0,Anshu,45,M,Delhi
1,John,25,M,Mumbai


In [22]:
# Rows at positions 1 through 4 (position 5 is excluded).
df[1:5]

Unnamed: 0,Name,Age,Gender,City
1,John,25,M,Mumbai
2,Marco,48,M,Chennai
3,Rahul,56,M,Chennai
4,Dhanesh,22,M,Mumbai


In [23]:
# Pick the 'Name' column first, then take the rows at positions 1-3.
df['Name'].iloc[1:4]

1     John
2    Marco
3    Rahul
Name: Name, dtype: object

In [24]:
# Pick both columns first, then take the rows at positions 1-4.
df[['Name', "Gender"]].iloc[1:5]

Unnamed: 0,Name,Gender
1,John,M
2,Marco,M
3,Rahul,M
4,Dhanesh,M


In [25]:
# .iloc selects rows and columns by integer position (not by label):
# rows 1-4 and columns 2-3 here, because slice ends are exclusive.
df.iloc[1:5,2:4]

Unnamed: 0,Gender,City
1,M,Mumbai
2,M,Chennai
3,M,Chennai
4,M,Mumbai


## Filtering

In [26]:
# Show the full DataFrame again as a reference for the filters that follow.
df

Unnamed: 0,Name,Age,Gender,City
0,Anshu,45,M,Delhi
1,John,25,M,Mumbai
2,Marco,48,M,Chennai
3,Rahul,56,M,Chennai
4,Dhanesh,22,M,Mumbai
5,Lankesh,35,M,Chennai
6,Divya,36,F,Delhi
7,Laxmi,41,F,Bangalore
8,Swetha,25,F,Bangalore


In [27]:
# A boolean mask inside [] keeps only the rows where the condition is True.
df[df['Age']<40]

Unnamed: 0,Name,Age,Gender,City
1,John,25,M,Mumbai
4,Dhanesh,22,M,Mumbai
5,Lankesh,35,M,Chennai
6,Divya,36,F,Delhi
8,Swetha,25,F,Bangalore


In [28]:
# The condition on its own is a boolean Series with one True/False per row.
df['Age']<40

0    False
1     True
2    False
3    False
4     True
5     True
6     True
7    False
8     True
Name: Age, dtype: bool

In [29]:
# Rows where Age is 40 or more.
df[df['Age']>=40]

Unnamed: 0,Name,Age,Gender,City
0,Anshu,45,M,Delhi
2,Marco,48,M,Chennai
3,Rahul,56,M,Chennai
7,Laxmi,41,F,Bangalore


In [30]:
# Equality test on a text column: rows whose City is "Chennai".
df[df['City']=="Chennai"]

Unnamed: 0,Name,Age,Gender,City
2,Marco,48,M,Chennai
3,Rahul,56,M,Chennai
5,Lankesh,35,M,Chennai


In [31]:
# Inequality test: every row whose City is not "Chennai".
df[df['City']!="Chennai"]

Unnamed: 0,Name,Age,Gender,City
0,Anshu,45,M,Delhi
1,John,25,M,Mumbai
4,Dhanesh,22,M,Mumbai
6,Divya,36,F,Delhi
7,Laxmi,41,F,Bangalore
8,Swetha,25,F,Bangalore


In [32]:
# Combine both conditions into a single mask with & (element-wise AND).
# Chaining two [] filters applies a mask built from the full frame to an
# already-filtered frame, which makes pandas warn that the boolean key will be
# reindexed. Each condition needs its own parentheses because & binds tighter
# than the comparison operators.
df[(df.Age > 40) & (df.Gender == "M")]

  """Entry point for launching an IPython kernel.


Unnamed: 0,Name,Age,Gender,City
0,Anshu,45,M,Delhi
2,Marco,48,M,Chennai
3,Rahul,56,M,Chennai


In [34]:
# One .loc call: the row mask comes first, the column label second.
df.loc[df['City'] == "Chennai", 'Name']

2      Marco
3      Rahul
5    Lankesh
Name: Name, dtype: object

In [37]:
# Filter once with a boolean mask, then loop only over the matching names.
# Looking rows up as df['City'][i] with i from range(n) is a label lookup, so
# it only works while the DataFrame still has its default 0..n-1 index, and it
# tests every row one at a time in Python.
for name in df.loc[df['City'] == "Chennai", 'Name']:
    print(name)

Marco
Rahul
Lankesh


In [38]:
# & is element-wise AND: a row is kept only when both conditions are True.
# Each condition is wrapped in parentheses because & binds tighter than > and ==.
df[ (df.Age>40) & (df.Gender=="F") ]

Unnamed: 0,Name,Age,Gender,City
7,Laxmi,41,F,Bangalore


In [39]:
# | is element-wise OR: a row is kept when at least one condition is True.
# Each condition is wrapped in parentheses because | binds tighter than > and ==.
df[ (df.Age>40) | (df.Gender=="F") ]

Unnamed: 0,Name,Age,Gender,City
0,Anshu,45,M,Delhi
2,Marco,48,M,Chennai
3,Rahul,56,M,Chennai
6,Divya,36,F,Delhi
7,Laxmi,41,F,Bangalore
8,Swetha,25,F,Bangalore
