# Introduction to Pandas

### What is Pandas?

Built on NumPy, Pandas is an open-source high-level data manipulation tool that lets you analyse,
clean and manipulate data in what are called DataFrames. 
<br><br>A DataFrame is just an object that stores data in a table-like structure of rows (observations) and columns (Features/Labels).

### Series

A Series is a one-dimensional indexed object (think of a DataFrame with only a single column). The default index starts at 0 and goes up to the length of the series minus one.

In [222]:
import numpy as np
import pandas as pd

In [223]:
# Series built from a plain list; pandas supplies the default 0..n-1 integer index
s1 = pd.Series(data=[0, 1, 3, 4, 5])
s1

0    0
1    1
2    3
3    4
4    5
dtype: int64

In [224]:
# mixing int, str and float values makes the Series fall back to the generic `object` dtype
s1 = pd.Series(data=[0, 'Hello', 'World', 4, 5.2])
s1

0        0
1    Hello
2    World
3        4
4      5.2
dtype: object

In [266]:
# Series with explicit string labels: the dict keys become the index, in insertion order
s1 = pd.Series({'1st': 3, '2nd': 4, '3rd': 5, '4th': 6, '5th': 7})
s1

1st    3
2nd    4
3rd    5
4th    6
5th    7
dtype: int64

In [269]:
# look up a single value by its index label
s1.loc['3rd']

5

In [227]:
# boolean mask: keep only the values greater than 4
s1[s1.gt(4)]

3rd    5
4th    6
5th    7
dtype: int64

In [228]:
# boolean mask: keep only the even values
s1[(s1 % 2).eq(0)]

2nd    4
4th    6
dtype: int64

In [229]:
# positional slice: everything except the first and the last element
s1.iloc[1:-1]

2nd    4
3rd    5
4th    6
dtype: int64

In [230]:
# Series.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to stack Series end to end.
# The result keeps both indexes (0..4 followed by '1st'..'5th'). It is only
# displayed, not assigned, so s1 and s2 themselves are left unchanged.
s2 = pd.Series([0, 'Hello', 'World', 4, 5.2])
pd.concat([s2, s1])

0          0
1      Hello
2      World
3          4
4        5.2
1st        3
2nd        4
3rd        5
4th        6
5th        7
dtype: object

### DataFrame

In [270]:
# start from a frame that has column headers but no rows yet
df = pd.DataFrame(
    data=None,
    columns=['First_name', 'Last_name', 'Age', 'Gender'],
)
df

Unnamed: 0,First_name,Last_name,Age,Gender


In [271]:
# assigning through .loc with a label that does not exist yet adds a new row;
# list values are matched to the columns by position
for row_label, row_values in [
    ('p1', ['Winston', 'Bishop', 32, 'Male']),
    ('p2', ['Jessica', 'Day', 31, 'Female']),
]:
    df.loc[row_label] = row_values
df

Unnamed: 0,First_name,Last_name,Age,Gender
p1,Winston,Bishop,32,Male
p2,Jessica,Day,31,Female


In [272]:
# a dict is matched to the columns by key, so the order of its entries does not matter
df.loc[3] = {'Gender': 'Male', 'First_name': 'Nick', 'Age': 33, 'Last_name': 'Miller'}
df

Unnamed: 0,First_name,Last_name,Age,Gender
p1,Winston,Bishop,32,Male
p2,Jessica,Day,31,Female
3,Nick,Miller,33,Male


In [274]:
# each inner list is one row; index labels may mix types (an int and a string here)
l = [
    ['Winston', 'Schmidt', 32, 'Male'],
    ['Cece', '', 31, 'Female'],
]
df = pd.DataFrame(
    data=l,
    index=[0, 'a'],
    columns=['First_name', 'Last_name', 'Age', 'Gender'],
)
df

Unnamed: 0,First_name,Last_name,Age,Gender
0,Winston,Schmidt,32,Male
a,Cece,,31,Female


In [235]:
# source dict for building a dataframe: two keys, each holding a list of four values
d = {
    'a': ['Jessica', 'Day', 31, 'Female'],
    'b': ['Nick', 'Miller', 33, 'Male'],
}

In [236]:
# dict keys become the columns; `index` supplies the row labels
# NOTE(review): the labels Gender/Age/Height/Weight do not describe the values
# (first name, last name, age, gender) — confirm whether this is intentional
df = pd.DataFrame(data=d, index=['Gender', 'Age', 'Height', 'Weight'])
df

Unnamed: 0,a,b
Gender,Jessica,Nick
Age,Day,Miller
Height,31,33
Weight,Female,Male


In [237]:
# orient='index' turns each dict key into a row; the column names are set in the same call
df = pd.DataFrame.from_dict(
    d,
    orient='index',
    columns=['Gender', 'Age', 'Height', 'Weight'],
)
df

Unnamed: 0,Gender,Age,Height,Weight
a,Jessica,Day,31,Female
b,Nick,Miller,33,Male


In [238]:
# swap rows and columns (long form of df.T)
df.transpose()

Unnamed: 0,a,b
Gender,Jessica,Nick
Age,Day,Miller
Height,31,33
Weight,Female,Male


#### Loading a CSV file into a DataFrame

In [239]:
# read trees.csv (a relative path) into a DataFrame
trees = pd.read_csv(filepath_or_buffer='trees.csv')
trees

Unnamed: 0,Index,Girth,Height,Volume
0,1,8.3,70,10.3
1,2,8.6,65,10.3
2,3,8.8,63,10.2
3,4,10.5,72,16.4
4,5,10.7,81,18.8
5,6,10.8,83,19.7
6,7,11.0,66,15.6
7,8,11.0,75,18.2
8,9,11.1,80,22.6
9,10,11.2,75,19.9


In [240]:
# first five rows (5 is the default for head)
trees.head(n=5)

Unnamed: 0,Index,Girth,Height,Volume
0,1,8.3,70,10.3
1,2,8.6,65,10.3
2,3,8.8,63,10.2
3,4,10.5,72,16.4
4,5,10.7,81,18.8


In [241]:
# last seven rows
trees.tail(n=7)

Unnamed: 0,Index,Girth,Height,Volume
24,25,16.3,77,42.6
25,26,17.3,81,55.4
26,27,17.5,82,55.7
27,28,17.9,80,58.3
28,29,18.0,80,51.5
29,30,18.0,80,51.0
30,31,20.6,87,77.0


In [242]:
# Drop the redundant 'Index' column (it just repeats the row number, starting at 1).
# Rebind instead of mutating in place, and ignore an already-missing column,
# so re-running this cell does not raise KeyError once 'Index' is gone.
trees = trees.drop(columns='Index', errors='ignore')
trees.head()

Unnamed: 0,Girth,Height,Volume
0,8.3,70,10.3
1,8.6,65,10.3
2,8.8,63,10.2
3,10.5,72,16.4
4,10.7,81,18.8


In [243]:
# structural overview: index range, column names, non-null counts, dtypes, memory usage
trees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Data columns (total 3 columns):
Girth     31 non-null float64
Height    31 non-null int64
Volume    31 non-null float64
dtypes: float64(2), int64(1)
memory usage: 824.0 bytes


In [244]:
# summary statistics per column: count, mean, std, min, quartiles and max
trees.describe()

Unnamed: 0,Girth,Height,Volume
count,31.0,31.0,31.0
mean,13.248387,76.0,30.170968
std,3.138139,6.371813,16.437846
min,8.3,63.0,10.2
25%,11.05,72.0,19.4
50%,12.9,76.0,24.2
75%,15.25,80.0,37.3
max,20.6,87.0,77.0


#### Column selection

In [245]:
# a column name inside [] selects that single column; head() limits the display to five values
trees['Height'].head()

0    70
1    65
2    63
3    72
4    81
Name: Height, dtype: int64

In [246]:
# a single selected column is a pandas Series
type(trees['Height'])

pandas.core.series.Series

In [247]:
# a list of column names inside [] selects several columns at once; show the first three rows
trees[['Height', 'Volume']].head(3)

Unnamed: 0,Height,Volume
0,70,10.3
1,65,10.3
2,63,10.2


#### Row selection

In [248]:
# a plain slice inside [] selects rows by position: rows 5 through 9 (the end is exclusive)
trees[5:10]

Unnamed: 0,Girth,Height,Volume
5,10.8,83,19.7
6,11.0,66,15.6
7,11.0,75,18.2
8,11.1,80,22.6
9,11.2,75,19.9


In [249]:
# the same rows through the explicit position-based indexer
trees.iloc[5:10]

Unnamed: 0,Girth,Height,Volume
5,10.8,83,19.7
6,11.0,66,15.6
7,11.0,75,18.2
8,11.1,80,22.6
9,11.2,75,19.9


In [250]:
# boolean mask: keep the rows whose Height is greater than 80
trees[trees['Height'].gt(80)]

Unnamed: 0,Girth,Height,Volume
4,10.7,81,18.8
5,10.8,83,19.7
16,12.9,85,33.8
17,13.3,86,27.4
25,17.3,81,55.4
26,17.5,82,55.7
30,20.6,87,77.0


In [251]:
# combine two conditions with &: Height above 80 and Girth below 12
trees[trees['Height'].gt(80) & trees['Girth'].lt(12)]

Unnamed: 0,Girth,Height,Volume
4,10.7,81,18.8
5,10.8,83,19.7


The difference between `iloc[]` (position-based indexing) and `loc[]` (label-based indexing)

In [252]:
# keep the trees taller than 80; the filtered frame retains the original row labels (4, 5, 16, ...)
trees2 = trees.loc[trees['Height'] > 80]
trees2

Unnamed: 0,Girth,Height,Volume
4,10.7,81,18.8
5,10.8,83,19.7
16,12.9,85,33.8
17,13.3,86,27.4
25,17.3,81,55.4
26,17.5,82,55.7
30,20.6,87,77.0


In [253]:
# iloc slices by position: the first five rows, whatever their labels are
trees2.iloc[:5]

Unnamed: 0,Girth,Height,Volume
4,10.7,81,18.8
5,10.8,83,19.7
16,12.9,85,33.8
17,13.3,86,27.4
25,17.3,81,55.4


In [254]:
# loc slices by label: rows up to and including label 5 (only labels 4 and 5 qualify here)
trees2.loc[:5]

Unnamed: 0,Girth,Height,Volume
4,10.7,81,18.8
5,10.8,83,19.7


In [264]:
# order the rows by Height, ascending; each row keeps its original label
sorted_trees = trees.sort_values(by='Height')
# position-based slice: the seven shortest trees
sorted_trees.iloc[:7]

Unnamed: 0,Girth,Height,Volume
2,8.8,63,10.2
19,13.8,64,24.9
1,8.6,65,10.3
6,11.0,66,15.6
13,11.7,69,21.3
0,8.3,70,10.3
18,13.7,71,25.7


In [277]:
# loc slices by label, not by position: this runs from the top of the sorted frame
# to the row labelled 7, so it returns more than seven rows here
sorted_trees.loc[:7]

Unnamed: 0,Girth,Height,Volume
2,8.8,63,10.2
19,13.8,64,24.9
1,8.6,65,10.3
6,11.0,66,15.6
13,11.7,69,21.3
0,8.3,70,10.3
18,13.7,71,25.7
3,10.5,72,16.4
23,16.0,72,38.3
22,14.5,74,36.3


In [281]:
# the underlying data as a NumPy array; the integer Height column shows up as floats (81., 83., ...)
trees2.values

array([[10.7, 81. , 18.8],
       [10.8, 83. , 19.7],
       [12.9, 85. , 33.8],
       [13.3, 86. , 27.4],
       [17.3, 81. , 55.4],
       [17.5, 82. , 55.7],
       [20.6, 87. , 77. ]])

# ===================================================

### Exercises: