<img style="float:right" src="https://iteach.kennesaw.edu/images/MB_Vert_3Clr.png" width="100">

# pandas

Tools and data structures (Series and DataFrame) that support data analytics in Python.

In [2]:
import numpy as np
import pandas as pd

## pandas Series

In [None]:
# A Series built from a bare list; no labels are given, so pandas numbers the entries itself.
s1 = pd.Series(data=[1, 1, 2, 3, 5, 8])
s1

In [None]:
# The data held by the Series as a NumPy array, without the index labels.
s1.values

In [None]:
# The row labels. None were passed when s1 was built, so pandas supplied default integer labels.
s1.index

Let's make the index more descriptive ...

In [7]:
# Six values labelled 'a' through 'f' instead of the default integer labels.
s2 = pd.Series(
    data=[1, 1, 2, 3, 5, 8],
    index=['a', 'b', 'c', 'd', 'e', 'f'],
)
s2

a    1
b    1
c    2
d    3
e    5
f    8
dtype: int64

### Selecting values by index

In [8]:
# Square brackets with a single label return the one value stored under that label.
s2['d']

3

In [9]:
# A list of labels returns a new Series, in the order the labels were requested.
s2[['d', 'a']]

d    3
a    1
dtype: int64

### Testing membership

In [10]:
# `in` on a Series checks the index labels, not the stored values.
tuple(label in s2 for label in ('b', 'z'))

(True, False)

## pandas DataFrame

### Constructing a DataFrame

#### From a NumPy array

In [88]:
# Seed the generator so the random table is identical on every Restart & Run All.
# (numpy is imported with the other imports in the first code cell.)
rng = np.random.default_rng(42)
data = rng.random((4, 2))  # 4 rows x 2 columns of floats in [0, 1)
df = pd.DataFrame(data, columns=['a', 'b'])
df

Unnamed: 0,a,b
0,0.332969,0.852141
1,0.834882,0.934484
2,0.621355,0.892955
3,0.45123,0.672888


#### From a dict of lists

In [75]:
# Column-oriented raw data: every key is a future column name and every list
# holds that column's values, one entry per city in the same order.
city_data = dict(
    city=['Atlanta', 'Roswell', 'Marietta', 'Smyrna', 'Kennesaw'],
    pop=[420_003, 88_346, 56_579, 51_271, 29_783],
    rank=[3, 5, 4, 1, 2],
)

In [77]:
# Each dict key becomes a column; each list supplies that column's values.
cities = pd.DataFrame(data=city_data)
cities

Unnamed: 0,city,pop,rank
0,Atlanta,420003,3
1,Roswell,88346,5
2,Marietta,56579,4
3,Smyrna,51271,1
4,Kennesaw,29783,2


### Renaming columns

In [78]:
# Rename through an explicit old -> new mapping instead of assigning a list to
# `cities.columns`. A positional list mislabels columns (or raises) if the
# number or order of columns has changed since the frame was built; the mapping
# only touches the names it lists, so re-running this cell is a harmless no-op.
cities = cities.rename(columns={'city': 'City', 'pop': 'Population', 'rank': 'Rank'})
cities

Unnamed: 0,City,Population,Rank
0,Atlanta,420003,3
1,Roswell,88346,5
2,Marietta,56579,4
3,Smyrna,51271,1
4,Kennesaw,29783,2


### Adding a column

In [80]:
# One Area value per city, in row order; None is stored as a missing value (NaN).
cities = cities.assign(Area=[134, None, 23.2, 15.4, 9.5])
cities

Unnamed: 0,City,Population,Rank,Area
0,Atlanta,420003,3,134.0
1,Roswell,88346,5,
2,Marietta,56579,4,23.2
3,Smyrna,51271,1,15.4
4,Kennesaw,29783,2,9.5


### Making a column the index

In [81]:
# Promote the City column to row labels. set_index returns a new frame, so the
# name is rebound to the result.
cities = cities.set_index(keys='City', drop=True)
cities

Unnamed: 0_level_0,Population,Rank,Area
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Atlanta,420003,3,134.0
Roswell,88346,5,
Marietta,56579,4,23.2
Smyrna,51271,1,15.4
Kennesaw,29783,2,9.5


In [82]:
# Smallest Area first; the row with no Area value is placed last.
cities.sort_values(by='Area')

Unnamed: 0_level_0,Population,Rank,Area
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Kennesaw,29783,2,9.5
Smyrna,51271,1,15.4
Marietta,56579,4,23.2
Atlanta,420003,3,134.0
Roswell,88346,5,


### Select a column

In [83]:
# Bracket syntax selects one column and returns it as a Series indexed by City.
cities['Area']

City
Atlanta     134.0
Roswell       NaN
Marietta     23.2
Smyrna       15.4
Kennesaw      9.5
Name: Area, dtype: float64

In [84]:
# Attribute syntax returns the same Series as cities['Area']. It only works when the
# column name is a valid Python identifier that does not clash with a DataFrame attribute.
cities.Area

City
Atlanta     134.0
Roswell       NaN
Marietta     23.2
Smyrna       15.4
Kennesaw      9.5
Name: Area, dtype: float64

### Select a row by index

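Type this (it raises a `KeyError`, because `[]` on a DataFrame looks up columns, not rows): <i><font color="red">cities['Atlanta']</font></i>

Type this to select a row by its index label: *cities.loc['Atlanta']*

Type this to select a row by its integer position: *cities.iloc[4]*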

### Operations

#### Summarizing and descriptive statistics

In [109]:
# Column totals: collapse the rows (axis=0), giving one sum per column.
# The missing Area value is skipped rather than turning the total into NaN.
cities.sum(axis=0)

Population    645982.0
Rank              15.0
Area             182.1
dtype: float64

In [112]:
# Row totals: add across the columns of each city. A missing Area contributes nothing.
cities.sum(axis='columns')

City
Atlanta     420140.0
Roswell      88351.0
Marietta     56606.2
Smyrna       51287.4
Kennesaw     29794.5
dtype: float64

In [108]:
# Per-column summary statistics. Area's count is 4, not 5, because the missing value is left out.
cities.describe()

Unnamed: 0,Population,Rank,Area
count,5.0,5.0,4.0
mean,129196.4,3.0,45.525
std,163909.969672,1.581139,59.249606
min,29783.0,1.0,9.5
25%,51271.0,2.0,13.925
50%,56579.0,3.0,19.3
75%,88346.0,4.0,50.9
max,420003.0,5.0,134.0


In [96]:
# Attach a city name to each row of the random frame and use those names as its row labels.
df = (
    df
    .assign(City=['Atlanta', 'Smyrna', 'Kennesaw', 'Marietta'])
    .set_index('City')
)
df

Unnamed: 0_level_0,a,b
City,Unnamed: 1_level_1,Unnamed: 2_level_1
Atlanta,0.332969,0.852141
Smyrna,0.834882,0.934484
Kennesaw,0.621355,0.892955
Marietta,0.45123,0.672888


In [99]:
# Redisplay the cities table.
cities

Unnamed: 0_level_0,Population,Rank,Area
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Atlanta,420003,3,134.0
Roswell,88346,5,
Marietta,56579,4,23.2
Smyrna,51271,1,15.4
Kennesaw,29783,2,9.5


In [107]:
# Place the two frames side by side. Rows are matched on their index labels, so
# Roswell, which has no row in df, gets NaN for columns a and b.
all_columns = pd.concat(objs=[cities, df], axis='columns', sort=False)
all_columns

Unnamed: 0,Population,Rank,Area,a,b
Atlanta,420003,3,134.0,0.332969,0.852141
Roswell,88346,5,,,
Marietta,56579,4,23.2,0.45123,0.672888
Smyrna,51271,1,15.4,0.834882,0.934484
Kennesaw,29783,2,9.5,0.621355,0.892955
