## Exploring Series

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Create a Series of five random draws (np.random.randn) labelled 'a' through 'e'.
# The result is only displayed here; it is not assigned to a variable.
pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])

a    1.311674
b   -0.967693
c    1.023747
d   -0.237366
e   -0.264089
dtype: float64

In [3]:
# Bind a fresh five-element random Series to `s` so later cells can reuse it
s = pd.Series(np.random.randn(5))
print(s)

# Report how many elements the Series holds
print("Dataset length:")
print(s.size)

0   -1.114109
1    1.697327
2   -1.826726
3    0.038887
4   -0.823130
dtype: float64
Dataset length:
5


In [4]:
# Indexing: a single element by position, then a slice of the first three elements
print(s.iloc[0])
print("\n")
print(s.iloc[:3])

-1.11410898256


0   -1.114109
1    1.697327
2   -1.826726
dtype: float64


## Exploring DataFrames

In [5]:
# Create a dataframe from the Series `s`, with its values in a column labelled 'Column 1'.
# to_frame(name=...) sets the column label explicitly. pd.DataFrame(s, columns=[...])
# only carries the values over while `s` is unnamed; a Series with a different name
# would be matched against `columns` by name and yield an all-NaN column instead.
df = s.to_frame(name='Column 1')
df

Unnamed: 0,Column 1
0,-1.114109
1,1.697327
2,-1.826726
3,0.038887
4,-0.82313


In [6]:
# Columns can be accessed by name using square brackets; the output below
# is labelled with the column's name ("Name: Column 1").
df['Column 1']

0   -1.114109
1    1.697327
2   -1.826726
3    0.038887
4   -0.823130
Name: Column 1, dtype: float64

In [7]:
# A new column is created by assigning to a label that does not exist yet;
# here 'Column 2' is 'Column 1' multiplied by 4
df['Column 2'] = df['Column 1'].mul(4)
df

Unnamed: 0,Column 1,Column 2
0,-1.114109,-4.456436
1,1.697327,6.789309
2,-1.826726,-7.306903
3,0.038887,0.155548
4,-0.82313,-3.292519


In [8]:
# Other manipulation, like sorting rows by a column's values (ascending by default).
# The sorted frame is only displayed here -- to keep it, assign the result to a variable.
df.sort_values(by = 'Column 2')

Unnamed: 0,Column 1,Column 2
2,-1.826726,-7.306903
0,-1.114109,-4.456436
4,-0.82313,-3.292519
3,0.038887,0.155548
1,1.697327,6.789309


In [9]:
# Boolean indexing: keep only the rows whose 'Column 2' value is at most 3
df.loc[df['Column 2'] <= 3]

Unnamed: 0,Column 1,Column 2
0,-1.114109,-4.456436
2,-1.826726,-7.306903
3,0.038887,0.155548
4,-0.82313,-3.292519


In [10]:
# Column-wise custom reduction: each column's smallest value plus its largest value.
# NOTE: only the last expression of a cell is displayed, so this result is computed
# and then discarded; assign it to a variable if it is needed.
df.apply(lambda col: col.min() + col.max())

# Per-column mean. Use the DataFrame method instead of np.mean(df): NumPy forwards
# axis=None to DataFrame.mean, which pandas 2.0+ interprets as "reduce over both
# axes" and returns a single scalar rather than one value per column.
df.mean()

Column 1   -0.40555
Column 2   -1.62220
dtype: float64

In [11]:
# Summary statistics for each column (count, mean, std, min, quartiles, max)
df.describe()
# To keep the summary for later use, assign it instead: table = df.describe()

Unnamed: 0,Column 1,Column 2
count,5.0,5.0
mean,-0.40555,-1.6222
std,1.352385,5.409542
min,-1.826726,-7.306903
25%,-1.114109,-4.456436
50%,-0.82313,-3.292519
75%,0.038887,0.155548
max,1.697327,6.789309
