In [2]:
# Load numpy for math/array operations
# and matplotlib for plotting
import numpy as np
import matplotlib.pyplot as plt

In [3]:
%matplotlib inline
# Set up figure size and DPI for screen demo
plt.rcParams['figure.figsize'] = (4,3)
plt.rcParams['figure.dpi'] = 150

In [4]:
import pandas as pd

# Series

In [6]:
# Build a Series from a plain list; pandas supplies the default 0..n-1 index.
s = pd.Series([1, 2, 3])

In [7]:
# Pretty printing: a bare expression on the last line of a cell is displayed
# by the notebook (index labels on the left, values on the right, dtype below).
s

0    1
1    2
2    3
dtype: int64

In [8]:
# Arithmetic on Series works element by element: each value is multiplied
# by the value carrying the same index label.
s * s

0    1
1    4
2    9
dtype: int64

In [9]:
# Index: the row labels of the Series.
# By default it is an integer range, but the labels can also be strings
# or other hashable Python objects.
s.index

RangeIndex(start=0, stop=3, step=1)

In [13]:
# A pandas index is a hybrid between a dictionary (look-up by label)
# and a list (the labels keep their position).
# Non-range index: building from a dict turns the keys into the labels.
s2 = pd.Series({'a': 1, 'b': 2, 'c': 3})
print(s2.index)
print(s2)


Index(['a', 'b', 'c'], dtype='object')
a    1
b    2
c    3
dtype: int64


In [15]:
# Accessing elements
# .iloc selects by position; .loc (or s2['a']) selects by label.
# A bare integer key such as s2[0] on this string-labelled Series only works
# through a positional fallback that pandas has deprecated, so the positional
# intent is spelled out explicitly here. The value returned is the same.
s2.iloc[0]

1

In [17]:
# A Series is not sorted automatically: entries stay in the order in which
# they were supplied, so the values below remain 4, 3, 2.
s3 = pd.Series({'a': 4, 'b': 3, 'c': 2})
s3

a    4
b    3
c    2
dtype: int64

# DataFrame

In [18]:
# The DataFrame is the most commonly used pandas structure: a table in which
# every column is a Series and all columns share one row index.
# s is labelled 0-2 while s2 and s3 are labelled a-c, so the resulting index is
# the union of both and the cells without a matching label are filled with NaN.
d = pd.DataFrame({'one': s, 'two': s2, 'three': s3})
d

Unnamed: 0,one,two,three
0,1.0,,
1,2.0,,
2,3.0,,
a,,1.0,4.0
b,,2.0,3.0
c,,3.0,2.0


In [24]:
# Index, Columns, & Values: the three building blocks of a DataFrame
# (row labels, column labels, and the underlying array of values),
# printed one per line.
print(d.index, d.columns, d.values, sep='\n')

Index([0, 1, 2, 'a', 'b', 'c'], dtype='object')
Index(['one', 'two', 'three'], dtype='object')
[[ 1. nan nan]
 [ 2. nan nan]
 [ 3. nan nan]
 [nan  1.  4.]
 [nan  2.  3.]
 [nan  3.  2.]]


In [25]:
# Sort the rows by the values of column 'three' (ascending);
# rows whose 'three' entry is NaN end up at the bottom.
d.sort_values('three')

Unnamed: 0,one,two,three
c,,3.0,2.0
b,,2.0,3.0
a,,1.0,4.0
0,1.0,,
1,2.0,,
2,3.0,,


In [26]:
# Algebra: arithmetic is applied to every cell; missing cells stay NaN.
d * 2

Unnamed: 0,one,two,three
0,2.0,,
1,4.0,,
2,6.0,,
a,,2.0,8.0
b,,4.0,6.0
c,,6.0,4.0


In [28]:
# head() and tail() show the first / last rows of a DataFrame,
# which is handy for examining a portion of it.

# The last two rows are obtained by
d.tail(n=2)

Unnamed: 0,one,two,three
b,,2.0,3.0
c,,3.0,2.0


In [30]:
# Descriptive statistics come built into the DataFrame:
# describe() summarises each numeric column (count, mean, std, min,
# quartiles, max). Missing values are left out, so every count here is 3.
d.describe()

Unnamed: 0,one,two,three
count,3.0,3.0,3.0
mean,2.0,2.0,3.0
std,1.0,1.0,1.0
min,1.0,1.0,2.0
25%,1.5,1.5,2.5
50%,2.0,2.0,3.0
75%,2.5,2.5,3.5
max,3.0,3.0,4.0


In [31]:
# Transpose: swap rows and columns, so the row labels become the column
# headers and vice versa (.T is shorthand for d.transpose()).
d.T

Unnamed: 0,0,1,2,a,b,c
one,1.0,2.0,3.0,,,
two,,,,1.0,2.0,3.0
three,,,,4.0,3.0,2.0


In [33]:
# Selecting data: indexing with a column label returns that column
# as a Series named after the column.
d['one']

0    1.0
1    2.0
2    3.0
a    NaN
b    NaN
c    NaN
Name: one, dtype: float64

In [37]:
# Boolean operations, fillna, & dropna
# A comparison is evaluated cell by cell and yields a DataFrame of
# True/False values; cells holding NaN compare as False.
d > 2 
# This result acts like a mask on your data, e.g. d[d > 2].


Unnamed: 0,one,two,three
0,False,False,False
1,False,False,False
2,True,False,False
a,False,False,True
b,False,False,True
c,False,True,False


In [None]:
# Missing values (NaN) can be replaced by another value, here 0.
# Alternatively, d.dropna() removes every row that contains a NaN.
d.fillna(value=0)

In [38]:
# For example, a DataFrame can be read straight from a text file;
# sep='\t' tells pandas that the columns are separated by tabs.
pd.read_csv('test.txt',sep='\t')


Unnamed: 0,year,sex,wgt
0,2001,F,36.221914
1,2001,M,36.481844
2,2002,F,34.016799
3,2002,M,37.589905
