In [2]:
import numpy as np
import pandas as pd

### Creating a Series

You can convert a list, a NumPy array, or a dictionary to a Series:

In [3]:
# Sample inputs reused by the Series examples that follow.
labels = list('abc')
my_list = [10, 20, 30]
arr = np.array(my_list)          # same values as my_list, as a NumPy array
d = dict(zip(labels, my_list))   # label -> value mapping

**Using Lists**

pd.Series(data, index)

In [4]:
pd.Series(data=my_list)

0    10
1    20
2    30
dtype: int64

In [5]:
pd.Series(data=my_list,index=labels)

a    10
b    20
c    30
dtype: int64

In [8]:
pd.Series(my_list,labels)

a    10
b    20
c    30
dtype: int64

In [9]:
pd.Series(arr)

0    10
1    20
2    30
dtype: int64

In [10]:
pd.Series(arr,labels)

a    10
b    20
c    30
dtype: int64

In [11]:
pd.Series(d)

a    10
b    20
c    30
dtype: int64

### Data in a Series

A pandas Series can hold a variety of object types:

In [9]:
pd.Series(data=labels)

0    a
1    b
2    c
dtype: object

## Using an Index

The key to using a Series is understanding its index. Pandas uses these index names or numbers to allow fast lookups of information (it works like a hash table or dictionary).

Let's see some examples of how to grab information from a Series. Let us create two Series, ser1 and ser2:

In [12]:
# Integer values labelled by country name.
ser1 = pd.Series(
    data=[1, 2, 3, 4],
    index=['USA', 'Germany', 'USSR', 'Japan'],
)

In [13]:
ser1

USA        1
Germany    2
USSR       3
Japan      4
dtype: int64

In [14]:
# Shares some labels with ser1 but has 'Italy' where ser1 has 'USSR'.
ser2 = pd.Series(
    data=[1, 2, 5, 4],
    index=['USA', 'Germany', 'Italy', 'Japan'],
)

In [15]:
ser2

USA        1
Germany    2
Italy      5
Japan      4
dtype: int64

In [18]:
ser1['Germany']

2

In [19]:
ser1 + ser2

Germany    4.0
Italy      NaN
Japan      8.0
USA        2.0
USSR       NaN
dtype: float64

## Data Frames

In [20]:
from numpy.random import randn
# Seed NumPy's global random generator so the randn-based values are the same on every run.
np.random.seed(101)

pd.DataFrame(data, index, columns)

In [23]:
# 5x4 frame of randn draws: rows labelled A-E, columns labelled W-Z.
df = pd.DataFrame(
    data=randn(5, 4),
    index='A B C D E'.split(),
    columns=['W', 'X', 'Y', 'Z'],
)

Remember: 'A B C D E'.split() is ['A','B','C','D','E']

In [24]:
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [25]:
df['W'] # grab column W (returned as a Series)

A    0.302665
B   -0.134841
C    0.807706
D   -0.497104
E   -0.116773
Name: W, dtype: float64

In [26]:
df.W

A    0.302665
B   -0.134841
C    0.807706
D   -0.497104
E   -0.116773
Name: W, dtype: float64

In [27]:
df[['W','Z']]

Unnamed: 0,W,Z
A,0.302665,-1.159119
B,-0.134841,0.184502
C,0.807706,0.329646
D,-0.497104,0.484752
E,-0.116773,1.996652


In [28]:
type(df['W'])

pandas.core.series.Series

By default, df.head() displays the first five rows and df.tail() displays the last five rows; pass a number to change how many rows are shown.

In [30]:
df.head(3)

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646


In [31]:
df.tail(3)

Unnamed: 0,W,X,Y,Z
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


How to create a new column:

In [32]:
# Assigning to a new key adds a column; here it is the element-wise sum of W and Z.
df['new'] = df['W'] + df['Z']

In [33]:
df

Unnamed: 0,W,X,Y,Z,new
A,0.302665,1.693723,-1.706086,-1.159119,-0.856454
B,-0.134841,0.390528,0.166905,0.184502,0.049661
C,0.807706,0.07296,0.638787,0.329646,1.137352
D,-0.497104,-0.75407,-0.943406,0.484752,-0.012352
E,-0.116773,1.901755,0.238127,1.996652,1.879879


In [34]:
# A plain list also works as a new column: one value per row of df.
df['new2'] = [0,1,0,1,0]

In [35]:
df

Unnamed: 0,W,X,Y,Z,new,new2
A,0.302665,1.693723,-1.706086,-1.159119,-0.856454,0
B,-0.134841,0.390528,0.166905,0.184502,0.049661,1
C,0.807706,0.07296,0.638787,0.329646,1.137352,0
D,-0.497104,-0.75407,-0.943406,0.484752,-0.012352,1
E,-0.116773,1.901755,0.238127,1.996652,1.879879,0


How to remove columns

In [38]:
# axis=1 drops a column rather than a row; inplace=True modifies df itself instead of returning a copy.
df.drop('new',axis=1, inplace = True)

In [41]:
# Remove the 'new2' column from df in place.
df.drop('new2',axis=1, inplace = True)

In [46]:
df.drop('A')

Unnamed: 0,W,X,Y,Z
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


Note: drop returns a new DataFrame and leaves df unchanged unless you pass inplace=True.

In [44]:
df.drop('E',axis=0) #removes rows, axis = 0 is default

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752


In [47]:
df.loc['A']

W    0.302665
X    1.693723
Y   -1.706086
Z   -1.159119
Name: A, dtype: float64

In [48]:
df.iloc[2] #position instead of label

W    0.807706
X    0.072960
Y    0.638787
Z    0.329646
Name: C, dtype: float64

Selecting subsets

In [49]:
df.loc['B','Y']

0.16690463609281317

In [50]:
df.loc[['A','B'],['W','Y']]

Unnamed: 0,W,Y
A,0.302665,-1.706086
B,-0.134841,0.166905


In [51]:
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [52]:
def zero_or_one(x):
    """Return 1 when ``x`` is greater than or equal to zero, otherwise 0."""
    return 1 if x >= 0 else 0

In [53]:
zero_or_one(5)

1

In [54]:
# Pass the function directly: apply calls it once per element, so a lambda wrapper is redundant.
df['W'].apply(zero_or_one)

A    1
B    0
C    1
D    0
E    0
Name: W, dtype: int64