# Python for Data Analysis - Workbook 1 (Python Basics + NumPy Basics)

### Preliminaries

In [1]:
import numpy as np
import pandas as pd

## Basic Data Structures

### Series

In [4]:
# A pandas Series pairs a one-dimensional sequence of values with an
# accompanying array of labels, called its index.
# No index is passed here, so the default integer labels 0..n-1 are used.

obj = pd.Series(data=[5, -2, 13, 4])
obj

0     5
1    -2
2    13
3     4
dtype: int64

In [5]:
# The labels can be chosen explicitly through the `index` argument;
# labels are matched to the values position by position.

obj = pd.Series(data=[5, -2, 13, 4], index=list('acdb'))
obj

a     5
c    -2
d    13
b     4
dtype: int64

In [10]:
# A Series can be built straight from a Python dict: the keys become the
# index labels and the dict values become the Series values.

capitals = {
    'New York': 'Albany',
    'Arizona': 'Phoenix',
    'Montana': 'Helena',
}
# Not the last expression of the cell, so under Jupyter's default display
# settings this Series is built but not rendered.
pd.Series(data=capitals)

# Passing `index` as well controls the order of the entries.
# A label with no matching dict key ('Texas' here) gets a missing value (NaN).

pd.Series(data=capitals, index=['Montana', 'Arizona', 'New York', 'Texas'])

Montana      Helena
Arizona     Phoenix
New York     Albany
Texas           NaN
dtype: object

In [15]:
# Checking for missing data

capitals = {'New York': 'Albany', 'Arizona': 'Phoenix', 'Montana': 'Helena'}
# 'Texas' is not a key of `capitals`, so its entry in the Series is NaN.
capitals2 = pd.Series(capitals, index=['Montana', 'Arizona', 'New York', 'Texas'])

# isnull() returns a boolean Series that is True exactly where a value is
# missing -- here only for 'Texas'.
# Equivalent / complementary spellings:
#   pd.isnull(capitals2)    same result as capitals2.isnull()
#   capitals2.notnull()     element-wise inverse (True where a value is present)
#   pd.notnull(capitals2)   same result as capitals2.notnull()
capitals2.isnull()



Montana     False
Arizona     False
New York    False
Texas        True
dtype: bool

In [30]:
# A Series and its index each carry a `name` attribute; both are plain
# attributes that can be set by assignment.

a = dict(Red=29, Blue=42, Green=24)
obj = pd.Series(a)
obj.index.name = 'Colors'
obj.name = 'Numbers'
# Show the Series first, then its index, each on its own line.
print(obj, obj.index, sep='\n')

Colors
Red      29
Blue     42
Green    24
Name: Numbers, dtype: int64
Index(['Red', 'Blue', 'Green'], dtype='object', name='Colors')


In [34]:
# Assigning a new sequence of labels to `.index` relabels the Series in place;
# the values themselves are left as they are.

obj = pd.Series(data=[29, 42, 24], index=['Red', 'Blue', 'Green'])
obj.index = pd.Index(['Purple', 'Yellow', 'Orange'])
obj

Purple    29
Yellow    42
Orange    24
dtype: int64

### DataFrame

In [35]:
# A DataFrame is a rectangular table of data: an ordered collection of columns, each of which can hold a different value type
# It can also represent higher-dimensional data through hierarchical indexing

In [39]:
# One way to build a DataFrame is from a dict of equal-length lists or NumPy
# arrays: each key becomes a column name, each list that column's values.

data = dict(
    state=['Ohio'] * 3 + ['Nevada'] * 3,
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)

pd.DataFrame(data=data)

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [48]:
# head() returns the first rows of a DataFrame: five by default, or however
# many are requested, e.g. data.head(3) for the first three.

data = pd.DataFrame(
    data={
        'state': ['Ohio'] * 3 + ['Nevada'] * 3,
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
    }
)

data.head(5)

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [49]:
# The `columns` argument fixes the order in which the columns are laid out.

pd.DataFrame(
    data=data,
    columns=['year', 'state', 'pop'],
)

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [50]:
# A requested column that is not present in the data ('debt' here) is still
# created, filled entirely with missing values (NaN).

pd.DataFrame(
    data=data,
    columns=['year', 'state', 'pop', 'debt'],
)

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [54]:
# A column can be retrieved with dict-like indexing (data['year']) or with
# attribute-like access (data.year); both give the column as a Series.
# NOTE: attribute access only works when the column name is a valid Python
# identifier that does not clash with an existing DataFrame attribute or
# method. For example, data.pop refers to the DataFrame.pop method rather than
# the 'pop' column, so that column has to be read as data['pop'].

data['year']
# data.year   (attribute-style equivalent of the line above)

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [58]:
# Retrieve a row by its index label with the `loc` attribute.
# `data` is presumably the DataFrame built without an explicit index, so its
# labels are the default integers and label 0 is the first row.
# The row comes back as a Series indexed by the column names.

data.loc[0]

state    Ohio
year     2000
pop       1.5
Name: 0, dtype: object

In [60]:
# Assigning to a column modifies it (or creates it).
# The assigned value may be a scalar, which is repeated for every row.

frame2 = pd.DataFrame(
    data=data,
    columns=['year', 'state', 'pop', 'debt'],
)

frame2['debt'] = 10
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,10
1,2001,Ohio,1.7,10
2,2002,Ohio,3.6,10
3,2001,Nevada,2.4,10
4,2002,Nevada,2.9,10
5,2003,Nevada,3.2,10


In [64]:
# ... or an array holding one value per row.
# np.random.randint(low, high, size) draws `size` integers from [low, high).
# NOTE(review): no seed is set in the cells shown, so the values presumably
# differ from run to run and the stored output is just one sample.

frame2['debt'] = np.random.randint(low=0, high=20, size=6)
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,5
1,2001,Ohio,1.7,10
2,2002,Ohio,3.6,18
3,2001,Nevada,2.4,8
4,2002,Nevada,2.9,11
5,2003,Nevada,3.2,19
