In [11]:
# Core data-analysis stack used throughout the notebook.
import pandas as pd
import numpy as np
# `plt` is the conventional alias for the pyplot interface, not the top-level package.
import matplotlib.pyplot as plt

# `display` and `HTML` are public in IPython.display; importing them from
# IPython.core.display is deprecated.
from IPython.display import display, HTML
# Widen the notebook's .container element to 95% of the browser window.
display(HTML("<style>.container { width:95% !important; }</style>"))

# Run Shell Commands in Jupyter Notebook
Prefix a command with **!** to run it as a shell command from a notebook cell.

In [1]:
!python --version

Python 3.6.3 :: Anaconda custom (64-bit)


In [2]:
!pwd

/home/ubuntu/STA695/Week 2 - Python & Kaggle Introduction


# List, Tuple, Set and Dict
1. List
    + General purpose
    + Most widely used data structure
    + Grow and shrink size as needed
    + Sequence type
    + Sortable
2. Tuple
    + Immutable (can't add/change)
    + Useful for fixed data
    + Faster than Lists
    + Sequence Type
3. Set
    + Store non-duplicate items
    + Very fast membership tests (`in`) compared to Lists
    + Math Set operations (union, intersect)
4. Dict
    + Key/Value pair
    + Associative array

## List slicing
**list[start:stop:step]** — `start` is included, `stop` is excluded (so `stop` = index of the last wanted item + 1)

In [15]:
# Slices are half-open: the start index is included, the stop index is not.
x = [1, 2, 3, 4, 5, 6, 7]
print(x[1:4])    # indices 1 to 3 (stop index 4 is excluded)
print(x[1:6:2])  # indices 1, 3, 5 (every 2nd item)
print(x[4:])     # index 4 to the end
print(x[:3])     # indices 0 to 2
print(x[-1])     # last item
print(x[-3:])    # last 3 items
print(x[:-2])    # all except last 2 items

[2, 3, 4]
[2, 4, 6]
[5, 6, 7]
[1, 2, 3]
7
[5, 6, 7]
[1, 2, 3, 4, 5]


## List adding/concatenating and multiplying

In [43]:
x = [1, 2] + [3]
print(x)
x.append(4)
print(x)
x = ['a', 'b'] + ['c']
print(x)
x = 'a' + 'b'     # this is a string, not list
print(x)

[1, 2, 3]
[1, 2, 3, 4]
['a', 'b', 'c']
ab


In [19]:
x = [1, 3] * 3
print(x)
x = 'a' * 5     # again, this is a string
print(x)

[1, 3, 1, 3, 1, 3]
aaaaa


## Check existence

In [20]:
x = ['a', 'b', 'c']
'a' in x

True

In [22]:
print('a' in x)
print('a' not in x)

True
False


## Iterate through a list

In [31]:
# enumerate() yields (position, element) pairs, so no manual counter is needed.
x = [4, 7, 9]
for position, value in enumerate(x):
    # sep="" suppresses the default space between the printed pieces.
    print('index = ', position, ", x[", position, "] = ", value, sep="")

index = 0, x[0] = 4
index = 1, x[1] = 7
index = 2, x[2] = 9


## Length, Max, Min and Sum

In [36]:
len(x), max(x), min(x), sum(x)

(3, 9, 4, 20)

## Sort a list

In [71]:
x = [6, 9, 0]
# sorted() returns a new list and leaves x untouched; the result is neither
# assigned nor printed here, so this line produces no output.
sorted(x, reverse=True)
x = [6, 9, 0]
print(x)
# list.sort() sorts the list in place (ascending by default).
x.sort()
print(x)
# list.reverse() reverses the list in place.
x.reverse()
print(x)

[6, 9, 0]
[0, 6, 9]
[9, 6, 0]


## List comprehension (very important)

In [54]:
# Basic form: [expression for item in iterable]
x = [n for n in range(3)]
print(x)
# With a filter: keep only the values strictly between 1 and 5.
x = [n for n in range(9) if 1 < n < 5]
print(x)

[0, 1, 2]
[2, 3, 4]


In [3]:
print(range(1, 4))

range(1, 4)


## List delete, append, extend and insert

In [63]:
x = [1, 2, 3, 4, 5]
del x[1]            # `del` is a statement: removes the item at index 1
print(x)
x.append(8)         # adds a single item at the end
print(x)
x = [1, 2, 3, 4, 5]
x.append([1, 2])    # append adds its argument as ONE item -> embedded list
print(x)
x = [1, 2, 3, 4, 5]
x.extend([1, 2])    # extend adds each item of the iterable individually
print(x)
x = [1, 2, 3, 4, 5]
x.insert(2, 8)      # inserts 8 before index 2
print(x)

[1, 3, 4, 5]
[1, 3, 4, 5, 8]
[1, 2, 3, 4, 5, [1, 2]]
[1, 2, 3, 4, 5, 1, 2]
[1, 2, 8, 3, 4, 5]


# Set

In [79]:
A = [1,2,3]
B = ['a', 'b', 6, 1, 2]
# Convert each list once and reuse the two sets for every operation.
set_a, set_b = set(A), set(B)
print(set_a & set_b)    # intersection: items in both
print(set_a | set_b)    # union: items in either
print(set_a ^ set_b)    # symmetric difference (XOR): items in exactly one
print(set_a - set_b)    # difference: in A but not in B
print(set_a <= set_b)   # subset test: is every item of A also in B?

{1, 2}
{1, 2, 3, 6, 'b', 'a'}
{'b', 3, 6, 'a'}
{3}
False


# Dictionary

In [5]:
x = {'eric': 90, 'josh': 100, 'jin':80}
x

{'eric': 90, 'jin': 80, 'josh': 100}

In [8]:
list(x.keys())

['eric', 'josh', 'jin']

In [89]:
print(x.keys())
print(list(x.keys()))
print(x.values())
print(list(x.values()))

dict_keys(['eric', 'josh', 'jin'])
['eric', 'josh', 'jin']
dict_values([90, 100, 80])
[90, 100, 80]


In [90]:
# items() yields (key, value) pairs, so no second lookup per key is needed.
for key, value in x.items():
    print(key, value)

eric 90
josh 100
jin 80


# Important built-in functions
List of built-in functions in Python 3:

<https://docs.python.org/3/library/functions.html>

# zip
**Pairs up the elements of two iterables position by position — useful for element-wise comparison of two lists**

In [93]:
a = [1, 2, 3, 4, 5]
b = [2, 2, 9, 0, 9]
zip(a, b)

<zip at 0x1fef5e7ba08>

In [94]:
for i in zip(a, b):
    print(i)

(1, 2)
(2, 2)
(3, 9)
(4, 0)
(5, 9)


# lambda

`lambda` is just a shorthand to create an anonymous function. It's often used to create a one-off function (usually for scenarios when you need to pass a function as a parameter into another function). It can take any number of parameters, and it returns the value of a single expression.

```lambda <input>: <expression>```

# map

map takes a function, and applies it to each item in an iterable (such as a list).

```python
map(some_function, some_iterable)
```

In [95]:
map(lambda pair: max(pair), zip(a, b))

<map at 0x1fef5f417b8>

In [96]:
for i in map(lambda pair: max(pair), zip(a, b)):
    print(i)

2
2
9
4
9


In [97]:
list(map(lambda pair: max(pair), zip(a, b)))

[2, 2, 9, 4, 9]

# Docs for libraries and functions
To get the documentation for a certain library or function, try

<b style="color:blue;"> help() </b>

To list the attributes and methods (the "directory") of an object, try

<b style="color:blue;"> dir() </b>

**OR**, you should at least be able to search the web. Typically, [Stack Overflow](https://stackoverflow.com) and [Google](https://www.google.com) will be your best friends.

In [98]:
help(pd.DataFrame.min)

Help on function min in module pandas.core.frame:

min(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs)
    This method returns the minimum of the values in the object.
                If you want the *index* of the minimum, use ``idxmin``. This is
                the equivalent of the ``numpy.ndarray`` method ``argmin``.
    
    Parameters
    ----------
    axis : {index (0), columns (1)}
    skipna : boolean, default True
        Exclude NA/null values when computing the result.
    level : int or level name, default None
        If the axis is a MultiIndex (hierarchical), count along a
        particular level, collapsing into a Series
    numeric_only : boolean, default None
        Include only float, int, boolean columns. If None, will attempt to use
        everything, then use only numeric data. Not implemented for Series.
    
    Returns
    -------
    min : Series or DataFrame (if level specified)



In [13]:
dir(pd.DataFrame)

['T',
 '_AXIS_ALIASES',
 '_AXIS_IALIASES',
 '_AXIS_LEN',
 '_AXIS_NAMES',
 '_AXIS_NUMBERS',
 '_AXIS_ORDERS',
 '_AXIS_REVERSED',
 '_AXIS_SLICEMAP',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_wrap__',
 '__bool__',
 '__bytes__',
 '__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__div__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__imod__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lt__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdiv__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rfloordiv__',


## Object Creation

Creating a Series by passing a list of values, here with an explicit integer index running from 1 to 6 (omit `index` to let pandas create a default 0-based integer index):

In [16]:
s = pd.Series([1,3,5,np.nan,6,8], index=range(1, 7))
s

1    1.0
2    3.0
3    5.0
4    NaN
5    6.0
6    8.0
dtype: float64

Creating a DataFrame by passing a numpy array, with a datetime index and labeled columns:

In [17]:
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [18]:
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.440019,-1.201073,-0.221089,0.697222
2013-01-02,-0.991821,-0.31572,-0.510926,-0.079178
2013-01-03,1.190866,0.066244,0.323979,0.04011
2013-01-04,1.644875,0.282155,1.482164,-0.587044
2013-01-05,0.135864,-0.726151,-0.670911,0.240985
2013-01-06,1.599904,0.056818,-0.995458,1.420938


Creating a DataFrame by passing a dict of objects that can be converted to series-like.

In [19]:
# Each dict key becomes a column. The scalar values ('A', 'B', 'F') are
# repeated for every row, while 'C', 'D' and 'E' each supply four values.
df2 = pd.DataFrame({ 'A' : 1.,
                     'B' : pd.Timestamp('20130102'),
                     'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                     'D' : np.array([3] * 4,dtype='int32'),
                     'E' : pd.Categorical(["test","train","test","train"]),
                     'F' : 'foo' })

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [20]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [21]:
# Viewing Data

In [22]:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-0.440019,-1.201073,-0.221089,0.697222
2013-01-02,-0.991821,-0.31572,-0.510926,-0.079178
2013-01-03,1.190866,0.066244,0.323979,0.04011
2013-01-04,1.644875,0.282155,1.482164,-0.587044
2013-01-05,0.135864,-0.726151,-0.670911,0.240985


In [23]:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,1.644875,0.282155,1.482164,-0.587044
2013-01-05,0.135864,-0.726151,-0.670911,0.240985
2013-01-06,1.599904,0.056818,-0.995458,1.420938


Display the index, columns, and the underlying numpy data

In [24]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [25]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [26]:
df.values

array([[-0.44001891, -1.20107262, -0.22108891,  0.69722191],
       [-0.99182093, -0.31572038, -0.5109259 , -0.07917757],
       [ 1.19086558,  0.06624441,  0.32397861,  0.04011018],
       [ 1.64487542,  0.28215526,  1.48216437, -0.58704431],
       [ 0.13586446, -0.72615064, -0.67091129,  0.24098498],
       [ 1.59990409,  0.05681784, -0.99545761,  1.42093753]])

`describe()` shows a quick statistical summary of your data

In [27]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.523278,-0.306288,-0.098707,0.288839
std,1.116811,0.564607,0.89379,0.695009
min,-0.991821,-1.201073,-0.995458,-0.587044
25%,-0.296048,-0.623543,-0.630915,-0.049356
50%,0.663365,-0.129451,-0.366007,0.140548
75%,1.497644,0.063888,0.187712,0.583163
max,1.644875,0.282155,1.482164,1.420938


Transposing your data

In [28]:
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,-0.440019,-0.991821,1.190866,1.644875,0.135864,1.599904
B,-1.201073,-0.31572,0.066244,0.282155,-0.726151,0.056818
C,-0.221089,-0.510926,0.323979,1.482164,-0.670911,-0.995458
D,0.697222,-0.079178,0.04011,-0.587044,0.240985,1.420938


Sorting by an axis

In [29]:
# Rebind the name to the sorted result instead of sorting with inplace=True:
# sort_index() returns a new frame, which keeps the cell a plain assignment.
df = df.sort_index(axis=0, ascending=True)
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.440019,-1.201073,-0.221089,0.697222
2013-01-02,-0.991821,-0.31572,-0.510926,-0.079178
2013-01-03,1.190866,0.066244,0.323979,0.04011
2013-01-04,1.644875,0.282155,1.482164,-0.587044
2013-01-05,0.135864,-0.726151,-0.670911,0.240985
2013-01-06,1.599904,0.056818,-0.995458,1.420938


In [30]:
df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
2013-01-06,1.599904,0.056818,-0.995458,1.420938
2013-01-05,0.135864,-0.726151,-0.670911,0.240985
2013-01-04,1.644875,0.282155,1.482164,-0.587044
2013-01-03,1.190866,0.066244,0.323979,0.04011
2013-01-02,-0.991821,-0.31572,-0.510926,-0.079178
2013-01-01,-0.440019,-1.201073,-0.221089,0.697222


In [31]:
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-01,-0.440019,-1.201073,-0.221089,0.697222
2013-01-05,0.135864,-0.726151,-0.670911,0.240985
2013-01-02,-0.991821,-0.31572,-0.510926,-0.079178
2013-01-06,1.599904,0.056818,-0.995458,1.420938
2013-01-03,1.190866,0.066244,0.323979,0.04011
2013-01-04,1.644875,0.282155,1.482164,-0.587044


In [32]:
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-01,-0.440019,-1.201073,-0.221089,0.697222
2013-01-05,0.135864,-0.726151,-0.670911,0.240985
2013-01-02,-0.991821,-0.31572,-0.510926,-0.079178
2013-01-06,1.599904,0.056818,-0.995458,1.420938
2013-01-03,1.190866,0.066244,0.323979,0.04011
2013-01-04,1.644875,0.282155,1.482164,-0.587044


# Selection

Selecting a single column, which yields a Series, equivalent to df.A

In [134]:
df['A']

2013-01-01    0.396717
2013-01-02   -0.662886
2013-01-03   -0.712939
2013-01-04    0.034589
2013-01-05    0.144436
2013-01-06    1.589120
Freq: D, Name: A, dtype: float64

Selecting via ```[]```, which slices the rows.

In [135]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,0.396717,-0.621587,0.015427,0.782382
2013-01-02,-0.662886,0.579109,0.140891,1.098659
2013-01-03,-0.712939,-0.054714,0.433275,-0.242728


In [33]:
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-0.991821,-0.31572,-0.510926,-0.079178
2013-01-03,1.190866,0.066244,0.323979,0.04011
2013-01-04,1.644875,0.282155,1.482164,-0.587044


## Selection by Label


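See more in the "Selection by Label" section of the [pandas indexing user guide](https://pandas.pydata.org/docs/user_guide/indexing.html).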

For getting a cross section (a single row): `iloc` selects it by integer position, `loc` selects it by label

In [34]:
df.head(3)

Unnamed: 0,A,B,C,D
2013-01-01,-0.440019,-1.201073,-0.221089,0.697222
2013-01-02,-0.991821,-0.31572,-0.510926,-0.079178
2013-01-03,1.190866,0.066244,0.323979,0.04011


In [35]:
df.iloc[0]

A   -0.440019
B   -1.201073
C   -0.221089
D    0.697222
Name: 2013-01-01 00:00:00, dtype: float64

In [137]:
 df.loc[dates[0]]

A    0.396717
B   -0.621587
C    0.015427
D    0.782382
Name: 2013-01-01 00:00:00, dtype: float64

In [39]:
df.loc['20130101':'20130104', ['A','B']]

Unnamed: 0,A,B
2013-01-01,-0.440019,-1.201073
2013-01-02,-0.991821,-0.31572
2013-01-03,1.190866,0.066244
2013-01-04,1.644875,0.282155


Showing label slicing, both endpoints are included

In [139]:
df.loc['20130102':'20130104',['A','B']]

Unnamed: 0,A,B
2013-01-02,-0.662886,0.579109
2013-01-03,-0.712939,-0.054714
2013-01-04,0.034589,-0.072122


Reduction in the dimensions of the returned object

In [140]:
df.loc['20130102',['A','B']]

A   -0.662886
B    0.579109
Name: 2013-01-02 00:00:00, dtype: float64

For getting a scalar value

In [41]:
df.iloc[0, 0]

-0.44001891098335583

In [141]:
df.loc[dates[0],'A']

0.39671681281585658

## Selection by Position

Select via the position of the passed integers

In [142]:
df.iloc[3]

A    0.034589
B   -0.072122
C   -1.028810
D   -0.733341
Name: 2013-01-04 00:00:00, dtype: float64

By integer slices, acting similar to numpy/python

In [143]:
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2013-01-04,0.034589,-0.072122
2013-01-05,0.144436,1.283995


By lists of integer position locations, similar to the numpy/python style

In [144]:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,-0.662886,0.140891
2013-01-03,-0.712939,0.433275
2013-01-05,0.144436,-0.141834


For slicing rows explicitly

In [42]:
df.iloc[1:3]

Unnamed: 0,A,B,C,D
2013-01-02,-0.991821,-0.31572,-0.510926,-0.079178
2013-01-03,1.190866,0.066244,0.323979,0.04011


For slicing columns explicitly



In [43]:
df.iloc[:,1:3]

Unnamed: 0,B,C
2013-01-01,-1.201073,-0.221089
2013-01-02,-0.31572,-0.510926
2013-01-03,0.066244,0.323979
2013-01-04,0.282155,1.482164
2013-01-05,-0.726151,-0.670911
2013-01-06,0.056818,-0.995458


For getting a value explicitly



In [44]:
df.iloc[1,1]

-0.31572038199420083

For getting fast access to a scalar (equiv to the prior method)

In [45]:
df.iat[1,1]

-0.31572038199420083

# Boolean Indexing

Using a single column’s values to select data.

In [47]:
df[df.A < 0]

Unnamed: 0,A,B,C,D
2013-01-01,-0.440019,-1.201073,-0.221089,0.697222
2013-01-02,-0.991821,-0.31572,-0.510926,-0.079178


Selecting values from a DataFrame where a boolean condition is met.

In [48]:
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,,,,0.697222
2013-01-02,,,,
2013-01-03,1.190866,0.066244,0.323979,0.04011
2013-01-04,1.644875,0.282155,1.482164,
2013-01-05,0.135864,,,0.240985
2013-01-06,1.599904,0.056818,,1.420938


Using the ```isin()``` method for filtering:

In [49]:
df2 = df.copy()
df2['E'] = ['one', 'one','two','three','four','three']
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.440019,-1.201073,-0.221089,0.697222,one
2013-01-02,-0.991821,-0.31572,-0.510926,-0.079178,one
2013-01-03,1.190866,0.066244,0.323979,0.04011,two
2013-01-04,1.644875,0.282155,1.482164,-0.587044,three
2013-01-05,0.135864,-0.726151,-0.670911,0.240985,four
2013-01-06,1.599904,0.056818,-0.995458,1.420938,three


In [50]:
df2['E'].isin(['two','four'])

2013-01-01    False
2013-01-02    False
2013-01-03     True
2013-01-04    False
2013-01-05     True
2013-01-06    False
Freq: D, Name: E, dtype: bool

In [51]:
df2[df2['E'].isin(['two','four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,1.190866,0.066244,0.323979,0.04011,two
2013-01-05,0.135864,-0.726151,-0.670911,0.240985,four


# Setting

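Setting a new column automatically aligns the data by the indexes. The Series below covers 2013-01-02 to 2013-01-07, so it only partly overlaps `df`'s index; assigning it with `df['F'] = s1` would keep the values for the shared dates and fill the missing date with NaN.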



In [52]:
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

Setting values by label



In [53]:
df.at[dates[0],'A'] = 0

Setting values by position



In [54]:
df.iat[0,1] = 0

Setting by assigning with a numpy array



In [55]:
df.loc[:,'D'] = np.array([5] * len(df))

In [56]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-0.221089,5
2013-01-02,-0.991821,-0.31572,-0.510926,5
2013-01-03,1.190866,0.066244,0.323979,5
2013-01-04,1.644875,0.282155,1.482164,5
2013-01-05,0.135864,-0.726151,-0.670911,5
2013-01-06,1.599904,0.056818,-0.995458,5


# Operations

## Stats

In [59]:
df.mean()

A    0.596615
B   -0.106109
C   -0.098707
D    5.000000
dtype: float64

Same operation on the other axis

In [60]:
# axis=1 averages across the columns, giving one value per row.
df.mean(axis=1)

2013-01-01    1.194728
2013-01-02    0.795383
2013-01-03    1.645272
2013-01-04    2.102299
2013-01-05    0.934701
2013-01-06    1.415316
Freq: D, dtype: float64

Operating with objects that have different dimensionality and need alignment. In addition, pandas automatically broadcasts along the specified dimension.

In [61]:
s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [165]:
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D
2013-01-01,,,,
2013-01-02,,,,
2013-01-03,-1.712939,-1.054714,-0.566725,4.0
2013-01-04,-2.965411,-3.072122,-4.02881,2.0
2013-01-05,-4.855564,-3.716005,-5.141834,0.0
2013-01-06,,,,


# Apply

In [64]:
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-0.221089,5
2013-01-02,-0.991821,-0.31572,-0.732015,10
2013-01-03,0.199045,-0.249476,-0.408036,15
2013-01-04,1.84392,0.032679,1.074128,20
2013-01-05,1.979785,-0.693471,0.403217,25
2013-01-06,3.579689,-0.636654,-0.592241,30


In [65]:
df.apply(lambda x: x.max() - x.min())

A    2.636696
B    1.008306
C    2.477622
D    0.000000
dtype: float64

# Histogramming

In [66]:
s = pd.Series(np.random.randint(0, 7, size=10))
s

0    2
1    4
2    0
3    4
4    5
5    3
6    2
7    1
8    2
9    0
dtype: int64

In [67]:
s.value_counts()

2    3
4    2
0    2
5    1
3    1
1    1
dtype: int64

# String Methods

In [68]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

# Merge

## Concat

Concatenating pandas objects together with ```concat()```:

In [69]:
df = pd.DataFrame(np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,0.175631,0.14553,-0.113008,2.473142
1,0.769794,0.81609,-1.243019,0.390894
2,-0.833972,1.022541,-0.289962,0.475655
3,0.526831,1.193194,0.486279,-0.451824
4,0.629996,-0.217671,-0.522091,-0.351222
5,-1.434021,0.045771,-2.372978,1.8036
6,-0.118409,-0.851575,1.501821,0.502889
7,-0.707389,1.602869,0.777512,-0.128045
8,0.211352,1.36686,-0.783899,0.848974
9,-0.837099,-1.576005,-0.444874,-2.467099


In [72]:
pieces = [df[:3], df[3:7], df[7:]]
pieces[2]

Unnamed: 0,0,1,2,3
7,-0.707389,1.602869,0.777512,-0.128045
8,0.211352,1.36686,-0.783899,0.848974
9,-0.837099,-1.576005,-0.444874,-2.467099


In [74]:
# axis=1 places the pieces side by side and aligns them on the row index; the
# three pieces share no row labels, so each one is padded with NaN elsewhere.
# ignore_index=True relabels the concatenation axis (here the columns) 0..11.
# With the default axis=0 the pieces would instead be stacked row-wise.
pd.concat([df[:3], df[3:7], df[7:]], axis=1, ignore_index=True)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.175631,0.14553,-0.113008,2.473142,,,,,,,,
1,0.769794,0.81609,-1.243019,0.390894,,,,,,,,
2,-0.833972,1.022541,-0.289962,0.475655,,,,,,,,
3,,,,,0.526831,1.193194,0.486279,-0.451824,,,,
4,,,,,0.629996,-0.217671,-0.522091,-0.351222,,,,
5,,,,,-1.434021,0.045771,-2.372978,1.8036,,,,
6,,,,,-0.118409,-0.851575,1.501821,0.502889,,,,
7,,,,,,,,,-0.707389,1.602869,0.777512,-0.128045
8,,,,,,,,,0.211352,1.36686,-0.783899,0.848974
9,,,,,,,,,-0.837099,-1.576005,-0.444874,-2.467099


# Append

In [75]:
df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
0,0.05499,1.674641,1.541578,-0.146672
1,-0.615559,0.914713,0.37866,0.656751
2,-0.576296,-1.229714,0.668924,-1.166951
3,-0.963121,-0.817834,1.158253,-1.078459
4,-1.703898,-0.390848,-0.473936,-0.43039
5,-0.235521,0.490527,1.203916,-0.964865
6,-1.111766,2.050447,-0.895151,-0.198276
7,1.353307,1.309498,0.240841,0.582311


In [76]:
# Take row 3 as a Series; its index holds the column labels A-D.
s = df.iloc[3]
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0. Turn the
# Series into a one-row frame and concatenate instead; ignore_index=True
# renumbers the rows 0..n-1, matching the old append(..., ignore_index=True).
pd.concat([df, s.to_frame().T], ignore_index=True)

Unnamed: 0,A,B,C,D
0,0.05499,1.674641,1.541578,-0.146672
1,-0.615559,0.914713,0.37866,0.656751
2,-0.576296,-1.229714,0.668924,-1.166951
3,-0.963121,-0.817834,1.158253,-1.078459
4,-1.703898,-0.390848,-0.473936,-0.43039
5,-0.235521,0.490527,1.203916,-0.964865
6,-1.111766,2.050447,-0.895151,-0.198276
7,1.353307,1.309498,0.240841,0.582311
8,-0.963121,-0.817834,1.158253,-1.078459


# Grouping

By “group by” we are referring to a process involving one or more of the following steps:

+ Splitting the data into groups based on some criteria
+ Applying a function to each group independently
+ Combining the results into a data structure


In [77]:
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three',
                           'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})
df

Unnamed: 0,A,B,C,D
0,foo,one,0.063154,1.519884
1,bar,one,-1.269894,-1.114606
2,foo,two,0.936247,-1.070197
3,bar,three,-0.900273,-0.547537
4,foo,two,-0.373174,-0.742806
5,bar,two,1.143329,0.340642
6,foo,one,-0.338253,-1.108563
7,foo,three,0.115481,0.310169


Grouping and then applying a function ```sum``` to the resulting groups.



In [78]:
df.groupby('A').sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-1.026839,-1.321501
foo,0.403455,-1.091513


In [79]:
df.groupby(['A','B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-1.269894,-1.114606
bar,three,-0.900273,-0.547537
bar,two,1.143329,0.340642
foo,one,-0.275099,0.411321
foo,three,0.115481,0.310169
foo,two,0.563072,-1.813003


# Import/Export Data

## CSV

Writing to a csv

In [183]:
df.to_csv('foo.csv')

Reading from a csv

In [184]:
# to_csv() wrote the row index as an unnamed first column, so it is read back
# as an ordinary "Unnamed: 0" column; pass index_col=0 to restore it as the index.
pd.read_csv('foo.csv')

Unnamed: 0.1,Unnamed: 0,A,B,C,D
0,0,foo,one,0.463433,1.034696
1,1,bar,one,1.05515,-0.005757
2,2,foo,two,0.287593,-0.092019
3,3,bar,three,0.021715,0.043438
4,4,foo,two,-0.434303,-0.196786
5,5,bar,two,1.047992,-0.171141
6,6,foo,one,-0.091051,0.332254
7,7,foo,three,0.940632,1.115727
