In [None]:
import pandas as pd
import random
import numpy as np
import matplotlib
% matplotlib inline

In [None]:
# Load the sample frame, then relabel rows and columns with zero-padded
# positional names ("R00", "R01", ... and "C00", "C01", ...).
df = pd.read_json('../data/sampledf.json')
df.index = list(map("R{:02d}".format, range(df.shape[0])))
df.columns = list(map("C{:02d}".format, range(df.shape[1])))
df

### Adding and deleting Series in a DataFrame

In [None]:
# Assigning a plain list sets column 'C10' position by position
# (one random integer in 1..100 per row).
df['C10'] = [random.randint(1, 100) for _ in df.index]
df

In [None]:
# Caveat when adding a Series: built from a bare list it carries the default
# integer index (0, 1, 2, ...), not the "Rxx" labels of the frame.
new_series = pd.Series([random.randint(1, 100) for _ in range(len(df))])
new_series

In [None]:
# Series assignment aligns on index labels. new_series is indexed 0..n-1 while
# df is indexed "R00".., so no labels match and 'C10' is expected to be all NaN.
df['C10'] = new_series
df

In [None]:
# Adding a new Series: mind the index of the DataFrame.
# Reusing df's index makes the labels line up for the assignment that follows.
new_series.index = df.index
new_series

In [None]:
# With matching index labels the values now land in 'C10' instead of NaN.
df['C10'] = new_series
df

In [None]:
# Assigning to key 'C11' creates that column (or overwrites it if present).
df['C11'] = new_series
df

In [None]:
# `del` removes the column from the frame in place.
del df['C11']
df

In [None]:
# Deleting a column that does not exist raises KeyError. Catch and show it so
# the demonstration does not abort a "Restart & Run All".
try:
    del df['C12']
except KeyError as err:
    print("KeyError:", err)

In [None]:
# Avoid KeyErrors if unsure whether the column exists: errors='ignore' skips
# labels that are not found. `columns=` is required because drop() targets row
# labels (axis=0) by default. A new frame is returned; df itself is unchanged.
df.drop(columns=['C12'], errors='ignore')

In [None]:
# Selecting the wanted columns is simpler than deleting many unwanted ones.
dfs = df[['C03', 'C05']]
dfs

### Mangle your data

**recall: NumPy Broadcasting**

In [None]:
# Column vector of shape (3, 1) filled with uniform random floats from [0, 1).
a = np.random.rand(3,1)
a

In [None]:
# Second (3, 1) column vector, same shape as `a`.
b = np.random.rand(3,1)
b

In [None]:
# Same-shape operands: arithmetic is applied element by element.
a + b

In [None]:
a - b

In [None]:
a * b

In [None]:
a / b

In [None]:
# A scalar is broadcast against every element.
a + 7

In [None]:
# Floor division; every value of b is below 1, so the result is all zeros.
b // 2

In [None]:
# Comparisons are element-wise as well and produce a boolean array.
a % 2 == 0

In [None]:
# (3, 1) times a length-2 list broadcasts to a (3, 2) result.
a * [4, 5]

### Pandas

In [None]:
# Read the sales records; 'birthday' and 'orderdate' are parsed as dates.
sales_data = pd.read_json(
    '../data/blooth_sales_data.json',
    convert_dates=['birthday', 'orderdate'],
)
sales_data.head(5)

In [None]:
sales_data['turnover'] = sales_data['unitprice'] * sales_data['units']
sales_data.head(5)

In [None]:
sales_data['turnover'].mean()

In [None]:
# Column means. numeric_only=True restricts the result to numeric columns;
# since pandas 2.0 the default no longer silently drops columns that cannot be
# averaged (e.g. strings) and raises TypeError instead.
sales_data.mean(numeric_only=True)

In [None]:
# Total turnover over all rows.
sales_data['turnover'].sum()

In [None]:
# Median turnover.
sales_data['turnover'].median()

In [None]:
# Column names, dtypes and non-null counts of the frame.
sales_data.info()

In [None]:
# map for Series
sales_data['year'] = sales_data['orderdate'].map(lambda x: x.year)
sales_data.head(5)

In [None]:
# map for Series
sales_data['month'] = sales_data['orderdate'].map(lambda x: x.month)
sales_data.head(5)

In [None]:
# combine multiple columns
sales_data['year-month'] = sales_data['year'].map(
    str) + sales_data['month'].map(lambda x: "-{:02d}".format(x))

In [None]:
sales_data.head(5)

In [None]:
# applymap for df: the function receives one cell value at a time.
# NOTE(review): pandas 2.1 deprecates DataFrame.applymap in favour of
# DataFrame.map — confirm the target pandas version before switching.
sales_data[['unitprice', 'units']].applymap(lambda x: x / 1.5).head(5)

In [None]:
# apply hands the function a whole column (Series) at a time; the division is
# vectorised, so the values match the applymap version.
sales_data[['unitprice', 'units']].apply(lambda x: x / 1.5).head(5)

In [None]:
# remember: a copy of the DF is returned. The DF is not altered.
sales_data.head(5)

In [None]:
# Assigning the mapped Series back is what actually changes the column.
# Re-running this cell doubles the prices again; 'turnover' is not recomputed.
sales_data['unitprice'] = sales_data['unitprice'].map(lambda x: x * 2)

In [None]:
# Summary statistics of the frame's columns.
sales_data.describe()

In [None]:
# describe() on the single 'birthday' column.
sales_data['birthday'].describe()

In [None]:
# Cast turnover to integers (any fractional part is truncated). The builtin
# `int` replaces the `np.int` alias, which was removed in NumPy 1.24.
sales_data['turnover'] = sales_data['turnover'].astype(int)
sales_data.head(5)

In [None]:
# Sneak in missing values. DataFrame.set_value was removed in pandas 1.0;
# .at is the label-based scalar setter that replaces it. `np.nan` is used
# because the `np.NaN` alias no longer exists in NumPy 2.0.
sales_data.at[1, 'unitprice'] = np.nan
sales_data.at[4, 'unitprice'] = np.nan
sales_data.at[3, 'orderdate'] = pd.NaT  # missing-value marker for datetimes
sales_data.head(5)

In [None]:
# Boolean mask: keep only the rows whose unitprice is missing.
sales_data[sales_data['unitprice'].isnull()]

In [None]:
# dropna() returns a copy without any row that contains a missing value.
sales_data.dropna().head(5)

In [None]:
sales_data.fillna(99.99).head(5)

In [None]:
sales_data.head()

In [None]:
sales_data.fillna(99.99, inplace=True)
sales_data.head(5)