## Extracting and transforming data 

In [2]:
import pandas as pd
# sclicing in DataFrame is a view not a copy, but if need new line, the original are not changed
df2 = pd.DataFrame([['1980', 'Blondie', 'Call Me', '6'],
       ['1981', 'Chistorpher Cross', 'Arthurs Theme', '3'],
       ['1982', 'Joan Jett', 'I Love Rock and Roll', '7']])
df2.columns = ['year', 'artist', 'song', 'chart weeks']
# df3 = df2.ix[:2, 'artist':'chart weeks'].copy()
df3 = df2.ix[:2, 'artist':'chart weeks']
df3['chart weeks'] = df3['chart weeks'] + '2'
df3['test'] = 10
print df3
print df2

              artist                  song chart weeks  test
0            Blondie               Call Me          62    10
1  Chistorpher Cross         Arthurs Theme          32    10
2          Joan Jett  I Love Rock and Roll          72    10
   year             artist                  song chart weeks
0  1980            Blondie               Call Me          62
1  1981  Chistorpher Cross         Arthurs Theme          32
2  1982          Joan Jett  I Love Rock and Roll          72


In [3]:
# slicing in Series returns a view not a copy
a = pd.Series([0,2,3,4,5])
a[1:4] = 9
a

0    0
1    9
2    9
3    9
4    5
dtype: int64

In [None]:
# Create the boolean array: high_turnout
high_turnout = election['turnout'] > 70

# Filter the election DataFrame with the high_turnout array: high_turnout_df
high_turnout_df = election[high_turnout]

# Print the high_turnout_results DataFrame
print(high_turnout_df)

# Import numpy
import numpy as np

# Create the boolean array: too_close
too_close = election['margin'] < 1

# Assign np.nan to the 'winner' column where the results were too close to call
election['winner'][too_close] = np.nan

# Print the output of election.info()
print(election.info())

# Select the 'age' and 'cabin' columns: df
df = titanic[['age', 'cabin']]

# Print the shape of df
print(df.shape)

# Drop rows in df with how='any' and print the shape
print(df.dropna(how='any').shape)

# Drop rows in df with how='all' and print the shape
print(df.dropna(how='all').shape)

# Call .dropna() with thresh=1000 and axis='columns' and print the output of .info() from titanic
print(titanic.dropna(thresh=1000, axis='columns').info())

In [None]:
# Write a function to convert degrees Fahrenheit to degrees Celsius: to_celsius
def to_celsius(F):
    return 5/9*(F - 32)

# Apply the function over 'Mean TemperatureF' and 'Mean Dew PointF': df_celsius
df_celsius = weather[['Mean TemperatureF','Mean Dew PointF']].apply(to_celsius)

# Reassign the columns df_celsius
df_celsius.columns = ['Mean TemperatureC', 'Mean Dew PointC']

# Print the output of df_celsius.head()
print(df_celsius.head())

# The .map() method is used to transform values according to a Python dictionary look-up.
# Create the dictionary: red_vs_blue
red_vs_blue = {'Obama':'blue' , 'Romney':'red'}

# Use the dictionary to map the 'winner' column to the new column: election['color']
election['color'] = election['winner'].map(red_vs_blue)

# Print the output of election.head()
print(election.head())



# When performance is paramount, you should avoid using .apply() and .map() 
# because those constructs perform Python for-loops over the data stored in a pandas Series or DataFrame. 
# By using vectorized functions instead, you can loop over the data at the same speed as compiled code (C, Fortran, etc.)! 
# NumPy, SciPy and pandas come with a variety of vectorized functions (called Universal Functions or UFuncs in NumPy).
# Import zscore from scipy.stats
from scipy.stats import zscore

# Call zscore with election['turnout'] as input: turnout_zscore
turnout_zscore = zscore(election['turnout'])

# Print the type of turnout_zscore
print(type(turnout_zscore))

# Assign turnout_zscore to a new column: election['turnout_zscore']
election['turnout_zscore'] = turnout_zscore

# Print the output of election.head()
print(election.head())



## Advanced indexing

In [44]:
# indexes are immutable objects. This means that if you want to change or modify the index in a dataframe, 
# then you need to change the whole index.

# In [1]: sales
# Out[1]: 
#              eggs  salt  spam
# state month                  
# CA    1        47  12.0    17
#       2       110  50.0    31
# NY    1       221  89.0    72
#       2        77  87.0    20
# TX    1       132   NaN    52
#       2       205  60.0    55

# Create the list of new indexes: new_idx
new_idx = [month.upper() for month in sales.index]

# Assign new_idx to sales.index
sales.index = new_idx

# Print the sales DataFrame
print(sales)


# Assign the string 'MONTHS' to sales.index.name
sales.index.name = 'MONTHS'

# Print the sales DataFrame
print(sales)

# Assign the string 'PRODUCTS' to sales.columns.name 
sales.columns.name = 'PRODUCTS'

# Print the sales dataframe again
print(sales)

# You can also build the DataFrame and index independently, and then put them together. 
# If you take this route, be careful, 
# as any mistakes in generating the DataFrame or the index can cause the data and the index to be aligned incorrectly.
# Generate the list of months: months
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun']

# Assign months to sales.index
sales.index = months

# Print the modified sales DataFrame
print(sales)

# Extracting elements from the outermost level of a MultiIndex is just like in the case of a single-level Index
# Print sales.loc[['CA', 'TX']]
print(sales.loc[['CA', 'TX']])

# Print sales['CA':'TX']
# this selects rows with slicing. different from choosing columns with bracket
print(sales['CA':'TX'])

# With a MultiIndex, you should always ensure the index is sorted. 
# You can skip this only if you know the data is already sorted on the index fields.
# Set the index to be the columns ['state', 'month']: sales
sales = sales.set_index(['state', 'month'])

# Sort the MultiIndex: sales
sales = sales.sort_index()

# Print the sales DataFrame
print(sales)

# Set the index to the column 'state': sales
sales = sales.set_index(['state'])

# Print the sales DataFrame
print(sales)

# Access the data from 'NY'
print(sales.loc['NY'])


# Looking up indexed data is fast and efficient. And you have already seen that lookups based on the outermost 
# level of a MultiIndex work just like lookups on DataFrames that have a single-level Index.
# Looking up data based on inner levels of a MultiIndex can be a bit trickier.
# you need to use slice(None) in the slicing parameter for the outermost dimension(s) instead of the usual :, 
# or use pd.IndexSlice. 
# http://pandas.pydata.org/pandas-docs/stable/advanced.html
# Look up data for NY in month 1: NY_month1
NY_month1 = sales.loc[('NY', 1)]

# Look up data for CA and TX in month 2: CA_TX_month2
CA_TX_month2 = sales.loc[(['CA', 'TX'], 2),:]

# Look up data for all states in month 2: all_month2
all_month2 = sales.loc[(slice(None), 2),:]


     city weekday visitors signups
0  Austin     Mon      326       3
1  Austin     Sun      139       7
2  Dallas     Mon      456       5
3  Dallas     Sun      237      12


AttributeError: 'numpy.int64' object has no attribute 'upper'

## Rearranging and reshaping data 

### Pivot - spread rows into columns

In [56]:
# In [2]: users
# Out[2]: 
#   weekday    city  visitors  signups
# 0     Sun  Austin       139        7
# 1     Sun  Dallas       237       12
# 2     Mon  Austin       326        3
# 3     Mon  Dallas       456        5

#prepare the sales dataframe
import numpy as np
users_values = np.array([['Austin', 'Mon', 326, 3],
       ['Austin', 'Sun', 139, 7],
       ['Dallas', 'Mon', 456, 5],
       ['Dallas', 'Sun', 237, 12]], dtype=object)
users = pd.DataFrame(sales_values)
users.columns = ['city', 'weekday', 'visitors', 'signups']
print users

# Pivot the users DataFrame: visitors_pivot
visitors_pivot = users.pivot(index='weekday', columns='city', values='visitors')

# Print the pivoted DataFrame
print(visitors_pivot)

# Pivot users with signups indexed by weekday and city: signups_pivot
signups_pivot = users.pivot(index='weekday', columns='city', values='signups')

# Print signups_pivot
print(signups_pivot)

# Pivot users pivoted by both signups and visitors: pivot
pivot = users.pivot(index='weekday', columns='city')

# Print the pivoted DataFrame
print(pivot)

     city weekday visitors signups
0  Austin     Mon      326       3
1  Austin     Sun      139       7
2  Dallas     Mon      456       5
3  Dallas     Sun      237      12
city    Austin Dallas
weekday              
Mon        326    456
Sun        139    237
city    Austin Dallas
weekday              
Mon          3      5
Sun          7     12
        visitors        signups       
city      Austin Dallas  Austin Dallas
weekday                               
Mon          326    456       3      5
Sun          139    237       7     12


### Stack & Unstack

In [57]:
# for multilevel index, the above pivot method won't work
# so introduce stack and unstack methods.
users = users.set_index(['city', 'weekday'])
# Unstack users by 'weekday': byweekday
byweekday = users.unstack(level='weekday')

# Print the byweekday DataFrame
print(byweekday)

# Stack byweekday by 'weekday' and print it
print(byweekday.stack(level='weekday'))

# Unstack users by 'city': bycity
bycity = users.unstack(level='city')

# Print the bycity DataFrame
print(bycity)

# Stack bycity by 'city' and print it
print(bycity.stack(level='city'))

        visitors      signups    
weekday      Mon  Sun     Mon Sun
city                             
Austin       326  139       3   7
Dallas       456  237       5  12
               visitors signups
city   weekday                 
Austin Mon          326       3
       Sun          139       7
Dallas Mon          456       5
       Sun          237      12
        visitors        signups       
city      Austin Dallas  Austin Dallas
weekday                               
Mon          326    456       3      5
Sun          139    237       7     12
               visitors signups
weekday city                   
Mon     Austin      326       3
        Dallas      456       5
Sun     Austin      139       7
        Dallas      237      12


### Swap level

In [50]:
# Stack 'city' back into the index of bycity: newusers
newusers = bycity.stack(level='city')

# Swap the levels of the index of newusers: newusers
newusers = newusers.swaplevel(0,1)

# Print newusers and verify that the index is not sorted
print(newusers)

# Sort the index of newusers: newusers
newusers = newusers.sort_index()

# Print newusers and verify that the index is now sorted
print(newusers)

# Verify that the new DataFrame is equal to the original
print(newusers.equals(users))

(               visitors signups
 city   weekday                 
 Austin Mon          326       3
        Sun          139       7
 Dallas Mon          456       5
        Sun          237      12,      city weekday visitors signups
 0  Austin     Mon      326       3
 1  Austin     Sun      139       7
 2  Dallas     Mon      456       5
 3  Dallas     Sun      237      12)

### Melt - "unpivot" - Gather columns into rows
The goal of melting is to restore a pivoted DataFrame to its original form, or to change it from a wide shape to a long shape. 
You can explicitly specify the columns that should remain in the reshaped DataFrame with id_vars, and list which columns to convert into values with value_vars.
If you don't pass a name to the values in pd.melt(), you will lose the name of your variable. You can fix this by using the value_name keyword argument.

In [None]:
# In [1]: visitors_by_city_weekday
# Out[1]: 
# city     Austin  Dallas
# weekday                
# Mon         326     456
# Sun         139     237

# Reset the index: visitors_by_city_weekday
visitors_by_city_weekday = visitors_by_city_weekday.reset_index() 

# Print visitors_by_city_weekday
print(visitors_by_city_weekday)

# Melt visitors_by_city_weekday: visitors
visitors = pd.melt(visitors_by_city_weekday, id_vars=['weekday'], value_name='visitors')

# Print visitors
print(visitors)

# You can move multiple columns into a single column (making the data long and skinny) by "melting" multiple columns. 
# In [3]: users
# Out[3]: 
#   weekday    city  visitors  signups
# 0     Sun  Austin       139        7
# 1     Sun  Dallas       237       12
# 2     Mon  Austin       326        3
# 3     Mon  Dallas       456        5
# Melt users: skinny
skinny = pd.melt(users, id_vars=['city', 'weekday'])

# Print skinny
print(skinny)



# # Obtaining key-value pairs with melt()
# Sometimes, all you need is some key-value pairs, and the context does not matter. 
# If said context is in the index, you can easily obtain what you want.

# Set the new index: users_idx
users_idx = users.set_index(['city', 'weekday'])

# Print the users_idx DataFrame
print(users_idx)

# Obtain the key-value pairs: kv_pairs
kv_pairs = pd.melt(users_idx, col_level=0)

# Print the key-value pairs
print(kv_pairs)


### Pivot table

In [None]:
# Create the DataFrame with the appropriate pivot table: by_city_day
by_city_day = users.pivot_table(index='weekday', columns='city')

# Print by_city_day
print(by_city_day)



# Use a pivot table to display the count of each column: count_by_weekday1
count_by_weekday1 = users.pivot_table(index='weekday', aggfunc='count')

# Print count_by_weekday
print(count_by_weekday1)

# Replace 'aggfunc='count'' with 'aggfunc=len': count_by_weekday2
count_by_weekday2 = users.pivot_table(index='weekday', aggfunc=len)

# Verify that the same result is obtained
print('==========================================')
print(count_by_weekday1.equals(count_by_weekday2))


# Create the DataFrame with the appropriate pivot table: signups_and_visitors
signups_and_visitors = users.pivot_table(index='weekday', aggfunc=sum)

# Print signups_and_visitors
print(signups_and_visitors)

# Add in the margins: signups_and_visitors_total 
signups_and_visitors_total = users.pivot_table(index='weekday', aggfunc=sum, margins=True)

# Print signups_and_visitors_total
print(signups_and_visitors_total)


## Grouping data 

## Bringing it all together 