## Extracting and transforming data 

In [2]:
import pandas as pd
# Slicing a DataFrame by label can hand back a view of df2 rather than an
# independent copy. Whether a later write to an existing column of the slice
# shows up in df2 depends on the pandas version (with copy-on-write enabled it
# never does). A brand-new column added to the slice is never added to df2.
df2 = pd.DataFrame(
    [['1980', 'Blondie', 'Call Me', '6'],
     ['1981', 'Chistorpher Cross', 'Arthurs Theme', '3'],
     ['1982', 'Joan Jett', 'I Love Rock and Roll', '7']],
    columns=['year', 'artist', 'song', 'chart weeks'],
)
# For a slice that is guaranteed to be independent of df2, copy it explicitly:
# df3 = df2.loc[:2, 'artist':'chart weeks'].copy()
# .ix was deprecated in pandas 0.20 and removed in 1.0; .loc is the label-based
# replacement. Label slices include both endpoints, so :2 keeps rows 0, 1 and 2.
df3 = df2.loc[:2, 'artist':'chart weeks']
# 'chart weeks' holds strings, so + '2' appends a character ('6' -> '62').
df3['chart weeks'] = df3['chart weeks'] + '2'
df3['test'] = 10
# print() calls instead of Python 2 print statements, matching the other cells.
print(df3)
print(df2)

              artist                  song chart weeks  test
0            Blondie               Call Me          62    10
1  Chistorpher Cross         Arthurs Theme          32    10
2          Joan Jett  I Love Rock and Roll          72    10
   year             artist                  song chart weeks
0  1980            Blondie               Call Me          62
1  1981  Chistorpher Cross         Arthurs Theme          32
2  1982          Joan Jett  I Love Rock and Roll          72


In [3]:
# Assigning through a positional slice writes straight into the Series:
# positions 1, 2 and 3 are overwritten, the first and last entries are kept.
a = pd.Series(data=[0, 2, 3, 4, 5])
a.iloc[1:4] = 9
a

0    0
1    9
2    9
3    9
4    5
dtype: int64

In [None]:
# Boolean mask flagging the rows whose 'turnout' value is above 70: high_turnout
# (election is expected to be a DataFrame loaded earlier -- not shown in this cell)
high_turnout = election['turnout'] > 70

# Keep only the rows the mask selects: high_turnout_df
high_turnout_df = election.loc[high_turnout]

# Show the filtered rows
print(high_turnout_df)

# Import numpy
import numpy as np

# Create the boolean array: too_close
too_close = election['margin'] < 1

# Assign np.nan to the 'winner' column where the results were too close to call.
# One .loc call replaces the chained form election['winner'][too_close] = ...:
# chained indexing assigns to an intermediate object, which triggers
# SettingWithCopyWarning and, with copy-on-write, never reaches election.
election.loc[too_close, 'winner'] = np.nan

# .info() prints its summary itself and returns None, so it is called directly;
# wrapping it in print() would emit an extra "None" line.
election.info()

# Select the 'age' and 'cabin' columns: df
df = titanic[['age', 'cabin']]

# Print the shape of df
print(df.shape)

# Drop rows with a missing value in either column (how='any') and print the shape
print(df.dropna(how='any').shape)

# Drop rows only when both columns are missing (how='all') and print the shape
print(df.dropna(how='all').shape)

# Drop the columns of titanic that have fewer than 1000 non-missing values and
# summarise what is left. .info() prints its own report and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
titanic.dropna(thresh=1000, axis='columns').info()

In [None]:
# Write a function to convert degrees Fahrenheit to degrees Celsius: to_celsius
def to_celsius(F):
    """Convert a temperature from degrees Fahrenheit to degrees Celsius.

    Args:
        F: Temperature in degrees Fahrenheit. Any object that supports
            subtraction, multiplication and division by numbers works, e.g.
            a plain number, a NumPy array, or a pandas Series/DataFrame.

    Returns:
        The temperature(s) in degrees Celsius, with the same kind of
        container as ``F``.
    """
    # Float literals keep this true division on Python 2 as well, where the
    # integer expression 5/9 evaluates to 0 and would zero out every result.
    return (F - 32) * 5.0 / 9.0

# Run to_celsius over the two Fahrenheit columns of weather: df_celsius
# (weather is expected to be a DataFrame loaded earlier -- not shown in this cell)
df_celsius = weather.loc[:, ['Mean TemperatureF', 'Mean Dew PointF']].apply(to_celsius)

# Relabel the converted columns so the names carry the new unit
df_celsius.columns = ['Mean TemperatureC', 'Mean Dew PointC']

# Show the first rows of the converted frame
print(df_celsius.head())

# Series.map() with a dictionary replaces each value by its dictionary look-up.
# Look-up table from winning candidate to colour: red_vs_blue
red_vs_blue = dict(Obama='blue', Romney='red')

# Translate the 'winner' column through the table into a new 'color' column
election['color'] = election['winner'].map(red_vs_blue)

# Show the first rows of election, now including 'color'
print(election.head())



# Performance note: .apply() and .map() run a Python-level loop over the values
# held in a pandas Series or DataFrame, so avoid them when speed matters.
# Vectorized functions do the same work at the speed of compiled code (C,
# Fortran, etc.). NumPy, SciPy and pandas ship many of them; NumPy calls them
# Universal Functions (UFuncs).
# zscore from scipy.stats is one such vectorized function.
from scipy.stats import zscore

# Standardize the 'turnout' column in a single vectorized call: turnout_zscore
turnout_zscore = zscore(election['turnout'])

# Inspect what kind of object zscore handed back
print(type(turnout_zscore))

# Store the standardized values on election as the 'turnout_zscore' column
election['turnout_zscore'] = turnout_zscore

# Show the first rows of election, now including 'turnout_zscore'
print(election.head())



## Advanced indexing

In [None]:
# A pandas Index is immutable: its entries cannot be edited one at a time, so
# changing the labels of a DataFrame means assigning a complete new index.

# Upper-case version of every existing label of sales: new_idx
new_idx = [label.upper() for label in sales.index]

# Swap the whole index of sales for the new labels
sales.index = new_idx

# Show sales with its relabelled index
print(sales)

## Rearranging and reshaping data 

## Grouping data 

## Bringing it all together 