In [1]:
import pandas as pd

## Creating DataFrame

In [60]:
# From a dict of columns (keys become the column names):
df = pd.DataFrame(
    {'id': [100, 101, 102], 'color': ['red', 'blue', 'red']},
    columns=['id', 'color'],
    index=['a', 'b', 'c'],
)
# From a list of rows; this second assignment replaces the frame built above:
df = pd.DataFrame(
    [[100, 'red'], [101, 'blue'], [102, 'red']],
    columns=['id', 'color'],
    index=['a', 'b', 'c'],
)

In [None]:
# Creating from a NumPy array (4 rows x 2 columns). The result is stored under
# its own name so the id/color `df` built earlier is still the frame the next
# cell concatenates with when the notebook is run top to bottom.
import numpy as np
arr = np.random.rand(4, 2)
df_arr = pd.DataFrame(arr, columns=['one', 'two'], index=['a', 'b', 'c', 'd'])

In [61]:
# A named Series can be attached as a new column; rows are matched on index
# label, so labels absent from the Series are left empty (NaN).
ser = pd.Series(['round', 'square'], index=['c', 'b'], name='shape')
pd.concat([df, ser], axis='columns', sort=False)

Unnamed: 0,id,color,shape
a,100,red,
b,101,blue,square
c,102,red,round


## Reading from file

In [None]:
# Read a tab-separated file:
df = pd.read_table('orders.tsv')
# Read selected columns only (usecols filters; names= would merely relabel columns):
df = pd.read_table('orders.tsv', usecols=['item_name', 'item_price'])
# Read a pipe-delimited file that has no header row:
df = pd.read_table('imdb_1000.csv', sep='|', header=None)
# Read a comma-separated file:
df = pd.read_csv('imdb_1000.csv')
# Read only the first 3 rows:
df = pd.read_table('orders.tsv', nrows=3)

## Slicing, Joining Series from DataFrame

In [None]:
# Two equivalent spellings for pulling a single column out of the frame:
df['column_name']   # bracket access
df.column_name      # attribute (dot) access

In [None]:
# Concatenate two columns element-wise, separated by ", ", into a new column.
# Creating a new column requires bracket notation on the left-hand side.
df['new_column'] = df['col_1'] + ", " + df['col_2']

## Common Attributes & Methods for DataFrame

In [None]:
df.head()       # first rows
df.tail()       # last rows
df.shape        # (n_rows, n_columns)
df.describe()   # summary statistics
df.dtypes       # data type of each column
df.columns      # column labels (only this last expression is displayed by the cell)

## Rename Columns

In [None]:
# Several ways to achieve this (rename returns a new frame; reassign to keep it):
df.rename(columns={'old_col_1': 'new_col_1', 'old_col_2': 'new_col'})
# A positional mapper needs axis=1; axis=0 would relabel the row index instead.
df.rename({'old_col_1': 'new_col_1', 'old_col_2': 'new_col'}, axis=1)
# Replace all column names at once:
df.columns = ['new_col_1', 'new_col_2']

# And can be done when reading from file (header=0 discards the file's own header row):
df = pd.read_csv('filename.csv', names=['col1', 'col2', 'col3'], header=0)

# Using str method to modify column names:
df.columns = df.columns.str.replace(' ', '_')

## Remove Columns, Rows

In [None]:
df.drop(columns='col_1', inplace=True)   # remove one column
df.drop(index=5, inplace=True)           # remove the row labelled 5
df.drop(index=['row_1', 'row_2'])        # remove rows by label (if such labels exist)

In [None]:
# Two alternative spellings for removing several columns at once:
df.drop(['col_1', 'col_2'], axis='columns', inplace=True)
df.drop(columns=['col_1', 'col_2'], inplace=True)

In [None]:
# Remove the first five rows by passing their index labels:
df.drop(index=df.index[:5], inplace=True)

### Remove duplicate rows

In [None]:
# Boolean mask flagging repeated values within one column:
df['col_1'].duplicated()
# Boolean mask flagging repeated rows:
df.duplicated()
# keep='first': the first occurrence is marked False, later repeats True:
df.duplicated(keep='first')

In [None]:
# Remove duplicate rows, keeping only the last of the duplicated occurrences:
df.drop_duplicates(keep='last', inplace=True)
# Only consider a subset of columns when deciding what counts as a duplicate:
df.drop_duplicates(subset=['col_1', 'col_2'], inplace=True)

### Drop non-numeric columns from DataFrame

In [None]:
# Keep only the numeric columns (returns a new frame):
df.select_dtypes(include=[np.number])

## Handling missing values

In [None]:
# Drop a row if any of its columns is NaN:
df.dropna(how='any')
# Drop a row only if all of its columns are NaN:
df.dropna(how='all')
# Drop a row if any of the specified columns is NaN:
df.dropna(subset=['col_1', 'col_2'], how='any')

In [None]:
# Fill missing values. Assign the result back to the column: calling
# fillna(inplace=True) on a column selection is chained assignment and is not
# guaranteed to modify df.
df['col_1'] = df['col_1'].fillna(value='Other')

#### Overriding values with nan

In [None]:
import numpy as np
# Build a boolean mask of the placeholder rows, then use .loc so the NaN is
# written back into the same column of df.
is_placeholder = df['col_1'] == 'N/A'
df.loc[is_placeholder, 'col_1'] = np.nan

## Change data type

In [None]:
# Convert one column to float and store the result in a column of the DataFrame:
df['col_4'] = df.col_1.astype(float)
# Set the data type while reading a file into a DataFrame:
df = pd.read_csv('some_file', dtype={'col_1': float})
# Change the data type of multiple columns at once:
df = df.astype({'col_1': 'float', 'col_2': 'float'})

In [None]:
# Working with currency: strip the literal '$' before converting.
# regex=False is required because '$' as a regular expression is the
# end-of-string anchor and would remove nothing.
df.col_4.str.replace('$', '', regex=False).astype(float)

In [None]:
# Encode "contains the word" as 1 / 0 so the result can be used in computations:
df['col_2'].str.contains('some_word').astype(int)

### DataFrame optimization

In [None]:
# Inspect memory consumption:
df.info(memory_usage='deep')   # overall usage
df.memory_usage(deep=True)     # usage per column

#### Storing a column of string value type as category

In [None]:
# Convert an existing string column to the category dtype:
df['col_1'] = df['col_1'].astype('category')
# ...or request the category dtype while reading the file:
df = pd.read_csv('filename.csv', dtype={'col_1': 'category'})

#### Logical order of string type

In [7]:
# Example frame whose 'Quality' column holds plain strings:
df = pd.DataFrame({
    'ID': [100, 101, 102, 103],
    'Quality': ['good', 'very good', 'good', 'excellent'],
})

In [9]:
# Declare the logical order of the 'Quality' labels (lowest first) and cast the
# column to that ordered categorical type.
from pandas.api.types import CategoricalDtype
quality_cat = CategoricalDtype(categories=['good', 'very good', 'excellent'], ordered=True)
df['Quality'] = df['Quality'].astype(quality_cat)

In [10]:
# With an ordered category, the column supports both sorting and comparisons:
display(
    df.sort_values('Quality'),
    df.loc[df['Quality'] > 'good', :],
)

Unnamed: 0,ID,Quality
0,100,good
2,102,good
1,101,very good
3,103,excellent


Unnamed: 0,ID,Quality
1,101,very good
3,103,excellent


### Value mapping / dummy variables

In [None]:
# Given a column whose values are either 'male' or 'female', derive a numeric
# column: 1 for 'male', 0 for 'female'.
df['sex_male'] = df.sex.map({'female': 0, 'male': 1})

In [None]:
# Alternatively, build a table with one indicator column per distinct value
# of the original column:
pd.get_dummies(df['sex'])

## Apply a function to Series or DataFrame

In [None]:
# Run the built-in len on every value of a column; keep the results in a new column:
df['lengths'] = df['col_1'].apply(len)
# Same pattern with NumPy's ceiling function:
df['ceilings'] = df['col_1'].apply(np.ceil)

In [None]:
# Names are stored as 'lastname, firstnames': split on the comma and keep the first piece.
df['lastname'] = df['names'].str.split(',').apply(lambda parts: parts[0])

In [None]:
# Column-wise maximum over a label range of columns:
df.loc[:, 'col_3':'col_5'].apply(max, axis='index')
# Column-wise location of the maximum:
df.loc[:, 'col_3':'col_5'].apply(np.argmax, axis='index')

#### agg functions

In [None]:
# agg accepts a list of aggregations and works on both a Series and a DataFrame:
df['col_1'].agg(['mean', 'max', 'min'])
df.agg(['mean', 'max', 'min'])

#### Apply a function to each element

In [None]:
# Call float on every individual element of the selected columns:
df.loc[:, 'col_3':'col_5'].applymap(float)

## Sorting

#### Sort values in a column as a Series

In [None]:
df.col_1.sort_values()   # returns the column's values in sorted order

#### Sort whole DataFrame by values in one column

In [None]:
df.sort_values(by='col_1')   # rows reordered by the values in col_1

#### Sort whole DataFrame by multiple columns

In [None]:
df.sort_values(by=['col_1', 'col_2'])   # sort by col_1, then by col_2

## Filtering

#### Filter DataFrame by value in one column

In [None]:
# A boolean Series inside the brackets keeps only the rows where it is True.
df[df.col_1 > 200]                         # condition written with dot notation
df[df['col_1'] > 200]                      # condition written with bracket notation
df[df['col_2'].str.contains('some_word')]  # condition built with a str method

#### Selecting a column after applying the filter

In [None]:
# Three ways to pick one column from the filtered rows:
df[df['col_1'] >= 200]['col_2']       # filter, then bracket notation
df[df['col_1'] >= 200].col_2          # filter, then dot notation
df.loc[df['col_1'] >= 200, 'col_2']   # best practice: rows and column in a single .loc

#### Multiple filters

In [None]:
# Wrap each condition in parentheses and combine with & / | (not and / or).
df[(df['col_1'] == 200) & (df['col_2'] < 100)]   # both conditions must hold
df[(df['col_1'] > 200) | (df['col_2'] < 100)]    # either condition may hold
df[df['col_3'].isin(['value1', 'value2'])]       # several accepted values for one column

## Date time in Pandas

In [None]:
# Parse the column into pandas' datetime64 type, replacing the original values:
df['date_time'] = pd.to_datetime(df.date_time)

In [None]:
# A datetime column can be compared against a timestamp and used as a filter:
ts = pd.to_datetime('1/1/1999')
df.loc[df['date_time'] >= ts, :]

In [None]:
# Span between the latest and earliest timestamp, in whole days.
# Uses df, the frame whose date_time column was converted in the cells above.
(df.date_time.max() - df.date_time.min()).days

#### Creating datetime from columns with specific names that Pandas recognizes

In [53]:
# Example frame with one integer column per date/time component:
df = pd.DataFrame(
    [[12, 25, 2017, 10], [1, 15, 2018, 11]],
    columns=['month', 'day', 'year', 'hour'],
)
df.dtypes

month    int64
day      int64
year     int64
hour     int64
dtype: object

In [54]:
# Passing the whole frame lets pandas assemble a datetime64 column from the
# recognised component columns.
df['date_time'] = pd.to_datetime(df)

In [55]:
# Alternatively, assemble the date from selected component columns and use it as the index:
df.index = pd.to_datetime(df.loc[:, ['month', 'day', 'year']])

## Indexing

In [None]:
# .loc takes [row labels, column labels]; label slices include both endpoints.
df.loc[5, 'col_2']             # one cell: row label 5, column 'col_2'
df.loc[0, :]                   # the row labelled 0 (first row of a default index)
df.loc[0:2, :]                 # rows labelled 0 through 2 (first 3 rows of a default index)
df.loc[:, 'col_1']             # a single column by name
df[['col_1', 'col_3']]         # several columns, bracket form
df.loc[:, ['col_1', 'col_3']]  # several columns, .loc form
df.loc[:, 'col_1':'col_3']     # a contiguous range of columns

In [None]:
# Promote a column to the row index, then address cells by row and column name:
df.set_index(keys='col_1', inplace=True)
df.loc['a_value_in_col_1', 'col_2']
# Move the index back into an ordinary column (named after the index):
df.index.name = 'col_1'
df.reset_index(inplace=True)

In [None]:
# Explicitly creating a copy instead of a view.
# copy must be called: without the parentheses new_df would be bound to the
# method object rather than to the copied data.
new_df = df.loc[0, :].copy()

#### Series with matching index can be multiplied, and result will be properly aligned

In [5]:
# Two Series indexed by province code; 'SK' appears only in the second one.
population = pd.Series(
    [3000, 5000],
    index=['AB', 'BC'],
    name='population',
)
income_per_cap = pd.Series(
    [500, 500, 500],
    index=['AB', 'BC', 'SK'],
    name='income_per_cap',
)
# Multiplication pairs values by index label; unmatched labels produce NaN.
total_income = population * income_per_cap
total_income

AB    1500000.0
BC    2500000.0
SK          NaN
dtype: float64

## Merge/ Concat DataFrame

In [None]:
# Place a Series next to a DataFrame as an extra column; rows are aligned on the index:
pd.concat([df_1, ser_1], axis='columns')

In [None]:
# Join two frames on their row indexes:
pd.merge(left=df_1, right=df_2, left_index=True, right_index=True)
# The `how` parameter controls which rows survive:
#   how='inner' (default)  keep only rows present in both frames
#   how='left' / 'right'   keep the index of the left / right frame
#   how='outer'            keep all rows from either frame

## Append

In [None]:
# For generating data row by row and appending to the bottom.
# DataFrame.append was removed in pandas 2.0, so wrap the new row in a one-row
# frame and concatenate; ignore_index=True renumbers the rows as append did.
df = pd.concat(
    [df, pd.DataFrame([{'col_1': val_1, 'col_2': val_2, 'col_3': val_3}])],
    ignore_index=True,
)

## Splitting DataFrame (opposite selection using ~)

In [None]:
# Draw a reproducible 75% random sample for training...
df_train = df.sample(frac=0.75, random_state=99)
# ...and take every row whose index label is NOT in the sample as the test set.
in_train = df.index.isin(df_train.index)
df_test = df.loc[~in_train, :]

## Iterating through DataFrame

In [None]:
# iterrows yields (index label, row) pairs, one per row of the frame.
for idx, record in df.iterrows():
    print(idx, record['col_1'], record['col_2'])

## groupby method

#### Categorizing data by one column, and looking at values from another column summarized by some function:

In [None]:
# Group the rows by col_3 and calculate the mean of col_2 within each group:
df.groupby('col_3')['col_2'].mean()

In [None]:
# Request several aggregations at once; each one becomes a column of the result:
df.groupby('col_3')['col_2'].agg(['count', 'min', 'max', 'mean'])

In [None]:
# Grouping by more than one column produces a multi-indexed result:
df.groupby(['col_1', 'col_3'])['col_2'].mean()

## Value occurrences in a column

In [None]:
# Distinct values found in the column:
df['col_1'].unique()

In [None]:
# Distinct values together with the number of times each occurs:
df['col_1'].value_counts()

In [None]:
# Distinct values together with their relative frequency:
df['col_1'].value_counts(normalize=True)

In [None]:
# Combining low frequency values by overwriting them with 'other'.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
freq_series = df['col'].value_counts()
# True for values that make up less than 1% of all rows:
mask_freq = freq_series / freq_series.sum() * 100 < 1
# Rows holding one of those rare values become 'other'; the rest are unchanged.
df['col'] = np.where(df['col'].isin(freq_series[mask_freq].index), 'other', df['col'])

#### crosstab method by default uses 'count' aggregation:

In [None]:
# Count table: values of col_1 form the row index, values of col_2 the column headers.
pd.crosstab(index=df.col_1, columns=df.col_2)

## Multi-index Series

#### groupby method

In [67]:
stocks = pd.read_csv('stocks.csv')
# Grouping by more than one key yields a Series carrying a MultiIndex:
ser = stocks.groupby(['Symbol', 'Date'])['Close'].mean()
ser.index

MultiIndex(levels=[['AAPL', 'CSCO', 'MSFT'], ['2016-10-03', '2016-10-04', '2016-10-05']],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
           names=['Symbol', 'Date'])

In [69]:
# Unstacking the two-level Series returns a DataFrame: the second (last)
# index level becomes the column headers.
df = ser.unstack(level=-1)
df

Date,2016-10-03,2016-10-04,2016-10-05
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,112.52,113.0,113.05
CSCO,31.5,31.35,31.59
MSFT,57.42,57.24,57.64


#### Alternatively, using pivot_table method

In [70]:
# Same table in one step: Symbol down the rows, Date across the columns, Close as the cell values.
stocks.pivot_table(index='Symbol', columns='Date', values='Close')

Date,2016-10-03,2016-10-04,2016-10-05
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,112.52,113.0,113.05
CSCO,31.5,31.35,31.59
MSFT,57.42,57.24,57.64


#### A Series with a multi-index behaves like a 2-dimensional DataFrame

In [None]:
ser.loc['AAPL']                 # every date for one symbol
ser.loc['AAPL', '2016-10-03']   # one symbol on one date
ser.loc[:, '2016-10-03']        # every symbol on one date

## Multi-index DataFrame

In [71]:
# Use both Symbol and Date as the row index (modifies stocks in place), then show it:
stocks.set_index(keys=['Symbol', 'Date'], inplace=True)
stocks

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,Volume
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1
CSCO,2016-10-03,31.5,14070500
AAPL,2016-10-03,112.52,21701800
MSFT,2016-10-03,57.42,19189500
AAPL,2016-10-04,113.0,29736800
MSFT,2016-10-04,57.24,20085900
CSCO,2016-10-04,31.35,18460400
MSFT,2016-10-05,57.64,16726400
CSCO,2016-10-05,31.59,11808600
AAPL,2016-10-05,113.05,21453100


#### Pass the indexes as a tuple using loc

In [None]:
# The row selector is a tuple with one entry per index level.
stocks.loc[('AAPL', '2016-10-03'), :]                   # one row, all columns
stocks.loc[('AAPL', '2016-10-03'), 'Close']             # one cell
stocks.loc[(['AAPL', 'MSFT'], '2016-10-03'), 'Close']   # several symbols on one date
stocks.loc[(slice(None), '2016-10-03'), 'Volume']       # every symbol on one date
# Note: slice(None) stands in for ':' when the first index level is not being restricted.

In [None]:
stocks.reset_index()   # index levels become ordinary columns again (original shape)

## Notebook display options for Pandas

In [None]:
pd.reset_option('all')   # restore every option to its default value

In [59]:
# Number of rows shown: None removes the limit; reset_option restores the default.
pd.set_option('display.max_rows', None)
pd.reset_option('display.max_rows')

# Maximum characters shown per cell, followed by a reset to the default.
pd.set_option('display.max_colwidth', 1000)
pd.reset_option('display.max_colwidth')

In [None]:
# Decimal places shown for floats, followed by a reset to the default.
pd.set_option('display.precision', 2)
pd.reset_option('display.precision')

In [None]:
pd.describe_option('rows')   # print the documentation of every option whose name matches 'rows'