## Pandas Cheat Sheet

In [2]:
# Import Pandas Library
# df for any Pandas DataFrame
# s for any Pandas Series

import pandas as pd
import numpy as np


### Importing Data

pd.read_csv(filename) #From a CSV file

pd.read_table(filename) #From a delimited text file (like TSV)

pd.read_excel(filename) #From an Excel file (like xlsx)

pd.read_sql(query, connection_objects) #Read from a SQL table/database

pd.read_json(json_string) #Read from a JSON formatted string, URL or file

pd.read_html(url) #Parses an html URL, string or file and extracts tables to a list of dataframes

pd.read_clipboard() #Takes the contents of your clipboard and passes it to read_csv( )

pd.DataFrame(dict) #From a dict, keys for column names, values for data as lists

### Create Test Objects 

pd.DataFrame(np.random.rand(20,5)) #5 columns and 20 rows of random floats

In [3]:
pd.DataFrame(np.random.rand(20,5))


Unnamed: 0,0,1,2,3,4
0,0.688901,0.723097,0.138076,0.639031,0.504366
1,0.914582,0.717245,0.476866,0.751797,0.354451
2,0.998092,0.464517,0.08503,0.807214,0.392708
3,0.691081,0.519494,0.780474,0.377801,0.428537
4,0.99776,0.806244,0.954406,0.727881,0.826605
5,0.543553,0.593414,0.301235,0.264057,0.255596
6,0.051636,0.282455,0.443836,0.038653,0.938151
7,0.773525,0.889427,0.971686,0.101659,0.950191
8,0.624968,0.58012,0.831913,0.784175,0.009376
9,0.278846,0.732181,0.395092,0.169875,0.279986


pd.Series(my_list) #Create a series from an iterable my_list

df.index = pd.date_range('1900/1/30', periods=df.shape[0]) #Add a date index

### Viewing/Inspecting Data

    df.head(n) #First n rows of the DataFrame
    df.tail(n) #Last n rows of the DataFrame
    df.shape #Number of rows and columns, as a (rows, columns) tuple; an attribute, not a method
    df.info() #Index, Datatype and Memory Information
    df.describe() #Summary statistics for numerical columns
    s.value_counts(dropna=False) #View unique values and counts
    df.apply(pd.Series.value_counts) #Unique values and counts for all columns

### Selection

    df[col] #Returns column with label col as Series
    df[[col1, col2]] #Returns columns as a new DataFrame
    s.iloc[0] #Selection by position
    s.loc['index_one'] #Selection by index
    df.iloc[0,:] #First row
    df.iloc[0,0] #First element of first column

### Data Cleaning

    df.columns = ['a','b','c'] #Rename columns 
    pd.isnull(obj) #Checks obj (scalar, Series or DataFrame) for null values, returns Boolean array
    pd.notnull(obj) #Opposite of pd.isnull(obj)
    df.dropna( ) #Drop all rows that contain null or NaN values
    df.dropna(axis=1) #Drop all columns that contain null values
    df.dropna(axis=1, thresh=n) #Drop all columns that have fewer than n non-null values
    df.fillna(x) #Replace all null values with x; can select mean, median etc.
    s.fillna(s.mean( )) #Replace all null values with the mean 
    s.astype(float) #Convert datatype of the series to a float
    s.replace(1,'one') #Replace all values equal to 1 with 'one'
    s.replace([1,3],['one', 'three']) #Replace all 1 with 'one' and all 3 with 'three'
    df.rename(columns=lambda x: x + 1) #Mass renaming of columns
    df.rename(columns={'old_name': 'new_name'}) #Selective renaming
    df.set_index('column_one') #Change the index
    df.rename(index=lambda x: x + 1) #Mass renaming of index


### Filter, Sort, and Groupby

    df[df[col] > 0.5] #Rows where the column col is greater than 0.5
    df[(df[col] > 0.5) & (df[col] < 0.7)] #Rows where 0.7 > col > 0.5
    df.sort_values(col1) #Sort values by col1 in ascending order
    df.sort_values(col1, ascending=False) #Sort values by col1 in descending order
    df.sort_values([col1,col2], ascending=[True,False]) #Sort values by col1 in ascending order then col2 in descending order
    df.groupby(col) #Returns a groupby object for values from one column
    df.groupby([col1,col2]) #Returns a groupby object for values from multiple columns
    df.groupby(col1)[col2].mean() #Returns the mean of the values for col2, grouped by the values in col1 *Can use other statistics
    df.pivot_table(index=col1,values=[col2,col3], aggfunc='mean') #Create a pivot table that groups by col1 and calculates the mean of col2 and col3
    df.groupby(col1).agg(np.mean) #Find the mean across all columns for every unique col1 group
    df.apply(np.max,axis=1) #Apply the function np.max( ) across each row

### Join/Combine

    pd.concat([df1,df2]) #Adds the rows in df2 to the end of df1 (columns should be identical); replaces df1.append(df2), which was removed in pandas 2.0
    pd.concat([df1,df2],axis=1) #Adds the columns in df2 to the end of df1 (rows should be identical)
    df1.join(df2,on=col1,how='inner') #SQL style join of the columns in df1 with the columns in df2, matching the values of df1's col1 against the index of df2
    #how can be one of 'left','right','outer','inner'


### Statistics

df.describe( ) #Summary statistics for numerical columns


df.mean( ) #Returns the mean of all columns

df.corr( ) #Returns the correlation between columns in DataFrame

df.count( ) #Returns the number of non-null values in each DataFrame column

df.max( ) #Returns the highest value in each column

df.min( ) #Returns the lowest value in each column

df.median( ) #Returns the median of each column

df.std( ) #Returns the standard deviation of each column

df.var( ) #Returns the variance of each column