# Useful Methods


<a id='apply_method'></a>

## The .apply() method



In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('tips.csv')

In [None]:
df.head()

<a id='apply_function'></a>
### apply with a function

In [None]:
df.info()

In [5]:
def last_four(num):
    """Return the last four characters of ``num`` once converted to a string.

    If the string form has fewer than four characters, all of it is returned.
    """
    as_text = str(num)
    return as_text[-4:]

In [None]:
df['CC Number'][0]

In [None]:
last_four(3560325168603410)

In [8]:
df['last_four'] = df['CC Number'].apply(last_four)

In [None]:
df.head()

### Using .apply() with more complex functions

In [None]:
df['total_bill'].mean()

In [11]:
def yelp(price):
    """Translate a bill amount into a Yelp-style price tier.

    Returns '$' when price is below 10, '$$' when it is at least 10 but
    below 30, and '$$$' in every other case.
    """
    if price < 10:
        return '$'
    if 10 <= price < 30:
        return '$$'
    return '$$$'

In [12]:
df['Expensive'] = df['total_bill'].apply(yelp)

In [13]:
# df

<a id='apply_lambda'></a>
### apply with lambda

In [14]:
def simple(num):
    """Return ``num`` multiplied by two."""
    doubled = num * 2
    return doubled

In [None]:
lambda num: num*2

In [None]:
df['total_bill'].apply(lambda bill:bill*0.18)

<a id='apply_multiple'></a>
## apply that uses multiple columns


In [None]:
df.head()

In [1]:
def quality(total_bill,tip):
    """Label a tip "Generous" when it is more than 25% of the bill, else "Other"."""
    tip_fraction = tip / total_bill
    return "Generous" if tip_fraction > 0.25 else "Other"

In [19]:
# axis=1 hands the lambda one row at a time; the parameter is called `row`
# so it is not confused with the DataFrame `df` itself.
df['Tip Quality'] = df[['total_bill', 'tip']].apply(
    lambda row: quality(row['total_bill'], row['tip']),
    axis=1,
)

In [None]:
df.head()

In [21]:
import numpy as np

In [22]:
df['Tip Quality'] = np.vectorize(quality)(df['total_bill'], df['tip'])

In [None]:
df.head()

So, which one is faster?

In [24]:
import timeit 
  
# Setup code handed to timeit: it is executed once, before timing starts, and
# is not part of the measurement. It imports numpy/pandas, loads tips.csv and
# defines quality() so the timed statements below have everything they need.
setup = '''
import numpy as np
import pandas as pd
df = pd.read_csv('tips.csv')
def quality(total_bill,tip):
    if tip/total_bill  > 0.25:
        return "Generous"
    else:
        return "Other"
'''

# First statement to be timed: row-wise DataFrame.apply(axis=1) with a lambda.
stmt_one = ''' 
df['Tip Quality'] = df[['total_bill','tip']].apply(lambda df: quality(df['total_bill'],df['tip']),axis=1)
'''

# Second statement to be timed: the same labelling done through np.vectorize(quality).
stmt_two = '''
df['Tip Quality'] = np.vectorize(quality)(df['total_bill'], df['tip'])
'''
  

In [None]:
timeit.timeit(setup = setup, 
                    stmt = stmt_one, 
                    number = 1000) 

In [None]:
timeit.timeit(setup = setup, 
                    stmt = stmt_two, 
                    number = 1000) 

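Wow! `np.vectorize()` is much faster than a row-wise `.apply()` here! Keep **np.vectorize()** in mind for the future. Note, however, that `np.vectorize()` is a convenience wrapper that still loops in Python; when the logic can be written with native array operations, e.g. `np.where(df['tip'] / df['total_bill'] > 0.25, 'Generous', 'Other')`, that will be faster still.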


<a id='describe'></a>
### df.describe for statistical summaries

In [None]:
df.describe()

In [None]:
df.describe().transpose()

<a id='sort'></a>
### sort_values()

In [None]:
df.sort_values('tip')

In [None]:
# Sort by 'tip' first; rows with equal tips are then ordered by 'size'.
# Helpful if you want to reorder the columns after a sort:
# https://stackoverflow.com/questions/13148429/how-to-change-the-order-of-dataframe-columns
df.sort_values(['tip','size'])


## df.corr() for correlation checks

In [None]:
# Correlation is only defined for numeric data. Recent pandas versions no longer
# drop text columns silently and raise an error instead, so restrict the frame
# to its numeric columns before calling corr().
df.select_dtypes(include='number').corr()

In [None]:
df[['total_bill','tip']].corr()

<a id='idx'></a>
### idxmin and idxmax

In [None]:
df.head()

In [None]:
df['total_bill'].max()

In [None]:
df['total_bill'].idxmax()

In [None]:
df['total_bill'].idxmin()

In [None]:
# idxmin() returns an index *label*, so fetch the row by label with .loc and
# compute the label instead of hard-coding it
# (assumes 67 was the idxmin() result shown above; confirm against the data).
df.loc[df['total_bill'].idxmin()]

In [None]:
# idxmax() returns an index *label*, so fetch the row by label with .loc and
# compute the label instead of hard-coding it
# (assumes 170 was the idxmax() result shown above; confirm against the data).
df.loc[df['total_bill'].idxmax()]

<a id='v_c'></a>
### value_counts

In [None]:
df.head()

In [None]:
df['sex'].value_counts()

<a id='replace'></a>

### replace

Quickly replace one value with another.

In [None]:
df.head()

In [None]:
df['Tip Quality'].replace(to_replace='Other',value='Ok')

In [41]:
df['Tip Quality'] = df['Tip Quality'].replace(to_replace='Other',value='Ok')

In [None]:
df.head()

<a id='uni'></a>
### unique

In [None]:
df['size'].unique()

In [None]:
df['size'].nunique()

In [None]:
df['time'].unique()

<a id='map'></a>
### map

In [45]:
my_map = {'Dinner':'D','Lunch':'L'}

In [None]:
df['time'].map(my_map)

In [None]:
df.head()

<a id='dup'></a>
## Duplicates

### .duplicated() and .drop_duplicates()

In [None]:
# Returns True for every repeat of a row except its first occurrence (keep='first' is the default)
df.duplicated()

In [51]:
# Tiny frame with a repeated value (2) to demonstrate duplicate handling.
simple_df = pd.DataFrame(data=[1, 2, 2], index=['a', 'b', 'c'])

In [None]:
simple_df

In [None]:
simple_df.duplicated()

In [None]:
simple_df.drop_duplicates()

<a id='bet'></a>
## between

In [None]:
# Both endpoints are included by default. The boolean form inclusive=True was
# deprecated in pandas 1.3 and is rejected by pandas 2.x; pass 'both',
# 'neither', 'left' or 'right' when an explicit setting is needed.
df['total_bill'].between(10,20)

In [None]:
# Keep only rows whose total_bill lies in [10, 20]. Both endpoints are included
# by default; the boolean inclusive=True form is rejected by pandas 2.x.
df[df['total_bill'].between(10,20)]

<a id='sample'></a>
## sample

In [None]:
df.sample(5)

<a id='n'></a>
## nlargest and nsmallest

In [None]:
df.nlargest(10,'tip')

----