# Data Wrangling or Data Munging

Here we will concentrate on the following sub-sections of this methodology:

1> Data collection: To understand different data retrieval mechanisms for 
                    different data types. -> in very brief

2> Data description: To understand various attributes and properties of the
                     data collected. -> in very brief

3> Data wrangling: To prepare data for consumption in the modeling steps.

4> Data visualization: To visualize different attributes for sharing results, better understanding, and so on.  -> "Covered through matplotlib"

In [107]:
# import required libraries
import random
import datetime 
import numpy as np
import pandas as pd
from random import randrange
from sklearn import preprocessing

from IPython.display import display

pd.options.mode.chained_assignment = None
import warnings; warnings.simplefilter('ignore')  # to suppress warnings

## Utilities

In [108]:
def _random_date(start,date_count):
    """This function generates a random date based on params
    Args:
        start (date object): the base date
        date_count (int): number of dates to be generated
    Returns:
        list of random dates

    """
    current = start
    while date_count > 0:
        curr = current + datetime.timedelta(days=randrange(42))
        yield curr
        date_count-=1


def generate_sample_data(row_count=100):
    """This function generates a random transaction dataset
    Args:
        row_count (int): number of rows for the dataframe
    Returns:
        a pandas dataframe

    """
    
    # sentinels
    startDate = datetime.datetime(2016, 1, 1,13)
    serial_number_sentinel = 1000
    user_id_sentinel = 5001
    product_id_sentinel = 101
    price_sentinel = 2000
    
    
    # base list of attributes
    data_dict = {
    'Serial No': np.arange(row_count)+serial_number_sentinel,
    'Date': np.random.permutation(pd.to_datetime([x.strftime("%d-%m-%Y") 
                                                    for x in _random_date(startDate,
                                                                          row_count)]).date
                                  ),
    'User ID': np.random.permutation(np.random.randint(0,
                                                       row_count,
                                                       size=int(row_count/10)) + user_id_sentinel).tolist()*10,
    'Product ID': np.random.permutation(np.random.randint(0,
                                                          row_count,
                                                          size=int(row_count/10))+ product_id_sentinel).tolist()*10 ,
    'Quantity Purchased': np.random.permutation(np.random.randint(1,
                                                                  42,
                                                                  size=row_count)),
    'Price': np.round(np.abs(np.random.randn(row_count)+1)*price_sentinel,
                      decimals=2),
    'User Type':np.random.permutation([chr(random.randrange(97, 97 + 3 + 1)) 
                                            for i in range(row_count)])
    }
    
    # introduce missing values
    for index in range(int(np.sqrt(row_count))): 
        data_dict['Price'][np.argmax(data_dict['Price'] == random.choice(data_dict['Price']))] = np.nan
        data_dict['User Type'][np.argmax(data_dict['User Type'] == random.choice(data_dict['User Type']))] = np.nan
        data_dict['Date'][np.argmax(data_dict['Date'] == random.choice(data_dict['Date']))] = np.nan
        data_dict['Product ID'][np.argmax(data_dict['Product ID'] == random.choice(data_dict['Product ID']))] = 0
        data_dict['Serial No'][np.argmax(data_dict['Serial No'] == random.choice(data_dict['Serial No']))] = -1
        data_dict['User ID'][np.argmax(data_dict['User ID'] == random.choice(data_dict['User ID']))] = -101
        
    
    # create data frame
    df = pd.DataFrame(data_dict)
    
    return df
    

def describe_dataframe(df=pd.DataFrame()):
    """This function generates descriptive stats of a dataframe
    Args:
        df (dataframe): the dataframe to be analyzed
    Returns:
        None

    """
    print("\n\n")
    print("*"*30)
    print("About the Data")
    print("*"*30)
    
    print("Number of rows::",df.shape[0])
    print("Number of columns::",df.shape[1])
    print("\n")
    
    print("Column Names::",df.columns.values.tolist())
    print("\n")
    
    print("Column Data Types::\n",df.dtypes)
    print("\n")
    
    print("Columns with Missing Values::",df.columns[df.isnull().any()].tolist())
    print("\n")
    
    print("Number of rows with Missing Values::",len(pd.isnull(df).any(1).nonzero()[0].tolist()))
    print("\n")
    
    print("Sample Indices with missing data::",pd.isnull(df).any(1).nonzero()[0].tolist()[0:5])
    print("\n")
    
    print("General Stats::")
    print(df.info())
    print("\n")
    
    print("Summary Stats::")
    print(df.describe())
    print("\n")
    
    print("Dataframe Sample Rows::")
    display(df.head(5))
    
def cleanup_column_names(df,rename_dict={},do_inplace=True):
    """This function renames columns of a pandas dataframe
       It converts column names to snake case if rename_dict is not passed. 
    Args:
        rename_dict (dict): keys represent old column names and values point to 
                            newer ones
        do_inplace (bool): flag to update existing dataframe or return a new one
    Returns:
        pandas dataframe if do_inplace is set to False, None otherwise

    """
    if not rename_dict:
        return df.rename(columns={col: col.lower().replace(' ','_') 
                    for col in df.columns.values.tolist()}, 
                  inplace=do_inplace)
    else:
        return df.rename(columns=rename_dict,inplace=do_inplace)

def expand_user_type(u_type):
    """This function maps user types to user classes
    Args:
        u_type (str): user type value
    Returns:
        (str) user_class value

    """
    if u_type in ['a','b']:
        return 'new'
    elif u_type == 'c':
        return 'existing'
    elif u_type == 'd':
        return 'loyal_existing'
    else:
        return 'error'

## Generate a Sample Dataset

In [109]:
df = generate_sample_data(row_count=1000)

### Describe the Dataset

In [110]:
describe_dataframe(df)




******************************
About the Data
******************************
Number of rows:: 1000
Number of columns:: 7


Column Names:: ['Serial No', 'Date', 'User ID', 'Product ID', 'Quantity Purchased', 'Price', 'User Type']


Column Data Types::
 Serial No               int32
Date                   object
User ID                 int64
Product ID              int64
Quantity Purchased      int32
Price                 float64
User Type              object
dtype: object


Columns with Missing Values:: ['Date', 'Price']


Number of rows with Missing Values:: 61


Sample Indices with missing data:: [0, 1, 2, 3, 4]


General Stats::
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
Serial No             1000 non-null int32
Date                  969 non-null object
User ID               1000 non-null int64
Product ID            1000 non-null int64
Quantity Purchased    1000 non-null int32
Price                 969 non-null float64


Unnamed: 0,Serial No,Date,User ID,Product ID,Quantity Purchased,Price,User Type
0,1000,,-101,0,36,1777.8,n
1,1001,,5975,625,2,1283.34,n
2,1002,,5821,164,30,2211.46,n
3,1003,,5838,629,5,1508.19,n
4,1004,,5771,408,1,904.19,n


### Rename Columns

In [111]:
print("Dataframe columns:\n{}".format(df.columns.tolist()))

Dataframe columns:
['Serial No', 'Date', 'User ID', 'Product ID', 'Quantity Purchased', 'Price', 'User Type']


In [112]:
cleanup_column_names(df)

In [113]:
print("Dataframe columns:\n{}".format(df.columns.tolist()))

Dataframe columns:
['serial_no', 'date', 'user_id', 'product_id', 'quantity_purchased', 'price', 'user_type']


### Sort Rows on defined attributes

In [114]:
# .. type your code here

display(df.sort_values(['serial_no','price'],ascending = [True, False]).head())


# first sorted on serial_no, all products having same serial_no sorted on price

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type
647,-1,2016-01-20,5453,723,10,5906.17,b
602,-1,2016-01-28,5821,164,18,5881.15,c
994,-1,2016-01-14,5698,864,32,5456.67,d
281,-1,2016-01-21,5864,1068,24,5361.45,a
542,-1,2016-09-01,5169,1083,9,4279.55,c


### Rearrange Columns in a Dataframe

In [115]:
# .. type your code here

display(df[['serial_no','date','user_id','user_type','product_id','quantity_purchased','price']].head())

df=df[['serial_no','date','user_id','user_type','product_id','quantity_purchased','price']]
df.head()

Unnamed: 0,serial_no,date,user_id,user_type,product_id,quantity_purchased,price
0,1000,,-101,n,0,36,1777.8
1,1001,,5975,n,625,2,1283.34
2,1002,,5821,n,164,30,2211.46
3,1003,,5838,n,629,5,1508.19
4,1004,,5771,n,408,1,904.19


Unnamed: 0,serial_no,date,user_id,user_type,product_id,quantity_purchased,price
0,1000,,-101,n,0,36,1777.8
1,1001,,5975,n,625,2,1283.34
2,1002,,5821,n,164,30,2211.46
3,1003,,5838,n,629,5,1508.19
4,1004,,5771,n,408,1,904.19


### Filtering Columns

Using Column Index

In [116]:
# print 10 values from column at index 3
print(df.iloc[:,4].values[0:10])# it displays product_id
df['product_id'].head()

[  0 625 164 629 408 471 437 783 845 228]


0      0
1    625
2    164
3    629
4    408
Name: product_id, dtype: int64

Using Column Name

In [117]:
# print 10 values of quantity purchased
print(df.quantity_purchased.values[0:10])

[36  2 30  5  1 35 29 39 12 14]


Using Column Datatype

In [118]:
# print 10 values of columns with data type float
print(df.select_dtypes(include=['float64']).values[:10,0])#values[:10,0]<<------it is 0 bcz there is only 1 float value which is at 0th location in the dataframe
# we are printing only 0th column, i.e price
#we will get indexing error if we try to change 0th value

[1777.8  1283.34 2211.46 1508.19  904.19  666.55 2014.66 4434.27 2185.39
 2438.27]


### Filtering Rows

Select specific rows

In [119]:
display(df.iloc[[10,501,20]]) # i -> implicit indexing

Unnamed: 0,serial_no,date,user_id,user_type,product_id,quantity_purchased,price
10,-1,2016-01-13,5646,n,1038,33,2088.76
501,1501,2016-01-20,5975,a,625,19,4310.3
20,1020,,5226,n,143,9,2590.79


Exclude Specific Row indices

In [120]:
df.drop([0,2,5], axis=0).head()
# note : axis=1 would give error as, indexes 0,2,5 don't appear column wise
# default axis is 0

Unnamed: 0,serial_no,date,user_id,user_type,product_id,quantity_purchased,price
1,1001,,5975,n,625,2,1283.34
3,1003,,5838,n,629,5,1508.19
4,1004,,5771,n,408,1,904.19
6,1006,,5259,n,437,29,2014.66
7,1007,,5399,n,783,39,4434.27


Conditional Filtering

In [121]:
display(df[df.quantity_purchased>25].head())

Unnamed: 0,serial_no,date,user_id,user_type,product_id,quantity_purchased,price
0,1000,,-101,n,0,36,1777.8
2,1002,,5821,n,164,30,2211.46
5,1005,,5804,n,471,35,666.55
6,1006,,5259,n,437,29,2014.66
7,1007,,5399,n,783,39,4434.27


Offset from top of the dataframe

In [122]:
display(df[100:].head())

Unnamed: 0,serial_no,date,user_id,user_type,product_id,quantity_purchased,price
100,1100,2016-01-27,5373,a,446,23,3034.47
101,1101,,5975,b,625,20,4506.91
102,1102,2016-02-02,5821,b,164,41,
103,1103,2016-04-02,5838,d,629,3,3524.09
104,1104,2016-09-02,5771,c,408,19,929.57


Offset from bottom of the dataframe

In [123]:
display(df[-10:].head()) # 1000th row is the last row - 10 = 990

Unnamed: 0,serial_no,date,user_id,user_type,product_id,quantity_purchased,price
990,1990,2016-10-02,5304,d,155,14,3742.65
991,1991,2016-01-02,5213,a,959,26,1874.39
992,1992,2016-05-01,5915,d,651,7,2468.13
993,1993,2016-08-02,5424,a,858,36,1214.94
994,-1,2016-01-14,5698,d,864,32,5456.67


### TypeCasting/Data Type Conversion

In [124]:
df['date'] = pd.to_datetime(df.date) #available in pandas
# compare dtypes of the original df with this one
print(df.dtypes)

serial_no                      int32
date                  datetime64[ns]
user_id                        int64
user_type                     object
product_id                     int64
quantity_purchased             int32
price                        float64
dtype: object


### Apply/Map Usage

Map : Create a derived attribute using map. map() works element wise.

In [125]:
df['user_class'] = df['user_type'].map(expand_user_type)
# map function applies the user defn method expand_user_type to each value of 
# user_type
display(df.tail())

# -- expand_user_type function is defn in the Utilities. 
# -- Its body is shown here just for reference.
#  def expand_user_type(u_type):
#   if u_type in ['a','b']:
#   return 'new'
#   elif u_type == 'c':
#   return 'existing'
#   elif u_type == 'd':
#   return 'loyal_existing'
#   else:
#   return 'error'

Unnamed: 0,serial_no,date,user_id,user_type,product_id,quantity_purchased,price,user_class
995,1995,2016-01-29,5883,c,517,4,2542.61,existing
996,1996,2016-01-13,5566,d,344,37,4041.02,loyal_existing
997,1997,2016-01-18,5062,d,1088,38,1802.28,loyal_existing
998,1998,2016-01-29,5928,c,562,28,,existing
999,1999,2016-01-29,5955,d,1083,35,8187.02,loyal_existing


Apply: Using apply to get attribute ranges

In [126]:
# The apply() function is used to perform actions on the whole object, 
# depending upon the axis (default is on all rows).
display(df.select_dtypes(include=[np.number]).apply(lambda x: 
                                                        x.max()- x.min()))

serial_no             2000.00
user_id               6096.00
product_id            1098.00
quantity_purchased      40.00
price                 8655.63
dtype: float64

Applymap: Extract week from date

In [127]:
df['purchase_week'] = df[['date']].applymap(lambda dt:dt.week 
                                                if not pd.isnull(dt.week) 
                                                else 0)

# lambda fn gets the week of the transaction from the date attribute

# if not pd.isnull(dt.week) ->dt.week==>NAN
#                             therefor if not isnull(dt.week)=>isnull(dt.week)==>true
#                             if not true ===>false
#                             will goto else and purchase week=0
# to print week from the date
# df['date'].dt.week
# df.date.dt.week

In [128]:
 display(df.head()) # display() is defined under Utilities

Unnamed: 0,serial_no,date,user_id,user_type,product_id,quantity_purchased,price,user_class,purchase_week
0,1000,NaT,-101,n,0,36,1777.8,error,0
1,1001,NaT,5975,n,625,2,1283.34,error,0
2,1002,NaT,5821,n,164,30,2211.46,error,0
3,1003,NaT,5838,n,629,5,1508.19,error,0
4,1004,NaT,5771,n,408,1,904.19,error,0


### Missing Values

Imputing Missing Values : Missing values can lead to all sorts of problems when dealing with Machine Learning and Data Science related use cases. Not only can they cause problems for algorithms, they can mess up calculations and even final outcomes. 

Missing values also pose risk of being interpreted in non-standard ways as well leading to confusion and more errors. Hence, imputing missing values carries a lot of weight in the overall data wrangling process.

One of the easiest ways of handling missing values is to ignore or remove them altogether from the dataset. When the dataset is fairly large and we have enough samples of various types required, this option can be safely exercised. We use the dropna() function from pandas in the following snippet to remove rows of data where the date of transaction is missing.

In [129]:
print("Drop Rows with missing dates::" )
df_dropped = df.dropna(subset=['date'])
display(df_dropped.head())

# # dropna -> drops not available values

Drop Rows with missing dates::


Unnamed: 0,serial_no,date,user_id,user_type,product_id,quantity_purchased,price,user_class,purchase_week
9,1009,2016-01-18,5002,n,228,14,2438.27,error,3
10,-1,2016-01-13,5646,n,1038,33,2088.76,error,2
11,1011,2016-01-01,5782,n,585,4,196.7,error,53
12,1012,2016-11-01,5059,n,947,35,6403.2,error,44
14,1014,2016-01-18,5573,n,423,17,7088.03,error,3


Fill Missing Price values with mean price

In [130]:
df_dropped.shape[0]

969

In [131]:
# Often dropping rows is a very expensive and unfeasible option. 
# In many scenarios, missing values are imputed using the help of other 
# values in the dataframe. One commonly used trick is to replace missing
# values with a central tendency measure like mean or median.
# fillna -> fills not available values
df_dropped['price'].fillna(value=np.round(df.price.mean(),decimals=2),
                                inplace=True)
df_dropped['price'].head()

9     2438.27
10    2088.76
11     196.70
12    6403.20
14    7088.03
Name: price, dtype: float64

Fill Missing user_type values with value from previous row (forward fill) 

In [132]:
print("Fill Missing user_type values with value from previous row (forward fill) ::" )
df_dropped['user_type'].fillna(method='ffill',inplace=True)
df_dropped.head()

Fill Missing user_type values with value from previous row (forward fill) ::


Unnamed: 0,serial_no,date,user_id,user_type,product_id,quantity_purchased,price,user_class,purchase_week
9,1009,2016-01-18,5002,n,228,14,2438.27,error,3
10,-1,2016-01-13,5646,n,1038,33,2088.76,error,2
11,1011,2016-01-01,5782,n,585,4,196.7,error,53
12,1012,2016-11-01,5059,n,947,35,6403.2,error,44
14,1014,2016-01-18,5573,n,423,17,7088.03,error,3


Fill Missing user_type values with value from next row (backward fill)

In [133]:
df_dropped['user_type'].fillna(method='bfill',inplace=True)
# keeping inplace true writes the new data in the data set itself.


# try to figure out : what would happen if we keep

inplace = True
df-> changes copies to orginal 
thereby new data space not consumed
___________________________________

inplace = False
df-----> changes not copied to orginal
thereby new data space create

// here do this kind if coding
dfNew = df.fillna(....,inplace=False)
_____________________________________


### Duplicates

Drop Duplicate serial_no rows

In [134]:
# sample duplicates
# duplicated is a build-in fn
print(df_dropped[df_dropped.duplicated(subset=['serial_no'])].head())
print("Shape of df={}".format(df_dropped.shape))

     serial_no       date  user_id user_type  product_id  quantity_purchased  \
22          -1 2016-05-01     5368         n         882                  34   
119         -1 2016-01-15     5627         d         197                  12   
136         -1 2016-01-02     5715         a         284                   7   
258         -1 2016-01-29     5384         d         391                   8   
281         -1 2016-01-21     5864         a        1068                  24   

       price      user_class  purchase_week  
22   2610.57           error             17  
119   687.15  loyal_existing              2  
136   597.20             new             53  
258   971.35  loyal_existing              4  
281  5361.45             new              3  
Shape of df=(969, 9)


In [135]:
# drop_duplicates is a build-in function 
df_dropped.drop_duplicates(subset=['serial_no'],inplace=True)

In [136]:
# updated dataframe
print(df_dropped.head())
print("Shape of df={}".format(df_dropped.shape))

    serial_no       date  user_id user_type  product_id  quantity_purchased  \
9        1009 2016-01-18     5002         n         228                  14   
10         -1 2016-01-13     5646         n        1038                  33   
11       1011 2016-01-01     5782         n         585                   4   
12       1012 2016-11-01     5059         n         947                  35   
14       1014 2016-01-18     5573         n         423                  17   

      price user_class  purchase_week  
9   2438.27      error              3  
10  2088.76      error              2  
11   196.70      error             53  
12  6403.20      error             44  
14  7088.03      error              3  
Shape of df=(943, 9)


Remove rows which have less than 3 attributes with non-missing data

In [137]:
# there are certain conditions where a record is not much of use 
# if it has more than a certain threshold of attribute values missing. 
# For instance, if in our dataset a transaction has less than three
# attributes as non-null, the transaction might almost be unusable. 
# In such a scenario, it might be advisable to drop that data point itself. 
# We can filter out such data points using the function dropna() 
# with the parameter thresh set to the threshold of non-null attributes

display(df.dropna(thresh=3).head())#<<<<--------------thresh=3 means there are atleast 3 values in dataframe
print("Shape of df={}".format(df.dropna(thresh=3).shape))

Unnamed: 0,serial_no,date,user_id,user_type,product_id,quantity_purchased,price,user_class,purchase_week
0,1000,NaT,-101,n,0,36,1777.8,error,0
1,1001,NaT,5975,n,625,2,1283.34,error,0
2,1002,NaT,5821,n,164,30,2211.46,error,0
3,1003,NaT,5838,n,629,5,1508.19,error,0
4,1004,NaT,5771,n,408,1,904.19,error,0


Shape of df=(1000, 9)


### Encode Categoricals

One Hot Encoding using get_dummies()

In [138]:
# method to convert the categorical variable into indicator variables 
# use the get_dummies() function.
display(pd.get_dummies(df,columns=['user_type']).head())

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_class,purchase_week,user_type_a,user_type_b,user_type_c,user_type_d,user_type_n
0,1000,NaT,-101,0,36,1777.8,error,0,0,0,0,0,1
1,1001,NaT,5975,625,2,1283.34,error,0,0,0,0,0,1
2,1002,NaT,5821,164,30,2211.46,error,0,0,0,0,0,1
3,1003,NaT,5838,629,5,1508.19,error,0,0,0,0,0,1
4,1004,NaT,5771,408,1,904.19,error,0,0,0,0,0,1


Label Mapping

In [139]:
# using the map() function, where we simply map each value 
# from the allowed set to a numeric value
type_map={'a':0,'b':1,'c':2,'d':3,np.NAN:-1}
df['encoded_user_type'] = df.user_type.map(type_map)
display((df.tail()))

Unnamed: 0,serial_no,date,user_id,user_type,product_id,quantity_purchased,price,user_class,purchase_week,encoded_user_type
995,1995,2016-01-29,5883,c,517,4,2542.61,existing,4,2.0
996,1996,2016-01-13,5566,d,344,37,4041.02,loyal_existing,2,3.0
997,1997,2016-01-18,5062,d,1088,38,1802.28,loyal_existing,3,3.0
998,1998,2016-01-29,5928,c,562,28,,existing,4,2.0
999,1999,2016-01-29,5955,d,1083,35,8187.02,loyal_existing,4,3.0


### Random Sampling data from DataFrame

In [140]:
display(df.sample(frac=0.2, replace=True, random_state=42).head())
# explaination for replace parameter
# when sampling the records are removed from the orginal dataset, so that
# their is no repetition of samples in the o/p.
# but if the size of the sample is greater than the dataset itself then keep
# replace = true

Unnamed: 0,serial_no,date,user_id,user_type,product_id,quantity_purchased,price,user_class,purchase_week,encoded_user_type
102,1102,2016-02-02,5821,b,164,41,,new,5,1.0
435,1435,2016-01-28,5520,a,813,34,,new,4,0.0
860,1860,2016-06-02,5026,d,775,2,1177.42,loyal_existing,22,3.0
270,1270,2016-01-14,5364,c,271,25,1168.55,existing,2,2.0
106,1106,2016-07-01,5259,c,437,6,5631.92,existing,26,2.0


### Normalizing Numeric Values

Attribute normalization is the process of standardizing the range of values of attributes. Machine learning algorithms in many cases utilize distance metrics, attributes or features of different scales/ranges which might adversely affect the calculations or bias the outcomes. Normalization is also called feature scaling.

Normalize price values using  **Min-Max Scaler**

In [142]:
df_normalized = df.dropna().copy()
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(df_normalized['price'].values.reshape(-1,1))
print(np_scaled)
df_normalized['price'] = np_scaled

# fit doesnt need reshape but transfrom does
# reshape(-1,1) works like this :
# -1 means we dont know the no. of rows. hence it would take len(df).
# the 2nd parameter being 1 -> means I want only 1 column
# ex: assume z is 2D numpy array and z.shape is (3,4) 
# z.reshape(-1) would give you a 1D np.array
# now z.shape would be (12,)  like 
# array([1,2,3,4,5,6,7,8,9,10,11,12])
# and reshaping it as z.reshape(-1,1) would give us (12,1). 
# i.e 1 column with all row values. 

[[1.53994568e-01]
 [2.28505608e-01]
 [1.22267241e-01]
 [3.32853877e-01]
 [1.76288728e-01]
 [9.43362875e-02]
 [6.30524872e-01]
 [9.00616131e-02]
 [2.64223401e-01]
 [7.81791736e-02]
 [2.46891330e-03]
 [7.82034352e-03]
 [6.62500592e-01]
 [3.98472439e-01]
 [3.61303568e-01]
 [3.33578261e-01]
 [2.38991269e-01]
 [9.80275266e-02]
 [2.17305962e-01]
 [2.63198635e-01]
 [1.82923716e-01]
 [1.99879154e-01]
 [5.68704993e-02]
 [2.08645702e-01]
 [3.69168969e-01]
 [1.22459024e-01]
 [2.94131103e-01]
 [2.69708848e-02]
 [2.29922028e-01]
 [2.15725487e-01]
 [1.96616537e-01]
 [3.37556018e-01]
 [2.04335213e-01]
 [8.58516364e-03]
 [2.93246130e-01]
 [1.13823026e-01]
 [4.06044390e-01]
 [1.73630342e-01]
 [3.11470107e-01]
 [4.21433217e-01]
 [3.87068301e-01]
 [1.06318084e-01]
 [4.90386026e-02]
 [2.73427815e-01]
 [4.12214940e-01]
 [3.27128124e-01]
 [1.67309601e-01]
 [4.87907870e-01]
 [1.00141757e-01]
 [4.46152389e-01]
 [3.94744230e-01]
 [2.09193323e-02]
 [2.71638229e-01]
 [5.45414950e-01]
 [3.56854440e-01]
 [1.123869

In [106]:
display(df_normalized.head())

Unnamed: 0,serial_no,date,user_id,user_type,product_id,quantity_purchased,price,user_class,purchase_week,encoded_user_type
15,1015,2016-01-23,5122,d,382,30,0.341733,loyal_existing,3,3.0
17,1017,2016-01-29,5596,d,243,39,0.436007,loyal_existing,4,3.0
20,1020,2016-12-01,5456,d,802,2,0.608235,loyal_existing,48,3.0
33,1033,2016-05-02,5908,d,624,9,0.664128,loyal_existing,18,3.0
34,1034,2016-01-17,5101,d,183,20,0.394531,loyal_existing,2,3.0


Normalize quantity purchased values using  **Robust Scaler**

In [145]:
df_normalized = df.dropna().copy()
robust_scaler = preprocessing.RobustScaler()
rs_scaled = robust_scaler.fit_transform(df_normalized['quantity_purchased'].values.reshape(-1,1))
df_normalized['quantity_purchased'] = rs_scaled

# interested people may research the maths behind min-max scaler and
# robust_scaler and standard scaler. 

In [66]:
display(df_normalized.head())

Unnamed: 0,serial_no,date,user_id,user_type,product_id,quantity_purchased,price,user_class,purchase_week,encoded_user_type
27,1027,2016-03-02,5660,c,1099,0.3,2220.32,existing,9,2.0
31,1031,2016-03-02,5666,d,1080,0.35,1534.68,loyal_existing,9,3.0
32,1032,2016-02-01,5820,c,1074,0.75,1240.45,existing,5,2.0
36,1036,2016-05-02,5388,d,168,-0.45,5995.7,loyal_existing,18,3.0
39,1039,2016-02-01,5564,a,185,-0.1,5134.82,new,5,0.0


### Data Summarization

Data summarization refers to the process of preparing a compact representation of raw data at hand. This process involves aggregation of data using different statistical, mathematical, and other methods. Summarization is helpful for visualization, compressing raw data, and better understanding of its attributes.

Condition based aggregation

In [146]:
print("Mean price of items purchased by user_type=a :: {}".format(df['price'][df['user_type']=='a'].mean()))

Mean price of items purchased by user_type=a :: 2101.9403827751194


Condtion based counts

In [147]:
print(df['purchase_week'].value_counts())
# counts the number of transactions per week

3     161
4     159
2     110
13     56
44     52
9      50
53     48
31     47
39     46
5      45
26     42
35     41
22     41
0      31
18     26
48     25
17     20
Name: purchase_week, dtype: int64


### Group By

Group By certain attributes

In [149]:
print(df.groupby(['user_class'])['quantity_purchased'].sum())
# This statement generates a tabular output representing 
# sum of quantities purchased by each user_class.

user_class
error              659
existing          5049
loyal_existing    5251
new               9492
Name: quantity_purchased, dtype: int32


Group By with different aggregate functions

In [150]:
# The groupby() function is a powerful interface that allows us 
# to perform complex groupings and aggregations.
# With groupby() we can perform multi-attribute groupings 
# and apply multiple aggregations across attributes.

# variant-1: multiple aggregations on single attribute
display(df.groupby(['user_class'])['quantity_purchased'].agg([np.sum,
                                                                np.mean,
                                                                np.count_nonzero]))

Unnamed: 0_level_0,sum,mean,count_nonzero
user_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
error,659,21.966667,30
existing,5049,20.115538,251
loyal_existing,5251,20.431907,257
new,9492,20.545455,462


Group by specific aggregate functions for each attribute

In [151]:
# variant-2: different aggregation functions for each attribute
display(df.groupby(['user_class','user_type']).agg({'price':np.mean,
                                                        'quantity_purchased':np.max}))

Unnamed: 0_level_0,Unnamed: 1_level_0,price,quantity_purchased
user_class,user_type,Unnamed: 2_level_1,Unnamed: 3_level_1
error,n,2909.582667,39
existing,c,2385.705885,41
loyal_existing,d,2213.908193,41
new,a,2101.940383,41
new,b,2171.824244,41


Group by with multiple agg for each attribute

In [152]:
# Variant 3: Here, we do a combination of variants 1 and 2, 
# i.e., we apply multiple aggregations on the price field while 
# applying only a single one on quantity_purchased. 
# Note : a dictionary is passed, as shown in the snippet.
display(df.groupby(['user_class','user_type']).agg({'price':{  'total_price':np.sum,
                                                                'mean_price':np.mean,
                                                                'variance_price':np.std,
                                                                'count':np.count_nonzero},
                                                   'quantity_purchased':np.sum}))  

Unnamed: 0_level_0,Unnamed: 1_level_0,price,price,price,price,quantity_purchased
Unnamed: 0_level_1,Unnamed: 1_level_1,total_price,mean_price,variance_price,count,sum
user_class,user_type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
error,n,87287.48,2909.582667,1968.091733,30.0,659
existing,c,579726.53,2385.705885,1661.674534,251.0,5049
loyal_existing,d,551263.14,2213.908193,1521.597962,257.0,5251
new,a,439305.54,2101.940383,1600.018728,216.0,4522
new,b,516894.17,2171.824244,1459.375193,246.0,4970


### Pivot Tables

In [160]:
display(df.pivot_table(index='date',columns='user_type',values='user_id',aggfunc = np.count_nonzero))
# pivot table shows us comprehensive information of mean price 
# date-wise , user_type wise

user_type,a,b,c,d,n
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-01-01,4.0,7.0,8.0,3.0,1.0
2016-01-02,6.0,2.0,10.0,6.0,1.0
2016-01-13,7.0,10.0,3.0,5.0,1.0
2016-01-14,4.0,5.0,4.0,8.0,1.0
2016-01-15,4.0,4.0,1.0,9.0,
2016-01-16,5.0,4.0,5.0,4.0,
2016-01-17,3.0,4.0,7.0,12.0,
2016-01-18,5.0,4.0,2.0,7.0,2.0
2016-01-19,6.0,3.0,6.0,8.0,
2016-01-20,5.0,6.0,6.0,7.0,


In [162]:
df_1=df.dropna()
df_1.pivot_table(index='date',columns='user_type',values='user_id',aggfunc = np.count_nonzero)

user_type,a,b,c,d
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-01-01,4,7,8,3
2016-01-02,6,2,10,6
2016-01-13,6,10,3,5
2016-01-14,4,4,4,8
2016-01-15,4,3,1,9
2016-01-16,5,4,4,3
2016-01-17,3,4,7,11
2016-01-18,5,4,2,6
2016-01-19,5,3,6,8
2016-01-20,5,6,6,7
