# Cornershop's Data Science Test

## Dev: Mateus Broilo
mateus.brilo90@gmail.com

* Objectives: Load the dataset and build its Analytical Record (column dtypes, null counts and unique values).

 # Summary

* Initials
    * Loading Libraries and Packages
    * Configurations
    * Global Variables
    * Function's Definition
        * Class of function associated with the bag of dataset's information
* Loading data
* Analytical Record

# Initials
* Loading Libraries and Packages
* Configuration
* Global Variables
* Function's Definition

## Loading Libraries and Packages 

In [2]:
# general
import warnings
import time
import gc

#data
import pandas as pd
from datetime import (datetime, date)

In [4]:
ls -l ../data/

total 504
-rw-rw-r-- 1 broilo broilo 514550 mai 20 15:58 20220520_corrected_sheet1.csv


## Configurations

In [5]:
# Blanket filter: every warning category is ignored from here on.
warnings.simplefilter("ignore")

# Limit how much of a DataFrame is rendered in cell outputs.
pd.options.display.max_columns = 10
pd.options.display.max_rows = 20

## Global Variables

In [6]:
# Directory (with trailing slash) and file name of the input CSV; they are
# concatenated as plain strings when the file is read.
PATH_DATA = '../data/'
DATA = '20220520_corrected_sheet1.csv'

# Current local date formatted as YYYYMMDD.
# NOTE(review): not used in the visible cells -- confirm where it is consumed.
DATE = time.strftime('%Y%m%d', time.localtime())

## Function's Definition

### Class of function associated with the bag of dataset's information

In [18]:
class infoTools:
    """
    Namespace of helper functions that summarise the columns of a dataset.

    Both helpers are static methods: they keep no state and can be called
    on the class (``infoTools.list(df, cols)``) or on an instance.
    """

    @staticmethod
    def list(dataset, listCol):
        """
        This function returns a table containing all the
        specific required information about the dataset
        and will be used to construct the Analytical Record.
        Args:
            dataset (pandas.DataFrame): The dataset under analysis.
            listCol (iterable of str): Columns of the dataset to summarise.
        Returns:
            table (pandas.DataFrame): One row per requested column with the
                fields COLUMN_NAME, COLUMN_DTYPE, #_NULL, #_NON_NULL,
                %_NULL, %_NON_NULL (percentages rounded to 3 decimals) and
                UNIQUE_VALUES (number of distinct entries, a missing value
                counting as one entry).
        """

        info = []

        for name in listCol:
            column = dataset[name]
            null = column.isna().sum()
            nonNull = column.notna().sum()
            # Every entry is either null or non-null, so this is the column length.
            tot = null + nonNull
            nullPct = round((null / tot) * 100, 3)
            nonNullPct = round((nonNull / tot) * 100, 3)
            unique = len(column.unique())
            info.append([name, column.dtypes, null, nonNull, nullPct, nonNullPct, unique])

        cols = ['COLUMN_NAME', 'COLUMN_DTYPE', '#_NULL', '#_NON_NULL', '%_NULL', '%_NON_NULL', 'UNIQUE_VALUES']
        table = pd.DataFrame(info, columns=cols)
        return table

    @staticmethod
    def unique(dataset, listCol):
        """
        This function returns a table containing all the
        unique entries related to the columns/features.
        Args:
            dataset (pandas.DataFrame): The dataset under analysis.
            listCol (iterable of str): Columns of the dataset to inspect.
        Returns:
            table (pandas.DataFrame): One row per requested column with the
                fields COLUMN_NAME, COLUMN_DTYPE and UNIQUE_VALUES (the
                array of distinct entries found in that column).
        """

        data = []

        for name in listCol:
            column = dataset[name]
            # The distinct values themselves are stored, not their count.
            uniqueValues = column.unique()
            data.append([name, column.dtypes, uniqueValues])

        cols = ['COLUMN_NAME', 'COLUMN_DTYPE', 'UNIQUE_VALUES']
        table = pd.DataFrame(data, columns=cols)
        return table

# Loading Datasets

In [9]:
# Read the comma-separated input file located at PATH_DATA + DATA.
df = pd.read_csv(f"{PATH_DATA}{DATA}", sep=",")
# Report (rows, columns), then render the frame as the cell output.
print(df.shape)
df

(11128, 5)


Unnamed: 0,Dia,Hora,Valor,Cartão,CBK
0,2015-05-01,00:01:54,36.54,536518******2108,Não
1,2015-05-01,00:03:46,36.54,536518******2108,Não
2,2015-05-01,00:08:50,69.00,453211******1239,Não
3,2015-05-01,00:27:00,193.43,548827******1705,Não
4,2015-05-01,01:32:46,132.00,531681******9778,Não
...,...,...,...,...,...
11123,2015-05-30,23:07:01,53.00,514868******7409,Não
11124,2015-05-30,23:08:47,15.00,439354******5281,Não
11125,2015-05-30,23:15:24,20.00,549167******1648,Não
11126,2015-05-30,23:17:41,70.00,518759******8384,Não


# Analytical Record

In [19]:
# Print the docstring of each helper so the reader knows what the tables below contain.
print(f"infoTools.list:\n{infoTools.list.__doc__}")
# Label fixed: it previously read "infoTools.uique".
print(f"infoTools.unique:\n{infoTools.unique.__doc__}")

infoTools.list:

        This function returns a table containing all the
        specific required information about the dataset
        and will be used to construct the Analytical Record.
        Args:
            dataset (object/spreadsheet): The dataset under analysis.
            listCol (str list): List containing the columns of the dataset under analysis.
        Returns:
            table (object/spreadsheet): Analytical Record informations.
        
infoTools.uique:

        This function returns a tabulate-form table containing 
        all the unique entries related to the columns/features.
        Args:
            dataset (object/spreadsheet): The dataset under analysis.
            listCol (str list): List containing the columns of the dataset under analysis.
        Returns:
            table (object/spreadsheet): Tabulate with unique entries.
        


In [14]:
# Descriptive statistics, transposed so each described column becomes a row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Valor,11128.0,129.550076,141.420554,1.0,55.0,99.0,154.0,2920.0


In [16]:
# Per-column summary: dtype, null / non-null counts and percentages, number of unique values.
tableList = infoTools.list(dataset=df, listCol=df.columns)
tableList

Unnamed: 0,COLUMN_NAME,COLUMN_DTYPE,#_NULL,#_NON_NULL,%_NULL,%_NON_NULL,UNIQUE_VALUES
0,Dia,object,0,11128,0.0,100.0,30
1,Hora,object,0,11128,0.0,100.0,10044
2,Valor,float64,0,11128,0.0,100.0,511
3,Cartão,object,0,11128,0.0,100.0,9260
4,CBK,object,0,11128,0.0,100.0,2


In [20]:
# Per-column listing of the distinct values found in the dataset.
tableUnique = infoTools.unique(dataset=df, listCol=df.columns)
tableUnique

Unnamed: 0,COLUMN_NAME,COLUMN_DTYPE,UNIQUE_VALUES
0,Dia,object,"[2015-05-01, 2015-05-02, 2015-05-03, 2015-05-0..."
1,Hora,object,"[00:01:54, 00:03:46, 00:08:50, 00:27:00, 01:32..."
2,Valor,float64,"[36.54, 69.0, 193.43, 132.0, 161.0, 110.0, 159..."
3,Cartão,object,"[536518******2108, 453211******1239, 548827***..."
4,CBK,object,"[Não, Sim]"
