# Utility classes for data management

In [3]:
import os
import tarfile
import urllib.request
import numpy as np
import hashlib
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import StratifiedShuffleSplit

# Data analytics pipeline

1. Data acquisition
2. Data exploration
3. Data Manipulation
    - Enrich/transform variables data
    - Split in training and test sets
    - Separate features variables from target variables
    - Data cleaning
    - Handle categorical variables


In [2]:
class Download(object):
    """Helpers that download remote data files into a local directory."""

    @staticmethod
    def fetch_tgz(base_url, tgz, local):
        """
        Fetch tgz data from http and unpack it

        The archive is downloaded into ``local``, extracted there, and the
        downloaded archive file is removed afterwards.

        :param str base_url: file url except the filename
        :param str tgz: name of tgz file
        :param str local: path of local directory where to save data
        """
        os.makedirs(local, exist_ok=True)
        tgz_file = os.path.join(local, tgz)
        full_url = "/".join([base_url, tgz])
        urllib.request.urlretrieve(full_url, tgz_file)
        # The context manager closes the archive even if extraction fails.
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only use this with archives from a trusted source.
        with tarfile.open(tgz_file) as tar:
            tar.extractall(path=local)
        os.remove(tgz_file)

    @staticmethod
    def fetch_json(base_url, json_file, local):
        """
        Fetch json data from http

        :param str base_url: file url
        :param str json_file: name of destination file
        :param str local: path of local directory where to save data
        """
        os.makedirs(local, exist_ok=True)
        # The destination is <local>/<json_file>; the previous code referenced
        # an undefined variable here and always raised NameError.
        destination = os.path.join(local, json_file)
        urllib.request.urlretrieve(base_url, destination)


## Strategies to split the dataset in train and test sets
Problem: split a dataset such that we have a train set and a test set.

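### Strategy 1: <code>TrainingSet.random_test()</code>
Generate a random permutation of the row positions from a fixed seed and use it to split the data. This is not consistent when the dataset is updated: adding or removing rows changes which instances end up in the test set.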

### Strategy 2: <code>TrainingSet.hash_test()</code>
Take the hash of one of the unique identifiers provided in the data, then select the instances that are part of the test set according to the last byte of the hash. The split is consistent with respect to the identifier: an instance stays in the same set as long as its identifier does not change.

### Strategy 3: <code>TrainingSet.stratified_test()</code>
Split the dataset into groups (strata) according to the distribution of values in one or more attributes. Then draw a random sample from each group.

In [None]:
class TrainingSet(object):
    """Split a dataframe into train and test sets using different strategies."""

    def __init__(self, dataframe, test_size=0.2):
        """
        Split dataframe in train and test set

        :param pandas dataframe dataframe: data
        :param float test_size: fraction of the dataset to be provided as test
        """
        self.data = dataframe
        self.test = test_size

    def random_test(self):
        """
        Split the rows using a random permutation generated from a fixed seed.

        :return train_set, test_set
        """
        # A private RandomState(42) yields the same permutation as seeding the
        # global generator with 42, without clobbering the caller's RNG state.
        rng = np.random.RandomState(42)
        n_rows = self.data.shape[0]
        indexes = rng.permutation(n_rows)
        test_size = int(n_rows * self.test)
        train_indexes, test_indexes = indexes[test_size:], indexes[:test_size]
        return self.data.iloc[train_indexes], self.data.iloc[test_indexes]

    def hash_test(self, column=None, hashf=hashlib.md5):
        """
        Split the rows according to the last byte of the hash of an identifier.

        A row goes to the test set when the last byte of the digest of its
        identifier is lower than ``256 * test_size``.

        :param str column: col to use as unique id. If None the dataframe
            index is used as identifier.
        :param function hashf: hash function to use
        :return train_set, test_set
        """
        ids = self.data.index if column is None else self.data[column]
        threshold = 256 * self.test
        # Build a positional boolean mask. A plain array (rather than a
        # Series) avoids index-alignment errors when the dataframe index is
        # not the default 0..n-1 range.
        in_test = np.fromiter(
            (hashf(np.int64(id_)).digest()[-1] < threshold for id_ in ids),
            dtype=bool,
            count=len(ids),
        )
        return self.data.loc[~in_test], self.data.loc[in_test]

    def stratified_test(self, column, strata=10):
        """
        Split the rows so that each stratum of ``column`` is sampled.

        :param str column: the col to use for strata
        :param int strata: number of classes (kmeans is used to create classes)
        :return train_set, test_set
        """
        values = self.data[column].values.reshape(-1, 1)
        # Seed the clustering like the split itself so repeated calls agree.
        kmeans = KMeans(n_clusters=strata, random_state=42).fit(values)
        split = StratifiedShuffleSplit(n_splits=1, test_size=self.test, random_state=42)
        train_i, test_i = next(split.split(self.data, kmeans.labels_))
        # split() yields row positions, not index labels, hence iloc.
        return self.data.iloc[train_i], self.data.iloc[test_i]

    @staticmethod
    def training_labels(training_df, labels):
        """
        Separate features from target labels

        :param pandas dataframe training_df: the training dataframe
        :param list labels: labels to be used as target
        :return features dataframe (labels dropped), copy of the label columns
        """
        return training_df.drop(labels, axis=1), training_df[labels].copy()

# Data cleaning
The main goal of data cleaning is to get rid of NULL values in the data. Strategies:
- Get rid of the row(s) $\rightarrow$ <code>df.dropna(subset=[columns])</code>
- Get rid of the column(s) $\rightarrow$ <code>df.drop([columns], axis=1)</code>
- Set NULL values to some value (e.g., mean, median) $\rightarrow$ <code>df[columns].fillna(median, inplace=True)</code>

Another option is to use <code>sklearn.preprocessing.Imputer</code> as follows.

# Categorical data
For categorical data we implement 'one-hot encoding', which stores a binary vector with one entry for each possible value of a categorical field.

In [None]:
from sklearn.preprocessing import Imputer, LabelBinarizer, StandardScaler


class DataCleaner(object):
    """Fill missing values in the numerical columns of a dataframe.

    Columns listed in ``text_attributes`` are excluded from imputation and
    joined back, unchanged, to the imputed result.
    """

    def __init__(self, strategy='median', text_attributes=None):
        """
        :param str strategy: imputation strategy passed to the imputer
        :param text_attributes: columns not containing numerical values
        """
        self.imputer = Imputer(strategy=strategy)
        self.text_attributes = text_attributes

    def _numeric_part(self, df):
        """Return a new dataframe holding only the columns to be imputed."""
        if self.text_attributes is None:
            return df.copy()
        return df.drop(self.text_attributes, axis=1)

    def fit(self, df, y=None):
        """
        Fit the imputer on the non-text columns of ``df``.

        :param pandas dataframe df: the dataframe
        :param y: unused, accepted for consistency with transform/fit_transform
        :return self, so that calls can be chained
        """
        self.imputer.fit(self._numeric_part(df))
        return self

    def transform(self, df, y=None):
        """
        Impute the non-text columns of ``df`` with the fitted imputer.

        :param pandas dataframe df: the dataframe
        :param y: unused
        :return new dataframe with imputed columns followed by the text columns
        """
        numeric = self._numeric_part(df)
        # The imputer returns a bare array; restore column names and index.
        filled = pd.DataFrame(
            self.imputer.transform(numeric),
            columns=numeric.columns,
            index=numeric.index,
        )
        if self.text_attributes is not None:
            filled = filled.join(df[self.text_attributes].copy())
        return filled

    def fit_transform(self, df, y=None):
        """
        Fit the imputer on ``df`` and return the imputed dataframe.

        :param pandas dataframe df: the dataframe
        :param y: unused
        """
        self.fit(df)
        return self.transform(df)
    
    
class CategoricalData(object):
    """One-hot encode a single categorical column of a dataframe."""

    def __init__(self, text_attribute):
        """
        :param text_attribute: name of the categorical column to encode
        """
        self.encoder = LabelBinarizer()
        self.text_attribute = text_attribute

    def fit(self, df, y=None):
        """
        Learn the categories of the text attribute from ``df``.

        :param pandas dataframe df: the dataframe
        :param y: unused
        :return self, so that calls can be chained
        """
        self.encoder.fit(df[self.text_attribute].copy())
        return self

    def transform(self, df, y=None):
        """
        Replace the text attribute with one binary column per category.

        The categories learned by a previous ``fit`` are reused, so different
        dataframes get the same columns. If the encoder has not been fitted
        yet it is fitted on ``df`` first.

        :param pandas dataframe df: the dataframe
        :param y: see https://goo.gl/PeoVZ1
        :return new dataframe without the text attribute and with one column
            per category
        """
        data = df[self.text_attribute].copy()
        dfc = df.drop(self.text_attribute, axis=1)
        # Keep transform usable without an explicit fit, but never refit an
        # encoder that has already learned its categories.
        if not hasattr(self.encoder, 'classes_'):
            self.encoder.fit(data)
        hc = self.encoder.transform(data)
        classes = self.encoder.classes_
        if len(classes) == 2 and hc.shape[1] == 1:
            # With exactly two categories the binarizer emits a single column
            # flagging the second one; expand it to a full one-hot pair.
            dfc[classes[0]] = 1 - hc[:, 0]
            dfc[classes[1]] = hc[:, 0]
        else:
            for i, col in enumerate(classes):
                dfc[col] = hc[:, i]
        return dfc

    def fit_transform(self, df, y=None):
        """
        Learn the categories from ``df`` and return the encoded dataframe.

        :param pandas dataframe df: the dataframe
        :param y: see https://goo.gl/PeoVZ1
        """
        self.fit(df)
        return self.transform(df)
        
