In [1]:
import numpy as np
import random
import matplotlib.pyplot as plt
import pandas as pd
import math
import scipy
%matplotlib inline
# The bundled seaborn styles were renamed with a 'seaborn-v0_8' prefix in
# matplotlib 3.6 and the old names were removed afterwards, so pick whichever
# name this matplotlib installation actually provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')

In [2]:
import sklearn
import matplotlib
import sys
# (display name, imported module) pairs whose versions are reported below.
libraries = (
    ('Matplotlib', matplotlib),
    ('Numpy', np),
    ('Pandas', pd),
    ('Scipy', scipy),
    ('Sklearn', sklearn),
)

# Record the interpreter and library versions this notebook was run with.
print("Python Version:", sys.version, '\n')
print('\n'.join(
    '{0} Version: {1}'.format(label, module.__version__)
    for label, module in libraries
))

Python Version: 3.6.2 |Anaconda custom (64-bit)| (default, Sep 21 2017, 18:29:43) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)] 

Matplotlib Version: 2.0.2
Numpy Version: 1.12.1
Pandas Version: 0.20.3
Scipy Version: 0.19.1
Sklearn Version: 0.19.0


In [21]:
import numpy as np
from copy import copy


class standard_scaler:
    """
    Column-wise standardizer: learns each column's mean and standard
    deviation in `fit` and applies them to new data in `transform`.
    """

    def __init__(self, demean=True, dev_scale=True):
        """
        Standard Scaler demeans each column and converts
        each column to have a standard deviation of 1.
        ---
        KWargs:
        demean: whether to subtract the mean from each column
        dev_scale: whether to convert to unit variance
        """
        self.demean = demean
        self.dev_scale = dev_scale
        # Maps column index -> (mean, standard deviation) learned in fit().
        self.data_stats = {}
        # Number of columns seen in fit(); None until the scaler is fitted.
        self.number_of_columns = None

    def fit(self, X):
        """
        Learns about the input data and stores the mean and
        standard deviation of each column.
        ---
        In: X (features); np.array or pandas dataframe/series
        """
        X = self.convert_to_array(X)
        self.number_of_columns = X.shape[1]
        # Start from a clean slate so a re-fit never keeps stale columns.
        self.data_stats = {}

        for ix in range(self.number_of_columns):
            col = X[:, ix]
            self.data_stats[ix] = (np.mean(col), np.std(col))

    def transform(self, X):
        """
        Given the information learned about the training data,
        remove the mean and scale the new data as requested by
        the user. The input is never modified; a new array is returned.
        ---
        In: X (features); np.array or pandas dataframe/series
        Out: scaled copy of X as a 2D floating point np.array
        Raises: ValueError if the scaler has not been fit, or if X does
                not have the same number of columns as the fitted data
        """
        if self.number_of_columns is None:
            raise ValueError("standard_scaler must be fit before calling transform.")

        X = self.convert_to_array(X)
        if X.shape[1] != self.number_of_columns:
            raise ValueError(
                "X has {0} columns, but the scaler was fit on {1} columns.".format(
                    X.shape[1], self.number_of_columns
                )
            )

        # Work on a floating point copy: writing the scaled values back into
        # an integer array would silently truncate them. Existing float
        # dtypes are kept as they are.
        out_dtype = X.dtype if np.issubdtype(X.dtype, np.floating) else np.float64
        new_X = X.astype(out_dtype)  # astype returns a copy by default

        for ix in range(self.number_of_columns):
            col_mean, col_std = self.data_stats[ix]
            if self.demean:
                new_X[:, ix] -= col_mean
            if self.dev_scale:
                # A constant column has zero deviation; dividing by it would
                # produce NaN/inf, so such columns are left unscaled.
                new_X[:, ix] /= col_std if col_std != 0 else 1.0

        return new_X

    def fit_transform(self, X):
        """
        Learn from X and then return the transformed version
        of X for the user to use.
        ---
        In: X (features); np.array or pandas dataframe/series
        """
        self.fit(X)
        return self.transform(X)

    def pandas_to_numpy(self, x):
        """
        Checks if the input is a Dataframe or series, converts to numpy array for
        calculation purposes.
        ---
        Input: X (array, dataframe, or series)
        Output: X (array)
        """
        # isinstance also accepts subclasses and avoids building throwaway
        # objects; `.values` replaces `as_matrix()`, which newer pandas
        # versions no longer provide.
        if isinstance(x, (pd.DataFrame, pd.Series)):
            return x.values
        # Returns ndarrays unchanged and converts lists and other sequences.
        return np.asarray(x)

    def handle_1d_data(self, x):
        """
        Converts 1 dimensional data into a series of rows with 1 columns
        instead of 1 row with many columns.
        """
        if x.ndim == 1:
            x = x.reshape(-1, 1)
        return x

    def convert_to_array(self, x):
        """
        Takes in an input and converts it to a numpy array
        and then checks if it needs to be reshaped for us
        to use it properly
        """
        x = self.pandas_to_numpy(x)
        x = self.handle_1d_data(x)
        return x

In [22]:
from sklearn.datasets import load_iris

# Feature array of the iris dataset bundled with scikit-learn; used below as
# sample input for the scalers.
data = load_iris().data

In [23]:
# Custom scaler with its defaults: demean=True and dev_scale=True.
ss = standard_scaler()

In [25]:
# Learn the per-column mean and standard deviation of the iris features.
ss.fit(data)

In [26]:
# Show the first ten scaled values of column 0, to compare against the
# scikit-learn StandardScaler output in the following cell.
ss.transform(data)[:10,0]

array([-0.90068117, -1.14301691, -1.38535265, -1.50652052, -1.02184904,
       -0.53717756, -1.50652052, -1.02184904, -1.74885626, -1.14301691])

In [20]:
from sklearn.preprocessing import StandardScaler

# Reference implementation: fit scikit-learn's scaler on the same data and
# display the same slice so it can be compared with the custom scaler.
ss_sk = StandardScaler().fit(data)
ss_sk.transform(data)[:10, 0]

array([-0.90068117, -1.14301691, -1.38535265, -1.50652052, -1.02184904,
       -0.53717756, -1.50652052, -1.02184904, -1.74885626, -1.14301691])