In [1]:
import pandas as pd
import numpy as np
import math

# Delta degrees of freedom shared by the pandas control value and my_std
# (the divisor used is count - ddof).
ddof = 0

# Load the training data, discard the "Index" column, then discard every row
# that contains at least one missing value.
dataset = (
    pd.read_csv("datasets/dataset_train.csv")
    .drop("Index", axis=1)
    .dropna()
)

In [2]:
def warn_diff(feature, val_type, standard_val, local_val):
    """Print a one-line report about two values that disagree.

    Args:
        feature: Name of the feature (column) the values belong to.
        val_type: Name of the statistic being compared (e.g. "mean").
        standard_val: First numeric value, rendered with ``%f``.
        local_val: Second numeric value, rendered with ``%f``.

    Returns:
        None. The report is written to standard output.
    """
    fields = (feature, val_type, standard_val, local_val)
    print("DIFF FOR: [%s] [%s] [%f] [%f]" % fields)

In [3]:
def retrieve_first_value(values):
    """Return the first element of ``values`` that is not NaN.

    Args:
        values: Iterable of numbers (anything ``math.isnan`` accepts).

    Returns:
        The first non-NaN element, or ``None`` when the iterable is empty or
        holds only NaN.
    """
    # next() stops at the first match, just like a loop with an early break.
    return next((item for item in values if not math.isnan(item)), None)

In [4]:
def my_count(values):
    """Count the elements of ``values``.

    Every element is counted, whatever its value (NaN included).

    Args:
        values: Any iterable.

    Returns:
        The number of elements, as a float.
    """
    # Tally one per element, then convert so the result stays a float.
    return float(sum(1 for _ in values))

In [5]:
def my_sum(values):
    """Add up the elements of ``values`` from left to right.

    Args:
        values: Iterable of addable items (typically numbers).

    Returns:
        The accumulated total, starting from the integer 0 (so an empty
        iterable yields 0).
    """
    running_total = 0
    # Plain sequential accumulation: each term is added to the running total
    # in iteration order.
    for term in values:
        running_total += term
    return running_total

In [6]:
def my_mean(values):
    """Compute the arithmetic mean of ``values``.

    Args:
        values: Re-iterable collection of numbers; it is traversed once by
            ``my_sum`` and once by ``my_count``.

    Returns:
        The total from ``my_sum`` divided by the count from ``my_count``.
    """
    total = my_sum(values)
    how_many = my_count(values)
    return total / how_many

In [7]:
def my_std(values, mean, ddof=0):
    """Compute the standard deviation of ``values`` around a given mean.

    Args:
        values: Re-iterable collection of numbers.
        mean: The centre the deviations are measured from.
        ddof: Delta degrees of freedom; the divisor is ``count - ddof``.

    Returns:
        The square root of the summed squared deviations divided by
        ``my_count(values) - ddof``.
    """
    squared_deviation_sum = 0.0
    for item in values:
        squared_deviation_sum += abs(item - mean) ** 2
    divisor = my_count(values) - ddof
    return np.sqrt(squared_deviation_sum / divisor)

In [8]:
def my_min(values):
    """Find the smallest element of ``values``.

    Args:
        values: Re-iterable collection of numbers.

    Returns:
        The smallest element found, starting the search from the first
        non-NaN element reported by ``retrieve_first_value``.
    """
    smallest = retrieve_first_value(values)
    for candidate in values:
        # Keep the current best unless the candidate is strictly smaller.
        smallest = candidate if candidate < smallest else smallest
    return smallest

In [9]:
def my_max(values):
    """Find the largest element of ``values``.

    Args:
        values: Re-iterable collection of numbers.

    Returns:
        The largest element found, starting the search from the first
        non-NaN element reported by ``retrieve_first_value``.
    """
    largest = retrieve_first_value(values)
    for candidate in values:
        # Keep the current best unless the candidate is strictly larger.
        largest = candidate if candidate > largest else largest
    return largest

In [10]:
def my_quantile(values, quantile):
    """Return the nearest-rank quantile of a pandas Series.

    The values are sorted and the element at position
    ``round((n - 1) * quantile)`` is returned, which is the definition used
    by ``Series.quantile(..., interpolation='nearest')``. The previous
    ``int(n * quantile)`` index disagreed with that definition for some sizes
    (e.g. n=10, quantile=0.5) and ran past the end for ``quantile=1``.

    Args:
        values: pandas Series of comparable values. It is not modified.
            Missing values are not removed here; callers are expected to pass
            NaN-free data.
        quantile: Fraction in the closed interval [0, 1].

    Returns:
        The element of ``values`` found at the nearest rank.

    Raises:
        ValueError: If ``quantile`` is outside [0, 1] (or NaN), or if
            ``values`` is empty.
    """
    if not 0 <= quantile <= 1:
        raise ValueError("quantile must be between 0 and 1, got %r" % (quantile,))

    # sort_values() returns a sorted copy, so the caller's Series is untouched.
    ordered = values.sort_values()
    size = len(ordered)
    if size == 0:
        raise ValueError("cannot compute a quantile of an empty series")

    # round() resolves exact .5 ties to the even rank, the same tie rule numpy
    # applies for the 'nearest' method.
    position = int(round((size - 1) * quantile))
    return ordered.iloc[position]

In [11]:
# describe holds the hand-rolled statistics and control_values the pandas
# reference values, both keyed by feature name and then by statistic name.
describe = {}
control_values = {}

# (label, fraction) pairs for the quantiles reported between "min" and "max".
quantile_levels = (("25%", .25), ("50%", .5), ("75%", .75))

for feature in dataset:
    column = dataset[feature]
    # np.object was removed in NumPy 1.24, so the dtype is checked through
    # pandas instead; only numeric columns can be summarised.
    if not pd.api.types.is_numeric_dtype(column):
        continue

    local = describe[feature] = {}
    control = control_values[feature] = {}

    control["count"] = column.count()
    local["count"] = my_count(column)

    control["mean"] = column.mean()
    local["mean"] = my_mean(column)

    control["std"] = column.std(ddof=ddof)
    local["std"] = my_std(column, local["mean"], ddof=ddof)

    control["min"] = column.min()
    local["min"] = my_min(column)

    for label, fraction in quantile_levels:
        control[label] = column.quantile(fraction, interpolation='nearest')
        local[label] = my_quantile(column, fraction)

    control["max"] = column.max()
    local["max"] = my_max(column)

    # Report every statistic whose hand-rolled value is not close to the
    # pandas reference value.
    for value in control:
        if not np.isclose(local[value], control[value]):
            warn_diff(feature, value, local[value], control[value])
        

In [12]:
# Print each feature name followed by its statistics, one per line, with six
# decimal places.
for feature, stats in describe.items():
    print("Feature name: [%s]" % (feature))
    for value, number in stats.items():
        print("\t%s: [%0.6f]" % (value, number))

Feature name: [Arithmancy]
	count: [1251.000000]
	mean: [49453.109512]
	std: [16694.876442]
	min: [-24370.000000]
	25%: [38144.000000]
	50%: [48793.000000]
	75%: [60828.000000]
	max: [104956.000000]
Feature name: [Astronomy]
	count: [1251.000000]
	mean: [46.476449]
	std: [520.737840]
	min: [-966.740546]
	25%: [-485.396284]
	50%: [272.071636]
	75%: [528.448923]
	max: [1016.211940]
Feature name: [Herbology]
	count: [1251.000000]
	mean: [1.189457]
	std: [5.221049]
	min: [-10.295663]
	25%: [-4.261209]
	50%: [3.526427]
	75%: [5.465608]
	max: [10.296759]
Feature name: [Defense Against the Dark Arts]
	count: [1251.000000]
	mean: [-0.464764]
	std: [5.207378]
	min: [-10.162119]
	25%: [-5.284489]
	50%: [-2.720716]
	75%: [4.853963]
	max: [9.667405]
Feature name: [Divination]
	count: [1251.000000]
	mean: [3.213831]
	std: [4.109348]
	min: [-8.727000]
	25%: [3.103000]
	50%: [4.621000]
	75%: [5.729000]
	max: [10.032000]
Feature name: [Muggle Studies]
	count: [1251.000000]
	mean: [-222.903666]
	std: [