In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Exercise 1

In [None]:
%%time
## Unit test for Exercise 1

def testImputation(filename, nasymbol, incompletePoint, expectedCompletion):
    """Run the Exercise 1 checks and print one verdict per check.

    The function relies on `read_csv`, `getNearestNeighbor` and
    `readFrameAsMatrixImputed`, which must already be defined in the notebook.
    Only the first 10000 rows returned by `read_csv(filename, nasymbol)` are used.

    Checks, in order:
      1. Replacement: no cell may still contain `nasymbol` as a substring of
         its string representation.
      2. Nearest neighbor: `getNearestNeighbor(df.dropna(), incompletePoint)`
         must equal `expectedCompletion` position by position.
      3. Imputation (only run when check 2 passed): `readFrameAsMatrixImputed(df)`
         must be an `np.ndarray` without any missing entry.

    Parameters:
        filename: path handed to `read_csv`.
        nasymbol: marker string for missing values in the raw file.
        incompletePoint: the record whose nearest neighbor is looked up.
        expectedCompletion: sequence of values the nearest neighbor must have.

    Returns:
        None. All results are reported via `print`.
    """
    df = read_csv(filename, nasymbol)[:10000]

    # Columns in which at least one value still shows the missing-value marker.
    colsWithQuestionMark = [
        str(col)
        for col in df
        if any(nasymbol in str(val) for val in df[col])
    ]
    print("Replacement: " + ("ok" if len(colsWithQuestionMark) == 0 else (" FAILED. The following columns have a missing symbol \"" + str(nasymbol) + "\": " + str(colsWithQuestionMark))))

    # check imputed neighbor
    actNN = getNearestNeighbor(df.dropna(), incompletePoint)
    reqNN = expectedCompletion
    nnOK = True
    for index in range(len(reqNN)):
        req = reqNN[index]
        act = actNN[index]
        if req != act:
            print("Nearest neighbor does not coincide in position " + str(index) + ". Expected " + str(req) + " but saw " + str(act))
            nnOK = False
    if nnOK:
        print("Nearest neighbor correctly identified. Now checking conversion to matrix.")

        # now compute the entirely imputed matrix
        M = readFrameAsMatrixImputed(df)
        if type(M) != np.ndarray:
            print("Output is not an np-array but ", type(M))
        else:
            # A missing entry can show up in two forms: as a real NaN/None
            # (numeric or object arrays) or as the text "nan" (arrays that were
            # coerced to a string dtype). Comparing against the text alone
            # never matches a real NaN, so both forms are checked.
            missing = pd.isna(M) | (M.astype(str) == str(np.nan))
            hasNanEntries = bool(missing.any())
            print("Imputation: " + ("ok" if not hasNanEntries else "failed. There are nan-entries"))

## Take the record at row position 4 of the raw file as the incomplete point
## and verify that the expected nearest neighbor is found for it.
filename = "credits.csv"
incompletePoint = pd.read_csv(filename).iloc[4]
print(incompletePoint)
reqNN = [
    27,
    ' Private',
    103524,
    " HS-grad",
    9,
    " Never-married",
    " Handlers-cleaners",
    " Unmarried",
    " White",
    " Male",
    0,
    0,
    35,
    " United-States",
    " <=50K",
]
testImputation(filename, " ?", incompletePoint, reqNN)

# Exercise 2

In [15]:
def testOutlierRemoval(df, expected_rows):
    """Check `removeOutliers` for k = 0.5, 1 and 2 and print the verdicts.

    For every k the frame returned by `removeOutliers(df, k)` is checked twice:
    its row count must equal `expected_rows[str(k)]`, and in every numeric
    column the largest / smallest remaining value must lie within
    [q25 - k * IQR, q75 + k * IQR], with the quartiles taken from the reduced
    frame itself.

    Parameters:
        df: the frame to clean.
        expected_rows: dict mapping str(k) to the expected number of rows.

    Returns:
        None. All results are reported via `print`.
    """
    numeric_columns = df.select_dtypes(include=np.number).columns.tolist()

    for k in (0.5, 1, 2):
        print("k = " + str(k))
        dfReduced = removeOutliers(df, k)

        seen = len(dfReduced)
        wanted = expected_rows[str(k)]
        if seen == wanted:
            size_verdict = "OK"
        else:
            size_verdict = "FAIL. Expected " + str(wanted) + " but saw " + str(seen)
        print("Outcome size: " + size_verdict)

        for col in numeric_columns:
            vals = dfReduced[col].values
            q75, q25 = np.percentile(vals, [75, 25])
            margin = k * (q75 - q25)
            upper_ok = max(vals) <= q75 + margin
            lower_ok = min(vals) >= q25 - margin
            print("max of " + col + ": " + ("OK" if upper_ok else "FAIL"))
            print("min of " + col + ": " + ("OK" if lower_ok else "FAIL"))

# Expected row counts are keyed by str(k), one entry per tested multiplier.
testOutlierRemoval(
    df=pd.read_csv("credits.csv"),
    expected_rows={"0.5": 901, "1": 14781, "2": 18803},
)

k = 0.5
Outcome size: OK
max of age: OK
min of age: OK
max of fnlwgt: OK
min of fnlwgt: OK
max of education-num: OK
min of education-num: OK
max of capital-gain: OK
min of capital-gain: OK
max of capital-loss: OK
min of capital-loss: OK
max of hours-per-week: OK
min of hours-per-week: OK
k = 1
Outcome size: OK
max of age: OK
min of age: OK
max of fnlwgt: OK
min of fnlwgt: OK
max of education-num: OK
min of education-num: OK
max of capital-gain: OK
min of capital-gain: OK
max of capital-loss: OK
min of capital-loss: OK
max of hours-per-week: OK
min of hours-per-week: OK
k = 2
Outcome size: FAIL. Expected 18803 but saw 18806
max of age: OK
min of age: OK
max of fnlwgt: OK
min of fnlwgt: OK
max of education-num: OK
min of education-num: OK
max of capital-gain: OK
min of capital-gain: OK
max of capital-loss: OK
min of capital-loss: OK
max of hours-per-week: OK
min of hours-per-week: OK


In [2]:
def getNumericalAttributes(df):
    """Return the labels of the numeric columns of `df`, in column order.

    Columns are selected positively by numeric dtype (`np.number`). Merely
    excluding `object` would also let boolean, datetime or categorical columns
    through, and percentile / IQR arithmetic is not meaningful on those.

    Parameters:
        df: a pandas DataFrame.

    Returns:
        list of column labels whose dtype is numeric.
    """
    return list(df.select_dtypes(include=np.number).columns)

In [3]:
def getNumericalAttributes(df):
    """Return the labels of the numeric columns of `df`, in column order.

    Columns are selected positively by numeric dtype (`np.number`). Merely
    excluding `object` would also let boolean, datetime or categorical columns
    through, and percentile / IQR arithmetic is not meaningful on those.

    Parameters:
        df: a pandas DataFrame.

    Returns:
        list of column labels whose dtype is numeric.
    """
    return list(df.select_dtypes(include=np.number).columns)

def createBoxPlots(D):
    """Draw one row of box plots per numerical attribute of `D`.

    For every attribute returned by `getNumericalAttributes(D)` a new figure is
    created with one subplot per IQR multiplier (0.25, 0.5, 1, 1.5, 2). Each
    subplot shows the same column; the multiplier is passed to `boxplot` as
    `whis` and is repeated in the x-axis label.

    Parameters:
        D: a pandas DataFrame.

    Returns:
        None. The figures are created but neither shown nor closed here.
    """
    iqrMultipliers = [0.25, 0.5, 1, 1.5, 2]

    for att in getNumericalAttributes(D):
        col = D[att]  # same data for every subplot of this figure
        # One subplot per multiplier, so the grid always matches the list.
        fig, axes = plt.subplots(1, len(iqrMultipliers), figsize=(20, 5))
        for ax, whis in zip(axes, iqrMultipliers):
            ax.set_title(att)
            ax.yaxis.set_ticks_position('none')
            ax.grid(color='grey', axis='y', linestyle='-', linewidth=0.25, alpha=0.5)
            ax.set_xlabel('whisker at ' + str(whis))
            ax.boxplot(col, whis=whis)
        # Lay out once, after all subplots of the figure have been drawn.
        fig.tight_layout()

In [14]:
def removeOutliers(D, k):
    """Repeatedly drop rows that are outliers in any numerical attribute.

    In each round the quartiles q1 / q3 of every attribute returned by
    `getNumericalAttributes` are computed on the current frame, and only rows
    whose values lie within [q1 - k * IQR, q3 + k * IQR] in all of those
    attributes are kept. Because removing rows changes the quartiles, the
    round is repeated until a round removes nothing.

    Parameters:
        D: a pandas DataFrame.
        k: multiplier of the inter-quartile range that defines the fences.

    Returns:
        The reduced frame with a fresh 0..n-1 index. If `D` has no rows, `D`
        itself is returned unchanged.

    NOTE(review): `np.percentile` returns NaN for a column containing NaN, so
    every comparison is False and all rows are dropped. Confirm that inputs
    are NaN-free or impute beforehand.
    """
    current = D
    # A loop instead of self-recursion: the number of rounds depends on the
    # data and must not be limited by the interpreter's recursion depth.
    while True:
        n_rows = current.shape[0]
        if n_rows == 0:
            return current

        keep = np.ones(n_rows, dtype=bool)
        for att in getNumericalAttributes(current):
            values = np.asarray(current[att])
            q1, q3 = np.percentile(values, [25, 75])
            iqr = q3 - q1
            keep &= (values <= q3 + k * iqr) & (values >= q1 - k * iqr)

        reduced = pd.DataFrame(current[keep]).reset_index(drop=True)
        if reduced.shape[0] == n_rows:
            # Nothing was removed in this round: the result is stable.
            return reduced
        current = reduced

# Exercise 3

In [None]:
def testCorrelationRemoval(df, tau, expected_cols):
    """Check `eliminateCorrelations` on the numeric part of `df` and print verdicts.

    The int16/32/64 and float16/32/64 columns of `df` are passed as a matrix to
    `eliminateCorrelations(matrix, tau)`, which must already be defined in the
    notebook. Two checks are printed:
      * Filter Success: every pairwise absolute correlation between the
        remaining columns must be below `tau`.
      * Shape: the row count must be unchanged and the number of columns must
        equal `expected_cols`.

    Parameters:
        df: a pandas DataFrame.
        tau: correlation threshold.
        expected_cols: number of columns expected to remain.

    Returns:
        None. All results are reported via `print`.
    """
    dfNumeric = df.select_dtypes(include=['int16', 'int32', 'int64', 'float16', 'float32', 'float64'])
    DReduced = eliminateCorrelations(dfNumeric.values, tau)

    # np.corrcoef collapses to a 0-d value when only one column is left;
    # atleast_2d keeps the indexing below valid in that case.
    cors = np.atleast_2d(np.abs(np.corrcoef(DReduced, rowvar=False)))
    cors = [cors[i, j] for i in range(len(cors)) for j in range(i)]

    # With fewer than two columns there is no pair left that could violate tau.
    filterOK = len(cors) == 0 or max(cors) < tau
    print("Filter Success: " + ("OK" if filterOK else "FAIL. There is a pair with correlation " + str(max(cors)) + " > " + str(tau)))

    shapeOK = len(DReduced) == len(dfNumeric) and DReduced.shape[1] == expected_cols
    print("Shape: " + ("OK" if shapeOK else "FAIL"))

# Run the correlation filter check on the credits data with threshold 0.1.
testCorrelationRemoval(
    df=pd.read_csv("credits.csv"),
    tau=0.1,
    expected_cols=5,
)