In [3]:
import pandas as pd
import numpy as np

In [5]:
# Insert the functions create_normalization and apply_normalization below (after the comments)
#
# Input to create_normalization:
# df: a dataframe (where the column names "CLASS" and "ID" have special meaning)
# normalizationtype: "minmax" (default) or "zscore"
#
# Output from create_normalization:
# df: a new dataframe, where each numeric value in a column has been replaced by a normalized value
# normalization: a mapping (dictionary) from each column name to a triple, consisting of
#                ("minmax",min_value,max_value) or ("zscore",mean,std)
#
# Hint 1: First copy the input dataframe and modify the copy (the input dataframe should be kept unchanged)
# Hint 2: Consider columns of type "float" or "int" only (and which are not labeled "CLASS" or "ID"),
#         the other columns should remain unchanged
# Hint 3: Take a close look at the lecture slides on data preparation
#
# Input to apply_normalization:
# df: a dataframe
# normalization: a mapping (dictionary) from column names to triples (see above)
#
# Output from apply_normalization:
# df: a new dataframe, where each numerical value has been normalized according to the mapping
#
# Hint 1: First copy the input dataframe and modify the copy (the input dataframe should be kept unchanged)
# Hint 2: For minmax-normalization, you may consider to limit the output range to [0,1]

def create_normalization(df, normalizationtype="minmax"):
    normalized_df = df.copy()    
    columns = normalized_df.columns
    #Col_Df = normalized_df.dtypes
    
    # create a dictionary to save the normalization data
    normalization = {}

    for col in columns:
        if (col != 'CLASS' and col != 'ID') and (normalized_df[col].dtype == 'float64' or normalized_df[col].dtype == 'int64'):
            if normalizationtype == 'minmax':
                min = normalized_df[col].min()
                max = normalized_df[col].max()
                normalization[col] = ('minmax', min, max)
                normalized_df[col] = [(x-min)/(max-min) for x in normalized_df[col]]
            
            elif normalizationtype == 'zscore':            
                mean = normalized_df[col].mean()
                std = normalized_df[col].std()
                normalization[col] = ('zscore', mean, std)
                normalized_df[col] = normalized_df[col].apply(lambda x: (x-mean)/std)
    
    return normalized_df, normalization

def apply_normalization(df, normalization):
    normalized_df = df.copy()  
    columns = normalized_df.columns
    Col_Df = normalized_df.dtypes
    col_num = 0
    normalizationtype = normalization['RI'][0]
    
    for col in columns:
        if (col != 'CLASS' and col != 'ID') and (normalized_df[col].dtype == 'float64' or normalized_df[col].dtype == 'int64'):
            if normalizationtype == 'minmax':
                min = normalization[col][1]
                max = normalization[col][2]
                normalization[col] = ('minmax', min, max)
                normalized_df[col] = [(x-min)/(max-min) for x in normalized_df[col]]
            
            elif normalizationtype == 'zscore':
                mean = normalization[col][1]
                std = normalization[col][2]
                normalization[col] = ('zscore', mean, std)
                normalized_df[col] = [(x-min)/(max-min) for x in normalized_df[col]]        
        
    return normalized_df

In [6]:
glass_train_df = pd.read_csv("glass_train.txt")

glass_test_df = pd.read_csv("glass_test.txt")

glass_train_norm, normalization = create_normalization(glass_train_df,normalizationtype="minmax")

print("normalization:\n")
for f in normalization:
    print("{}:{}".format(f,normalization[f]))
    
glass_test_norm = apply_normalization(glass_test_df,normalization)
print("\nglass_test_norm:\n")

'''
Col_Df = glass_test_norm.dtypes

print(isinstance(Col_Df[1], object))
print(glass_test_norm[columns[1]].dtype)

#### below basic dtype check didn't work properly when I checked for int and float so I had to check as above
print(isinstance(glass_test_norm[columns[1]], float))
print(glass_test_norm[columns[1]].dtype == 'float64')

print(Col_Df)
'''

# set below options if you want to see all data in dataframe, otherwise jupyter shows some of it from the top and the bottom
#pd.set_option('display.max_columns', None)  # or 1000
#pd.set_option('display.max_rows', None)
glass_test_norm
# i.e. by default usage, you can't see below rows. you see them only after the above settings or if you specifically call them
#glass_test_norm.iloc[30:77,:]

normalization:

RI:('minmax', 1.51131, 1.53125)
Na:('minmax', 10.73, 15.79)
Mg:('minmax', 0.0, 4.49)
Al:('minmax', 0.29, 3.04)
Si:('minmax', 69.81, 75.18)
K:('minmax', 0.0, 6.21)
Ca:('minmax', 5.43, 14.68)
Ba:('minmax', 0.0, 3.15)
Fe:('minmax', 0.0, 0.37)

glass_test_norm:



Unnamed: 0,ID,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,CLASS
0,101,0.262788,0.399209,0.634744,0.418182,0.644320,0.091787,0.363243,0.034921,0.594595,2
1,104,0.799398,0.606719,0.701559,0.134545,0.141527,0.012882,0.671351,0.000000,0.000000,2
2,44,0.541123,0.592885,0.855234,0.156364,0.363128,0.027375,0.465946,0.000000,0.000000,1
3,17,0.327482,0.385375,0.817372,0.316364,0.614525,0.098229,0.353514,0.000000,0.000000,1
4,81,0.231194,0.420949,0.783964,0.665455,0.530726,0.111111,0.274595,0.000000,0.000000,2
5,142,0.361083,0.488142,0.808463,0.283636,0.562384,0.091787,0.322162,0.028571,0.459459,2
6,120,0.261284,0.559289,0.795100,0.429091,0.491620,0.103060,0.273514,0.000000,0.000000,2
7,123,0.278837,0.494071,0.788419,0.432727,0.564246,0.090177,0.288649,0.000000,0.000000,2
8,133,0.342026,0.533597,0.886414,0.323636,0.499069,0.093398,0.294054,0.000000,0.000000,2
9,185,-0.008024,1.314229,0.000000,0.018182,1.042831,0.000000,0.131892,0.000000,0.000000,6


In [7]:
# Insert the functions create_imputation and apply_imputation below (after the comments)
#
# Input to create_imputation:
# df: a dataframe (where the column names "CLASS" and "ID" have special meaning)
#
# Output from create_imputation:
# df: a new dataframe, where each missing numeric value in a column has been replaced by the mean of that column 
#     and each missing categoric value in a column has been replaced by the mode of that column
# imputation: a mapping (dictionary) from column name to value that has replaced missing values
#
# Hint 1: First copy the input dataframe and modify the copy (the input dataframe should be kept unchanged)
# Hint 2: Handle columns of type "float" or "int" only (and which are not labeled "CLASS" or "ID") in one way
#         and columns of type "object" and "category" in other ways
# Hint 3: Consider using the pandas functions mean() and mode() respectively, as well as fillna
# Hint 4: In the rare case of all values in a column being missing, replace numeric values with 0,
#         object values with "" and category values with the first category (cat.categories[0])  
#
# Input to apply_imputation:
# df: a dataframe
# imputation: a mapping (dictionary) from column name to value that should replace missing values
#
# Output from apply_imputation:
# df: a new dataframe, where each missing value has been replaced according to the mapping
#
# Hint 1: First copy the input dataframe and modify the copy (the input dataframe should be kept unchanged)
# Hint 2: Consider using fillna

def create_imputation(df):
    replaced_df = df.copy()
    columns = replaced_df.columns

    # create a dictionary to save the normalization data
    imputation = {}
    
    for col in columns:
    #while col_num < len(columns):        
        if col != 'CLASS' and col != 'ID':
            if (replaced_df[col].dtype == 'float64' or replaced_df[col].dtype == 'int64'):
                if np.isnan(replaced_df[col].values).all():
                    replaced_df[col].fillna(0, inplace=True)
                    imputation[col] = 0
                else:
                    replaced_df[col].fillna(replaced_df[col].mean(), inplace=True)
                    imputation[col] = replaced_df[col].mean()
            elif replaced_df[col].dtype == 'object':
                #if np.isnan(replaced_df[columns[col_num]].values).all():
                if pd.isnull(replaced_df[col].values).all():
                    replaced_df[col].fillna("", inplace=True)
                    imputation[col] = ""
                else:
                    replaced_df[col].fillna(replaced_df[col].mode()[0], inplace=True)
                    imputation[col] = replaced_df[col].mode()[0]
            elif replaced_df[co].dtype == 'category':
                if np.isnan(replaced_df[col].values).all():
                    replaced_df[col].fillna(replaced_df[col].cat.categories[0], inplace=True)
                    imputation[col] = replaced_df[col].cat.categories[0]
                else:
                    replaced_df[col].fillna(replaced_df[col].mode()[0], inplace=True)
                    imputation[col] = replaced_df[col].mode()[0]
        
    return replaced_df, imputation

def apply_imputation(df, imputation):
    replaced_df = df.copy()
    columns = replaced_df.columns
    #Col_Df = replaced_df.dtypes
    # create a dictionary to save the normalization data
        
    for col in columns:
        if col != 'CLASS' and col != 'ID':
            replaced_df[col].fillna(imputation[col], inplace=True)
        
    return replaced_df

In [8]:
# a code snippet to check if isnan(df).all() or isnan(df[column_od].values).all() works fine

df_test = pd.DataFrame(np.full((10,10), np.nan))

df_test.iloc[0:3, 0:2] = 1

for i in range (len(df_test.columns)):
    print('Column-{} has all values NaN?: {}'.format(i, np.isnan(df_test[i].values).all()))

#print (df_test['1'])  # fails
#print (df_test[1])  # works
#print (df_test)

anneal_train_df = pd.read_csv("anneal_train.txt")
anneal_test_df = pd.read_csv("anneal_test.txt")

anneal_train_df

Column-0 has all values NaN?: False
Column-1 has all values NaN?: False
Column-2 has all values NaN?: True
Column-3 has all values NaN?: True
Column-4 has all values NaN?: True
Column-5 has all values NaN?: True
Column-6 has all values NaN?: True
Column-7 has all values NaN?: True
Column-8 has all values NaN?: True
Column-9 has all values NaN?: True


Unnamed: 0,family,product-type,steel,carbon,hardness,temper_rolling,condition,formability,strength,non-ageing,...,s,p,shape,thick,width,len,oil,bore,packing,CLASS
0,,C,A,0,0,,S,2.0,0,,...,,,SHEET,0.999,1220.0,4880,,0,,3
1,,C,,0,0,,A,2.0,0,,...,,,SHEET,0.700,1320.0,4880,,0,,3
2,TN,C,A,0,0,,,3.0,0,N,...,,,SHEET,0.500,1220.0,4880,,0,,5
3,ZS,C,A,0,50,T,,,0,,...,,,SHEET,0.451,1250.0,762,,0,,3
4,,C,A,0,85,T,,,0,,...,,,COIL,4.000,1000.0,0,,600,,U
5,,C,R,0,0,,,,500,,...,,,COIL,1.600,1320.0,0,,0,,3
6,,C,R,6,0,T,,,0,,...,,,COIL,0.500,25.0,0,,0,,3
7,,C,A,0,0,,S,3.0,0,N,...,,,SHEET,0.750,610.0,4880,,0,,3
8,,C,A,0,0,,S,2.0,0,,...,,,COIL,0.300,1320.0,0,,0,,3
9,,C,K,45,0,,,,0,,...,,,COIL,0.600,900.0,0,,0,,3


In [9]:
# Test your code (leave this part unchanged)

anneal_train_df = pd.read_csv("anneal_train.txt")
anneal_test_df = pd.read_csv("anneal_test.txt")

anneal_train_imp, imputation = create_imputation(anneal_train_df)
anneal_test_imp = apply_imputation(anneal_test_df, imputation)

print("Imputation:\n")
for f in imputation:
    print("{}:{}".format(f,imputation[f]))
    
print("\nNo. of replaced missing values in training data:\n{}".format(anneal_train_imp.count()-anneal_train_df.count()))
print("\nNo. of replaced missing values in test data:\n{}".format(anneal_test_imp.count()-anneal_test_df.count()))

Imputation:

family:TN
product-type:C
steel:A
carbon:3.859688195991091
hardness:13.084632516703786
temper_rolling:T
condition:S
formability:2.2517482517482557
strength:26.302895322939868
non-ageing:N
surface-finish:P
surface-quality:E
enamelability:1.7142857142857018
bc:Y
bf:Y
bt:Y
bw/me:B
bl:Y
m:0
chrom:C
phos:P
cbond:Y
marvi:0
exptl:0
ferro:Y
corr:0
blue-bright-varn-clean:B
lustre:Y
jurofm:0
s:0
p:0
shape:SHEET
thick:1.1911937639198227
width:769.4917594654789
len:1229.293986636971
oil:Y
bore:35.18930957683742
packing:3.0

No. of replaced missing values in training data:
family                    382
product-type                0
steel                      43
carbon                      0
hardness                    0
temper_rolling            374
condition                 160
formability               163
strength                    0
non-ageing                391
surface-finish            444
surface-quality           128
enamelability             442
bc                        448
b

In [10]:
# Insert the functions create_bins and apply_bins below
#
# Input to create_bins:
# df: a dataframe
# nobins: no. of bins (default = 10)
# bintype: either "equal-width" (default) or "equal-size" 
#
# Output from create_bins:
# df: a new dataframe, where each numeric feature value has been replaced by a categoric (corresponding to some bin)
# binning: a mapping (dictionary) from column name to bins (threshold values for the bin)
#
# Hint 1: First copy the input dataframe and modify the copy (the input dataframe should be kept unchanged)
# Hint 2: Discretize columns of type "float" or "int" only (and which are not labeled "CLASS" or "ID")
# Hint 3: Consider using pd.cut and pd.qcut respectively, with labels=False, retbins=True and duplicates="drop"
#         (the last option will avoid errors when not enough bins can be created)
# Hint 4: Set all columns in the new dataframe to be of type "category"
# Hint 5: Set the categories of the discretized features to be [0,...,nobins-1]
# Hint 6: Change the first and the last element of each binning to -np.inf and np.inf respectively 
#
# Input to apply_bins:
# df: a dataframe
# binning: a mapping (dictionary) from column name to bins (threshold values for the bin)
#
# Output from apply_bins:
# df: a new dataframe, where each numeric feature value has been replaced by a categoric (corresponding to some bin)
#
# Hint 1: First copy the input dataframe and modify the copy (the input dataframe should be kept unchanged)
# Hint 2: Consider using pd.cut 
# Hint 3: Set all columns in the new dataframe to be of type "category"
# Hint 4: Set the categories of the discretized features to be [0,...,nobins-1]
#

def create_bins(df, nobins = 10, bintype = "equal-width"):
    cat_df = df.copy()
    columns = cat_df.columns
    
    binning = {}
    
    for col in columns:
        if (col != 'CLASS' and col != 'ID') and (cat_df[col].dtype == 'float64' or cat_df[col].dtype == 'int64'):
            res, bins = pd.cut(cat_df[col], nobins, bintype, retbins=True, duplicates="drop", labels=False)
            bins = np.append(-np.inf, bins)
            bins = np.append(bins, np.inf)
            binning[col] = bins            
            #cat_df = cat_df.astype({col: category})  # should work, too
            cat_df[col] = cat_df[col].astype("category")
            cat_df[col] = cat_df[col].cat.set_categories(bins)
            cat_df[col] = pd.cut(cat_df[col], binning[col])
    
    return cat_df, binning

def apply_bins(df, binning):
    cat_df = df.copy()
    columns = cat_df.columns
    
    for col in columns:
        if (col != 'CLASS' and col != 'ID') and (cat_df[col].dtype == 'float64' or cat_df[col].dtype == 'int64'):
            cat_df[col] = pd.cut(cat_df[col], binning[col])
    return cat_df

In [11]:
# Test your code  (leave this part unchanged)

glass_train_df = pd.read_csv("glass_train.txt")

glass_test_df = pd.read_csv("glass_test.txt")

## WARNING: I believe there is something wrong in the provided output or the question is missing something because 
## bins should have 11 emelents each but RI has 9, Ba has 1, Fe has 3 and so on
glass_train_disc, binning = create_bins(glass_train_df,nobins=10,bintype="equal-size")
print("binning:\n")
for f in binning:
    print("{}:{}".format(f,binning[f]))
    
## WARNING: The ouput for the test also doesn't look proper in the question provided
glass_test_disc = apply_bins(glass_test_df,binning)
print("\nglass_test_disc:\n")
glass_test_disc


binning:

RI:[      -inf 1.51129006 1.513304   1.515298   1.517292   1.519286
 1.52128    1.523274   1.525268   1.527262   1.529256   1.53125
        inf]
Na:[    -inf 10.72494 11.236   11.742   12.248   12.754   13.26    13.766
 14.272   14.778   15.284   15.79         inf]
Mg:[    -inf -0.00449  0.449    0.898    1.347    1.796    2.245    2.694
  3.143    3.592    4.041    4.49         inf]
Al:[   -inf 0.28725 0.565   0.84    1.115   1.39    1.665   1.94    2.215
 2.49    2.765   3.04        inf]
Si:[    -inf 69.80463 70.347   70.884   71.421   71.958   72.495   73.032
 73.569   74.106   74.643   75.18         inf]
K:[    -inf -0.00621  0.621    1.242    1.863    2.484    3.105    3.726
  4.347    4.968    5.589    6.21         inf]
Ca:[    -inf  5.42075  6.355    7.28     8.205    9.13    10.055   10.98
 11.905   12.83    13.755   14.68         inf]
Ba:[    -inf -0.00315  0.315    0.63     0.945    1.26     1.575    1.89
  2.205    2.52     2.835    3.15         inf]
Fe:[    -inf -

Unnamed: 0,ID,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,CLASS
0,101,"(1.515, 1.517]","(12.248, 12.754]","(2.694, 3.143]","(1.39, 1.665]","(73.032, 73.569]","(-0.00621, 0.621]","(8.205, 9.13]","(-0.00315, 0.315]","(0.185, 0.222]",2
1,104,"(1.525, 1.527]","(13.766, 14.272]","(3.143, 3.592]","(0.565, 0.84]","(70.347, 70.884]","(-0.00621, 0.621]","(10.98, 11.905]","(-0.00315, 0.315]","(-0.00037, 0.037]",2
2,44,"(1.521, 1.523]","(13.26, 13.766]","(3.592, 4.041]","(0.565, 0.84]","(71.421, 71.958]","(-0.00621, 0.621]","(9.13, 10.055]","(-0.00315, 0.315]","(-0.00037, 0.037]",1
3,17,"(1.517, 1.519]","(12.248, 12.754]","(3.592, 4.041]","(1.115, 1.39]","(73.032, 73.569]","(-0.00621, 0.621]","(8.205, 9.13]","(-0.00315, 0.315]","(-0.00037, 0.037]",1
4,81,"(1.515, 1.517]","(12.754, 13.26]","(3.143, 3.592]","(1.94, 2.215]","(72.495, 73.032]","(0.621, 1.242]","(7.28, 8.205]","(-0.00315, 0.315]","(-0.00037, 0.037]",2
5,142,"(1.517, 1.519]","(12.754, 13.26]","(3.592, 4.041]","(0.84, 1.115]","(72.495, 73.032]","(-0.00621, 0.621]","(8.205, 9.13]","(-0.00315, 0.315]","(0.148, 0.185]",2
6,120,"(1.515, 1.517]","(13.26, 13.766]","(3.143, 3.592]","(1.39, 1.665]","(71.958, 72.495]","(0.621, 1.242]","(7.28, 8.205]","(-0.00315, 0.315]","(-0.00037, 0.037]",2
7,123,"(1.515, 1.517]","(12.754, 13.26]","(3.143, 3.592]","(1.39, 1.665]","(72.495, 73.032]","(-0.00621, 0.621]","(7.28, 8.205]","(-0.00315, 0.315]","(-0.00037, 0.037]",2
8,133,"(1.517, 1.519]","(13.26, 13.766]","(3.592, 4.041]","(1.115, 1.39]","(71.958, 72.495]","(-0.00621, 0.621]","(7.28, 8.205]","(-0.00315, 0.315]","(-0.00037, 0.037]",2
9,185,"(-inf, 1.511]","(15.79, inf]","(-0.00449, 0.449]","(0.287, 0.565]","(75.18, inf]","(-0.00621, 0.621]","(6.355, 7.28]","(-0.00315, 0.315]","(-0.00037, 0.037]",6


In [1]:
# Insert the function split below
#
# Input to split:
# df: a dataframe
# testfraction: a float in the range (0,1) (default = 0.5)
#
# Output from split:
# trainingdf: a dataframe consisting of a random sample of (1-testfraction) of the rows in df
# testdf: a dataframe consisting of the rows in df that are not included in trainingdf
#
# Hint: You may use np.random.permutation(df.index) to get a permuted list of indexes where a 
#       prefix corresponds to the test instances, and the suffix to the training instances 
import math

def split(df, testfraction = 0.5):
    copy_df = df.copy()
    # NOTE: Shuffling can be achived as below
    #copy_df = copy_df.iloc[np.random.permutation(len(df))]   # both this one and the below work
    #copy_df = copy_df.iloc[np.random.permutation(df.index)]
    #testdf = df.iloc[:int(len(copy_df)*testfraction), :]s
    #trainingdf = df.iloc[int(len(copy_df)*testfraction):, :]    
    testdf = copy_df.iloc[:math.ceil(len(df)*testfraction), :]
    trainingdf = copy_df.iloc[math.ceil(len(df)*testfraction):, :]
    return trainingdf, testdf

In [4]:
# Test your code  (leave this part unchanged)

glass_df = pd.read_csv("glass.txt")

glass_train, glass_test = split(glass_df,testfraction=0.25)

print("Training IDs:\n{}".format(glass_train["ID"].values))

print("\nTest IDs:\n{}".format(glass_test["ID"].values))

print("\nOverlap: {}".format(set(glass_train["ID"]).intersection(set(glass_test["ID"]))))

Training IDs:
[ 55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108
 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214]

Test IDs:
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
 49 50 51 52 53 54]

Overlap: set()


In [60]:
# Insert the function accuracy below
#
# Input to accuracy:
# df: a dataframe with class labels as column names and each row corresponding to
#     a prediction with estimated probabilities for each class
# correctlabels: an array (or list) of the correct class label for each prediction
#                (the number of correct labels must equal the number of rows in df)
#
# Output from accuracy:
# accuracy: the fraction of cases for which the predicted class label coincides with the correct label
#
# Hint: In case the label receiving the highest probability is not unique, you may
#       resolve that by picking the first (as ordered by the column names) or 
#       by randomly selecting one of the labels with highest probaility.

def accuracy(df, correctlabels):
    max_vals = predictions.max(axis=1)
    predict_max = np.zeros(5, dtype=str)
    correct_prediction = 0
    
    for i in range(len(predictions)):    
        for col in predictions.columns:
            if predictions[col].values[i] == max_vals[i]:
                predict_max[i] = col
                if col == correctlabels[i]:
                    correct_prediction += 1                
                break
                
    return correct_prediction/len(predictions)
    
    

In [56]:
predictions = pd.DataFrame({"A":[0.5,0.5,0.5,0.25,0.25],"B":[0.5,0.25,0.25,0.5,0.25],"C":[0.0,0.25,0.25,0.25,0.5]})
predictions

Unnamed: 0,A,B,C
0,0.5,0.5,0.0
1,0.5,0.25,0.25
2,0.5,0.25,0.25
3,0.25,0.5,0.25
4,0.25,0.25,0.5


In [62]:
correctlabels = ["B","A","B","B","C"]

accuracy(predictions, correctlabels) # Note that depending on how ties are resolved the accuracy may be 0.6 or 0.8

0.6

In [120]:
# Insert the functions create_one_hot and apply_one_hot below
#
# Input to create_one_hot:
# df: a dataframe
#
# Output from create_one_hot:
# df: a new dataframe, where each categoric feature has been replaced by a set of binary features 
#    (as many new features as there are possible values)
# one_hot: a mapping (dictionary) from column name to a set of categories (possible values for the feature)
#
# Hint 1: First copy the input dataframe and modify the copy (the input dataframe should be kept unchanged)
# Hint 2: Consider columns of type "object" or "category" only (and which are not labeled "CLASS" or "ID")
# Hint 3: Consider creating new column names by merging the original column name and the categorical value
# Hint 4: Set all new columns to be of type "float"
# Hint 5: Do not forget to remove the original categoric feature
#
# Input to apply_one_hot:
# df: a dataframe
# one_hot: a mapping (dictionary) from column name to categories
#
# Output from apply_one_hot:
# df: a new dataframe, where each categoric feature has been replaced by a set of binary features
#
# Hint: See the above Hints


def create_one_hot(df):
    copy_df = df.copy()
    categories = {} # one_hot
    ## NOTE: I considered each column/feature can have different category groups but it seem like they all have
    ## the same groups. If we consider all wiil have the same categories, it would be easier.
    
    columns = copy_df.columns
    new_columns = list() # new_columns = []
    new_columns.append('CLASS')
    
    for col in columns:
        if (col != 'CLASS' and col != 'ID') and (copy_df[col].dtype == 'object' or copy_df[col].dtype == 'category'):
            #binary_df.rename(columns={col:col+, 'lifeExp':'life_exp', 'gdpPercap':'gdp_per_cap'}, inplace=True)
            copy_df[col] = copy_df[col].astype("category")
            categories[col] = copy_df[col].cat.categories
            for i in range(len(categories[col])):
                new_columns.append(col + '-' + categories[col][i])
    
    data_temp = np.zeros((len(copy_df), len(new_columns)) ,float)
    #binX_df = pd.DataFrame(xx, columns = new_columns, index=train_df.index.values)
    binary_df = pd.DataFrame(data_temp, columns = new_columns, index = copy_df.index.values)
        
    for row_num in range(len(copy_df)):
        #for col_num in range(len(copy_df.columns))
        for col in columns:
            if col == 'CLASS':
                binary_df[col] = copy_df[col].values[row_num]
            else:
                for i in range(len(categories[col])):
                    if copy_df[col].values[row_num] == categories[col][i]:
                        binary_df[(col + '-' + categories[col][i])].values[row_num] = 1.0
                        break;  

    return binary_df, categories

# one_hot == categories
def apply_one_hot(df, categories):
    copy_df = df.copy()
    columns = copy_df.columns
    new_columns = list() # new_columns = []
    new_columns.append('CLASS')
    
    for col in columns:
        if (col != 'CLASS' and col != 'ID') and (copy_df[col].dtype == 'object' or copy_df[col].dtype == 'category'):
            #binary_df.rename(columns={col:col+, 'lifeExp':'life_exp', 'gdpPercap':'gdp_per_cap'}, inplace=True)
            copy_df[col] = copy_df[col].astype("category")
            categories[col] = copy_df[col].cat.categories
            for i in range(len(categories[col])):
                new_columns.append(col + '-' + categories[col][i])
   
    data_temp = np.zeros((len(copy_df), len(new_columns)) ,float)
    #binX_df = pd.DataFrame(xx, columns = new_columns, index=train_df.index.values)
    binary_df = pd.DataFrame(data_temp, columns = new_columns, index = copy_df.index.values)
        
    for row_num in range(len(copy_df)):
        #for col_num in range(len(copy_df.columns))
        for col in columns:
            if col == 'CLASS':
                binary_df[col] = copy_df[col].values[row_num]
            else:
                for i in range(len(categories[col])):
                    if copy_df[col].values[row_num] == categories[col][i]:
                        binary_df[(col + '-' + categories[col][i])].values[row_num] = 1.0
                        break;  

    return binary_df

In [121]:
# Test your code  (leave this part unchanged)

tictactoe = pd.read_csv("tic-tac-toe.txt")

train_df, test_df = split(tictactoe) # Using your above function

new_train, one_hot = create_one_hot(train_df)

#print(new_train)
new_test = apply_one_hot(test_df,one_hot)
new_test

## NOTE: The values are matching wiht the values in the question except the INDEX values. It should be double checked
## I also believe that there should be an easier way to do these calculations... Will think about it later..

Unnamed: 0,CLASS,top-left-square-o,top-left-square-x,top-middle-square-b,top-middle-square-o,top-middle-square-x,top-right-square-b,top-right-square-o,top-right-square-x,middle-left-square-b,...,middle-right-square-x,bottom-left-square-b,bottom-left-square-o,bottom-left-square-x,bottom-middle-square-b,bottom-middle-square-o,bottom-middle-square-x,bottom-right-square-b,bottom-right-square-o,bottom-right-square-x
0,positive,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,positive,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,positive,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,positive,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,positive,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
5,positive,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
6,positive,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
7,positive,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
8,positive,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
9,positive,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [117]:
# Insert the function folds below
#
# Input to folds:
# df: a dataframe
# nofolds: an integer greater than 1 (default = 10)
#
# Output from folds:
# folds: a list (of length = nofolds) dataframes consisting of random non-overlapping, 
#        approximately equal-sized subsets of the rows in df
#
# Hint: You may use np.random.permutation(df.index) to get a permuted list of indexes from which a 
#       prefix corresponds to the test instances, and the suffix to the training instances 

def folds(df,nofolds=10):
    
    
### WARNING: I don't understand what the question is asking me to do??

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Index(['b', 'o', 'x'], dtype='object')
Index(['b', 'o', 'x'], dtype='object')
Index(['b', 'o', 'x'], dtype='object')
Index(['b', 'o', 'x'], dtype='object')
Index(['b', 'o', 'x'], dtype='object')
Index(['b', 'o', 'x'], dtype='object')
Index(['b', 'o', 'x'], dtype='object')
Index(['b', 'o', 'x'], dtype='object')
Index(['b', 'o', 'x'], dtype='object')


'\nnew_train, one_hot = create_one_hot(train_df)\n\nnew_test = apply_one_hot(test_df,one_hot)\nnew_test\n'

In [123]:
# Test your code  (leave this part unchanged)

glass_df = pd.read_csv("glass.txt")
print(glass_df)
'''
glass_folds = folds(glass_df,nofolds=5)

fold_sizes = [len(f) for f in glass_folds]

print("Fold sizes:{}\nTotal no. instances: {}".format(fold_sizes,sum(fold_sizes)))
'''

      ID       RI     Na    Mg    Al     Si     K    Ca    Ba    Fe  CLASS
0      1  1.52101  13.64  4.49  1.10  71.78  0.06  8.75  0.00  0.00      1
1      2  1.51761  13.89  3.60  1.36  72.73  0.48  7.83  0.00  0.00      1
2      3  1.51618  13.53  3.55  1.54  72.99  0.39  7.78  0.00  0.00      1
3      4  1.51766  13.21  3.69  1.29  72.61  0.57  8.22  0.00  0.00      1
4      5  1.51742  13.27  3.62  1.24  73.08  0.55  8.07  0.00  0.00      1
5      6  1.51596  12.79  3.61  1.62  72.97  0.64  8.07  0.00  0.26      1
6      7  1.51743  13.30  3.60  1.14  73.09  0.58  8.17  0.00  0.00      1
7      8  1.51756  13.15  3.61  1.05  73.24  0.57  8.24  0.00  0.00      1
8      9  1.51918  14.04  3.58  1.37  72.08  0.56  8.30  0.00  0.00      1
9     10  1.51755  13.00  3.60  1.36  72.99  0.57  8.40  0.00  0.11      1
10    11  1.51571  12.72  3.46  1.56  73.20  0.67  8.09  0.00  0.24      1
11    12  1.51763  12.80  3.66  1.27  73.01  0.60  8.56  0.00  0.00      1
12    13  1.51589  12.88 

'\nglass_folds = folds(glass_df,nofolds=5)\n\nfold_sizes = [len(f) for f in glass_folds]\n\nprint("Fold sizes:{}\nTotal no. instances: {}".format(fold_sizes,sum(fold_sizes)))\n'

In [215]:
# Insert the function brier_score below
#
# Input to brier_score:
# df: a dataframe with class labels as column names and each row corresponding to
#     a prediction with estimated probabilities for each class
# correctlabels: an array (or list) of the correct class label for each prediction
#                (the number of correct labels must equal the number of rows in df)
#
# Output from brier_score:
# brier_score: the average square error of the predicted probabilties 
#
# Hint: Compare each predicted vector to a vector for each correct label, which is all zeros except 
#       for at the index of the correct class. The index can be found using np.where(df.columns==l)[0] 
#       where l is the correct label.

def brier_score(df, correctlabels):
    #max_vals = df.max(axis=1)

    #predict_max = np.zeros(5, dtype=str)
    data_temp = np.zeros((len(correctlabels), len(df.columns)) ,float)
    correct_df = pd.DataFrame(data_temp, columns = df.columns)
    square_errors = []
        
    for i in range(len(correct_df)):    
        for col in correct_df.columns:
            #print('correctlabels[{}]={}'.format(i, correctlabels[i]))
            #print('col={}'.format(col))
            if col == correctlabels[i]:
                correct_df[col].values[i] = 1
                #correct_df[i, col] == 1
                #print(correct_df[col].values[i])
                #break;
    
    for i in range(len(correct_df)):    
        for col in correct_df.columns:
            square_errors.append((correct_df[col].values[i]-df[col].values[i])**2)
            #print(correct_df[col].values[i])            
    
    square_errors_array = np.array(square_errors)
    print('Prediction Matrix')
    print(df)
    print('Correct Matrix (correct label=1, others=0)')
    print(correct_df)    
    print('Error squares calculated by the extraction of the first and second matrices and squared the error')
    print(np.reshape(square_errors_array, (5,3)))
    print('Calculated error mean:', square_errors_array.mean())
    return square_errors_array.mean()
          #square_errors_array.mean()

In [216]:
# Test your code  (leave this part unchanged)

predictions = pd.DataFrame({"A":[0.5,0.5,0.5,0.25,0.25],"B":[0.5,0.25,0.25,0.5,0.25],"C":[0.0,0.25,0.25,0.25,0.5]})

correctlabels = ["B","A","B","B","C"]

## NOTE: Apprarently, I didn't understand the question correctly, again. The question is not clear to me.
brier_score(predictions,correctlabels)

Prediction Matrix
      A     B     C
0  0.50  0.50  0.00
1  0.50  0.25  0.25
2  0.50  0.25  0.25
3  0.25  0.50  0.25
4  0.25  0.25  0.50
Correct Matrix (correct label=1, others=0)
     A    B    C
0  0.0  1.0  0.0
1  1.0  0.0  0.0
2  0.0  1.0  0.0
3  0.0  1.0  0.0
4  0.0  0.0  1.0
Error squares calculated by the extraction of the first and second matrices and squared the error
[[0.25   0.25   0.    ]
 [0.25   0.0625 0.0625]
 [0.25   0.5625 0.0625]
 [0.0625 0.25   0.0625]
 [0.0625 0.0625 0.25  ]]
Calculated error mean: 0.16666666666666666


0.16666666666666666

In [None]:
# Insert the function auc below
#
# Input to auc:
# df: a dataframe with class labels as column names and each row corresponding to
#     a prediction with estimated probabilities for each class
# correctlabels: an array (or list) of the correct class label for each prediction
#                (the number of correct labels must equal the number of rows in df)
#
# Output from auc:
# auc: the weighted area under ROC curve
#
# Hint 1: Calculate the binary AUC first for each class label c, i.e., treating the
#         predicted probability of this class for each instance as a score; the positive
#         instances are the ones belonging to class c and the negative instances the rest
# Hint 2: When calculating the binary AUC, first find the scores of the positive instances and then
#         the scores of the negative instances
# Hint 3: You may use a dictionary with a mapping from each score to an array of two numbers; 
#         the number of positive instances with this score and the number of negative instances with this score
# Hint 4: Created a (reversely) sorted (on the scores) list of pairs from the dictionary and
#         iterate over this to additively calculate the AUC
# Hint 5: For each pair in the above list, there are three cases to consider; the no. of true positives
#         (tp_i) is zero, the number of false positives (fp_i) (negatives) is zero, and both are non-zero
# Hint 6: Calculate the weighted AUC by summing the individual AUCs weighted by the relative
#         frequency of each class (as estimated from the correct labels)

def auc(df, correctlabels):
    
    return auc