# Setup

In [1]:
import csv
import pandas as pd
import numpy as np

In [2]:
# Load every CSV the notebook works with. The first four files are read the
# same way: comma-separated, with the column names on the first row.
_csv_with_header = {"delimiter": ",", "header": 0}

train_df = pd.read_csv("training.csv", **_csv_with_header)
test_df = pd.read_csv("testing.csv", **_csv_with_header)
confidence_df = pd.read_csv("annotation_confidence.csv", **_csv_with_header)
additional_df = pd.read_csv("additional_training.csv", **_csv_with_header)

# The proportions file is read without a header row and with a string dtype.
proportions_df = pd.read_csv(
    "test_proportions.csv", delimiter=",", header=None, dtype="unicode"
)

## Confident Training Data

In [3]:
# Rows of the confidence table whose annotation confidence is exactly 1.
c_df = confidence_df[confidence_df.confidence == 1]

# Take the IDs straight from the "ID" column. The earlier version handed the
# ID values to `.loc` as row labels and read them back from the resulting
# index; IDs live in a column, not in the index, so that lookup raises
# KeyError on current pandas whenever an ID has no matching row label.
confident_ids = c_df['ID'].tolist()

# Training rows with a confident annotation, then split by label.
confident_train_df = train_df[train_df['ID'].isin(confident_ids)]

confident_train_0_df = confident_train_df[confident_train_df.prediction == 0]

confident_train_1_df = confident_train_df[confident_train_df.prediction == 1]

## Confident Additional Data

In [4]:
# Additional-training rows whose ID appears in the confident-ID list.
confident_additional_df = additional_df.loc[additional_df['ID'].isin(confident_ids)]

# Split those rows by their 0/1 label.
confident_additional_0_df = confident_additional_df.loc[confident_additional_df.prediction == 0]
confident_additional_1_df = confident_additional_df.loc[confident_additional_df.prediction == 1]

## All Data

In [5]:
# Stack the original and the additional training rows into a single frame.
# Row labels of both inputs are kept as they are (no re-indexing).
all_df = pd.concat([train_df, additional_df], axis=0)

# Subset of the stacked rows whose ID appears in the confident-ID list.
confident_all_df = all_df.loc[all_df['ID'].isin(confident_ids)]

# Creating

## Get Averages for Confident 0 and 1 predictions from TrainingDF per column

In [6]:
# Per-column mean of the confident class-0 / class-1 training rows, reshaped
# into a one-row frame. The positional slice trims the first and last entries
# of the mean (the ID and prediction columns in the frames displayed here).
confident_train_0_avg = confident_train_0_df.mean(axis=0).iloc[1:-1].to_frame().T
confident_train_1_avg = confident_train_1_df.mean(axis=0).iloc[1:-1].to_frame().T

print("Confident train 0 and 1 avg")
display(confident_train_0_avg, confident_train_1_avg)


Confident train 0 and 1 avg


Unnamed: 0,CNNs,CNNs.1,CNNs.2,CNNs.3,CNNs.4,CNNs.5,CNNs.6,CNNs.7,CNNs.8,CNNs.9,...,GIST.502,GIST.503,GIST.504,GIST.505,GIST.506,GIST.507,GIST.508,GIST.509,GIST.510,GIST.511
0,0.215292,0.339621,0.183216,0.097272,0.254131,0.066736,0.155396,0.129293,0.0779,0.375786,...,0.036616,0.027793,0.027661,0.037078,0.033417,0.027121,0.029276,0.038444,0.036219,0.02656


Unnamed: 0,CNNs,CNNs.1,CNNs.2,CNNs.3,CNNs.4,CNNs.5,CNNs.6,CNNs.7,CNNs.8,CNNs.9,...,GIST.502,GIST.503,GIST.504,GIST.505,GIST.506,GIST.507,GIST.508,GIST.509,GIST.510,GIST.511
0,0.103416,0.181478,0.023226,0.1279,0.080345,0.224387,0.312417,0.351748,0.281232,0.272855,...,0.028806,0.021864,0.021611,0.031417,0.029565,0.021452,0.023752,0.03,0.029956,0.024447


## Create DataFrames for Non-Confident 0 and 1 predictions from TrainingDF and AdditionalDF

In [7]:
# Complement of the confident subset: stacked rows whose ID is NOT in the
# confident-ID list.
non_confident_all_df = all_df.loc[~all_df['ID'].isin(confident_ids)]

# Split by label.
non_confident_all_0_df = non_confident_all_df.loc[non_confident_all_df['prediction'] == 0]
non_confident_all_1_df = non_confident_all_df.loc[non_confident_all_df['prediction'] == 1]

# Preview the first three rows of each split.
display(non_confident_all_0_df.head(3))
display(non_confident_all_1_df.head(3))


Unnamed: 0,ID,CNNs,CNNs.1,CNNs.2,CNNs.3,CNNs.4,CNNs.5,CNNs.6,CNNs.7,CNNs.8,...,GIST.503,GIST.504,GIST.505,GIST.506,GIST.507,GIST.508,GIST.509,GIST.510,GIST.511,prediction
3,4,0.31198,0.24452,0.2121,0.97855,0.0,1.3198,0.0,0.0,0.0,...,0.033687,0.06691,0.036916,0.029357,0.017351,0.020543,0.0153,0.016477,0.019715,0
8,9,0.23359,0.0,0.0,0.0,0.0,0.0,0.1308,0.0,0.0,...,0.041533,0.00442,0.053275,0.045695,0.01579,0.001562,0.025589,0.048539,0.027832,0
10,11,0.16797,0.16616,0.24607,0.0,0.0,0.2038,0.43494,0.0,0.0,...,0.043074,0.076456,0.045263,0.018106,0.027679,0.057511,0.054574,0.01646,0.024385,0


Unnamed: 0,ID,CNNs,CNNs.1,CNNs.2,CNNs.3,CNNs.4,CNNs.5,CNNs.6,CNNs.7,CNNs.8,...,GIST.503,GIST.504,GIST.505,GIST.506,GIST.507,GIST.508,GIST.509,GIST.510,GIST.511,prediction
1,2,0.0,0.0,0.0,0.0,0.0,0.63184,0.0,1.8388,0.4959,...,0.033147,0.008112,0.004126,0.014677,0.04898,0.011394,0.012629,0.033668,0.048248,1
5,6,0.0,0.0,0.0,0.0,0.0,0.0,2.3096,0.81257,0.0,...,0.006797,0.030276,0.037172,0.019828,0.010732,0.053016,0.041817,0.012208,0.00754,1
6,7,0.0,0.0,0.0,0.0,0.0,0.0,0.48886,0.82632,0.36878,...,0.027909,0.024642,0.011153,0.022865,0.016119,0.028122,0.017835,0.01797,0.016795,1


## Get Averages for Non-Confident 0 and 1 predictions from TrainingDF and AdditionalTrainingDF per column

### Sum of columns in training and additional for 0 and 1

In [8]:
# ALL (training + additional): column totals of the non-confident rows for
# each label, as one-row frames. `.iloc[1:-1]` trims the first and last
# entries of the totals (ID and prediction in the frames displayed above).
non_confident_all_0_sum = non_confident_all_0_df.sum(axis=0).iloc[1:-1].to_frame().T
non_confident_all_1_sum = non_confident_all_1_df.sum(axis=0).iloc[1:-1].to_frame().T

display(non_confident_all_0_sum)
display(non_confident_all_1_sum)

Unnamed: 0,CNNs,CNNs.1,CNNs.2,CNNs.3,CNNs.4,CNNs.5,CNNs.6,CNNs.7,CNNs.8,CNNs.9,...,GIST.502,GIST.503,GIST.504,GIST.505,GIST.506,GIST.507,GIST.508,GIST.509,GIST.510,GIST.511
0,150.858993,246.594134,86.114006,126.988254,101.48184,154.171063,154.233013,183.836066,146.344151,288.372471,...,27.067351,20.178714,19.615432,25.807023,25.880596,20.153725,21.653353,24.888545,26.737698,22.229199


Unnamed: 0,CNNs,CNNs.1,CNNs.2,CNNs.3,CNNs.4,CNNs.5,CNNs.6,CNNs.7,CNNs.8,CNNs.9,...,GIST.502,GIST.503,GIST.504,GIST.505,GIST.506,GIST.507,GIST.508,GIST.509,GIST.510,GIST.511
0,114.882625,226.865885,45.888628,150.66733,86.155814,236.338999,244.20087,380.469747,285.208531,324.414226,...,34.023719,24.840494,22.771809,32.459625,32.807495,23.52299,25.187398,31.345316,33.826983,25.617096


### Averaging by number of non-NaN values per column

In [9]:
# Divide each column total by the number of non-NaN entries in that column of
# the class-0 rows; the count Series is matched to the totals by column name.
non_confident_all_0_avg = non_confident_all_0_sum.div(
    non_confident_all_0_df[non_confident_all_0_sum.columns].count(),
    axis="columns",
)
non_confident_all_0_avg

Unnamed: 0,CNNs,CNNs.1,CNNs.2,CNNs.3,CNNs.4,CNNs.5,CNNs.6,CNNs.7,CNNs.8,CNNs.9,...,GIST.502,GIST.503,GIST.504,GIST.505,GIST.506,GIST.507,GIST.508,GIST.509,GIST.510,GIST.511
0,0.182417,0.316146,0.10873,0.157946,0.127972,0.193197,0.191119,0.232999,0.187621,0.360466,...,0.032769,0.025005,0.024157,0.031861,0.032432,0.025287,0.027444,0.031868,0.033339,0.027752


In [10]:
# Start from a copy of the class-1 totals and show it; at this point the
# frame still holds raw sums, not averages.
non_confident_all_1_avg = non_confident_all_1_sum.copy()
display(non_confident_all_1_avg)

# Divide each column total by the number of non-NaN entries in that column of
# the class-1 rows; the count Series is matched to the totals by column name.
non_confident_all_1_avg = non_confident_all_1_sum.div(
    non_confident_all_1_df[non_confident_all_1_sum.columns].count(),
    axis="columns",
)
non_confident_all_1_avg

Unnamed: 0,CNNs,CNNs.1,CNNs.2,CNNs.3,CNNs.4,CNNs.5,CNNs.6,CNNs.7,CNNs.8,CNNs.9,...,GIST.502,GIST.503,GIST.504,GIST.505,GIST.506,GIST.507,GIST.508,GIST.509,GIST.510,GIST.511
0,114.882625,226.865885,45.888628,150.66733,86.155814,236.338999,244.20087,380.469747,285.208531,324.414226,...,34.023719,24.840494,22.771809,32.459625,32.807495,23.52299,25.187398,31.345316,33.826983,25.617096


Unnamed: 0,CNNs,CNNs.1,CNNs.2,CNNs.3,CNNs.4,CNNs.5,CNNs.6,CNNs.7,CNNs.8,CNNs.9,...,GIST.502,GIST.503,GIST.504,GIST.505,GIST.506,GIST.507,GIST.508,GIST.509,GIST.510,GIST.511
0,0.1091,0.209867,0.042767,0.140548,0.079626,0.222123,0.231032,0.354916,0.263594,0.303474,...,0.031329,0.023022,0.021203,0.029889,0.030519,0.022212,0.022939,0.028784,0.03053,0.023919


# Application

## Fill AdditionalTrainingDF NaN values with Averages from Confident 0 and 1 predictions

In [None]:
rd = 3  # preview row count; kept from the original cell, not used here

# Work on a copy so `additional_df` keeps its original NaNs.
n_additional_df = additional_df.copy()

# The draft of this cell looped over the columns and called `df.loc[()]` on an
# undefined `df`, which raises NameError. Implemented instead: for the rows
# whose ID is in the confident-ID list, replace NaNs in the feature columns
# with the confident-training average of the row's class.
_is_confident = n_additional_df['ID'].isin(confident_ids)
for _label, _class_avg in ((0, confident_train_0_avg), (1, confident_train_1_avg)):
    _rows = _is_confident & (n_additional_df['prediction'] == _label)
    _features = _class_avg.columns
    # fillna with a Series fills each column with the value stored under the
    # same column name.
    n_additional_df.loc[_rows, _features] = (
        n_additional_df.loc[_rows, _features].fillna(_class_avg.iloc[0])
    )

In [18]:
rd = 3  # rows shown in the first four previews below

# Fresh copy of the additional-training data, with a preview.
new_additional_df = additional_df.copy()
display(new_additional_df.head(rd))

# Rows whose ID appears in the confident-ID list.
new_confident_additional_df = new_additional_df.loc[new_additional_df['ID'].isin(confident_ids)]
display(new_confident_additional_df.head(rd))

# Class 0: feature columns only (ID and prediction removed).
new_confident_0_additional_df = (
    new_confident_additional_df
    .loc[new_confident_additional_df['prediction'] == 0]
    .drop(['ID', 'prediction'], axis='columns')
)
display(new_confident_0_additional_df.head(rd))

# Class 1: every column is kept.
new_confident_1_additional_df = new_confident_additional_df.loc[new_confident_additional_df['prediction'] == 1]
display(new_confident_1_additional_df.head(rd))

# Replace each NaN in the class-0 frame with the confident class-0 training
# average of its column.
for column in new_confident_0_additional_df.columns:
    new_confident_0_additional_df[column] = new_confident_0_additional_df[column].fillna(
        confident_train_0_avg.iloc[0][column]
    )
display(new_confident_0_additional_df.head(5))

# The fill above was applied to the derived class-0 frame; this shows the
# source copy.
display(new_additional_df.head(20))

Unnamed: 0,ID,CNNs,CNNs.1,CNNs.2,CNNs.3,CNNs.4,CNNs.5,CNNs.6,CNNs.7,CNNs.8,...,GIST.503,GIST.504,GIST.505,GIST.506,GIST.507,GIST.508,GIST.509,GIST.510,GIST.511,prediction
0,457,,1.7263,,0.0,0.0,0.799,0.0,0.0,0.0,...,,0.022648,0.043312,0.025739,0.02818,0.032245,0.036754,0.029104,,1
1,458,0.83494,0.0,0.87363,0.18423,,0.35443,1.2575,0.84533,0.8482,...,0.005451,,0.016365,0.014265,,0.026452,,0.053096,0.057367,0
2,459,,,0.0,0.0,,0.0,,0.71016,0.5472,...,0.008683,,0.015751,0.005463,0.005901,0.018561,0.02534,0.005856,0.003299,1


Unnamed: 0,ID,CNNs,CNNs.1,CNNs.2,CNNs.3,CNNs.4,CNNs.5,CNNs.6,CNNs.7,CNNs.8,...,GIST.503,GIST.504,GIST.505,GIST.506,GIST.507,GIST.508,GIST.509,GIST.510,GIST.511,prediction
0,457,,1.7263,,0.0,0.0,0.799,0.0,0.0,0.0,...,,0.022648,0.043312,0.025739,0.02818,0.032245,0.036754,0.029104,,1
2,459,,,0.0,0.0,,0.0,,0.71016,0.5472,...,0.008683,,0.015751,0.005463,0.005901,0.018561,0.02534,0.005856,0.003299,1
3,460,,0.58965,,0.0,0.25197,0.0,0.80329,0.0,0.23879,...,0.028226,0.004367,0.013654,0.0367,0.029672,0.009894,0.005218,0.037673,0.031507,1


Unnamed: 0,CNNs,CNNs.1,CNNs.2,CNNs.3,CNNs.4,CNNs.5,CNNs.6,CNNs.7,CNNs.8,CNNs.9,...,GIST.502,GIST.503,GIST.504,GIST.505,GIST.506,GIST.507,GIST.508,GIST.509,GIST.510,GIST.511
5,0.0,0.0,0.041689,0.44601,0.0,,0.22815,0.021538,0.0,,...,0.04925,0.017483,0.01738,0.03831,0.054242,0.054469,0.012272,0.011186,0.046882,0.042986
6,0.0,0.14695,,0.0,1.4148,1.7356,0.0,0.057167,0.0,0.0,...,,0.004572,0.003766,0.018723,,0.012496,0.00765,0.006387,0.004385,
26,0.028981,,0.0,0.0,0.45042,0.0,0.0,0.0,,1.7191,...,0.065472,0.054324,0.059546,0.061372,0.04263,0.023522,0.069031,,0.016562,0.014903


Unnamed: 0,ID,CNNs,CNNs.1,CNNs.2,CNNs.3,CNNs.4,CNNs.5,CNNs.6,CNNs.7,CNNs.8,...,GIST.503,GIST.504,GIST.505,GIST.506,GIST.507,GIST.508,GIST.509,GIST.510,GIST.511,prediction
0,457,,1.7263,,0.0,0.0,0.799,0.0,0.0,0.0,...,,0.022648,0.043312,0.025739,0.02818,0.032245,0.036754,0.029104,,1
2,459,,,0.0,0.0,,0.0,,0.71016,0.5472,...,0.008683,,0.015751,0.005463,0.005901,0.018561,0.02534,0.005856,0.003299,1
3,460,,0.58965,,0.0,0.25197,0.0,0.80329,0.0,0.23879,...,0.028226,0.004367,0.013654,0.0367,0.029672,0.009894,0.005218,0.037673,0.031507,1


Unnamed: 0,CNNs,CNNs.1,CNNs.2,CNNs.3,CNNs.4,CNNs.5,CNNs.6,CNNs.7,CNNs.8,CNNs.9,...,GIST.502,GIST.503,GIST.504,GIST.505,GIST.506,GIST.507,GIST.508,GIST.509,GIST.510,GIST.511
5,0.0,0.0,0.041689,0.44601,0.0,0.066736,0.22815,0.021538,0.0,0.375786,...,0.04925,0.017483,0.01738,0.03831,0.054242,0.054469,0.012272,0.011186,0.046882,0.042986
6,0.0,0.14695,0.183216,0.0,1.4148,1.7356,0.0,0.057167,0.0,0.0,...,0.036616,0.004572,0.003766,0.018723,0.033417,0.012496,0.00765,0.006387,0.004385,0.02656
26,0.028981,0.339621,0.0,0.0,0.45042,0.0,0.0,0.0,0.0779,1.7191,...,0.065472,0.054324,0.059546,0.061372,0.04263,0.023522,0.069031,0.038444,0.016562,0.014903
33,0.17866,1.024,0.0,0.097272,0.061065,0.0,0.155396,0.129293,0.0,0.375786,...,0.036616,0.006458,0.020163,0.030288,0.033417,0.027121,0.036814,0.034028,0.037823,0.030416
34,1.3024,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.05308,0.071356,0.055986,0.082698,0.093055,0.096622,0.029276,0.048184,0.028617,0.01501


Unnamed: 0,ID,CNNs,CNNs.1,CNNs.2,CNNs.3,CNNs.4,CNNs.5,CNNs.6,CNNs.7,CNNs.8,...,GIST.503,GIST.504,GIST.505,GIST.506,GIST.507,GIST.508,GIST.509,GIST.510,GIST.511,prediction
0,457,,1.7263,,0.0,0.0,0.799,0.0,0.0,0.0,...,,0.022648,0.043312,0.025739,0.02818,0.032245,0.036754,0.029104,,1
1,458,0.83494,0.0,0.87363,0.18423,,0.35443,1.2575,0.84533,0.8482,...,0.005451,,0.016365,0.014265,,0.026452,,0.053096,0.057367,0
2,459,,,0.0,0.0,,0.0,,0.71016,0.5472,...,0.008683,,0.015751,0.005463,0.005901,0.018561,0.02534,0.005856,0.003299,1
3,460,,0.58965,,0.0,0.25197,0.0,0.80329,0.0,0.23879,...,0.028226,0.004367,0.013654,0.0367,0.029672,0.009894,0.005218,0.037673,0.031507,1
4,461,0.63241,1.0967,0.0,0.0,0.0,,0.12193,0.0,0.4786,...,0.009443,0.017366,0.03762,,0.007043,0.033402,,0.046515,0.011995,1
5,462,0.0,0.0,0.041689,0.44601,0.0,,0.22815,0.021538,0.0,...,0.017483,0.01738,0.03831,0.054242,0.054469,0.012272,0.011186,0.046882,0.042986,0
6,463,0.0,0.14695,,0.0,1.4148,1.7356,0.0,0.057167,0.0,...,0.004572,0.003766,0.018723,,0.012496,0.00765,0.006387,0.004385,,0
7,464,0.0,,0.0,0.0,0.0,0.0,0.061245,1.2361,0.48354,...,0.01692,0.001926,,0.012741,0.019761,,0.008236,,0.021828,1
8,465,0.22492,0.0,0.0,0.0,0.0,0.0,0.072602,0.0,0.0,...,0.015835,,,0.052578,0.043284,0.011697,0.039276,0.040441,0.026759,1
9,466,0.033102,0.97846,0.32537,0.0,0.30961,0.005373,,0.0,0.0,...,0.043655,0.021958,,0.034631,0.048773,0.012623,,0.019914,0.033429,1


In [17]:
# Show the confident class-0 training averages (a one-row frame of per-column means).
confident_train_0_avg

Unnamed: 0,CNNs,CNNs.1,CNNs.2,CNNs.3,CNNs.4,CNNs.5,CNNs.6,CNNs.7,CNNs.8,CNNs.9,...,GIST.502,GIST.503,GIST.504,GIST.505,GIST.506,GIST.507,GIST.508,GIST.509,GIST.510,GIST.511
0,0.215292,0.339621,0.183216,0.097272,0.254131,0.066736,0.155396,0.129293,0.0779,0.375786,...,0.036616,0.027793,0.027661,0.037078,0.033417,0.027121,0.029276,0.038444,0.036219,0.02656


### Copies for testing purposes

In [None]:
# Scratch inputs for the fill experiments: feature-only copies of the
# confident additional rows per class (ID and prediction removed) ...
ca0_d = confident_additional_0_df.drop(['ID', 'prediction'], axis='columns').copy()
ca1_d = confident_additional_1_df.drop(['ID', 'prediction'], axis='columns').copy()

# ... and independent copies of the per-class confident training averages.
c0_avg = confident_train_0_avg.copy()
c1_avg = confident_train_1_avg.copy()

In [27]:
# Replace each NaN in the class-0 copy with the class-0 average of its column
# (taken from the single row of c0_avg).
for column in ca0_d.columns:
    ca0_d[column] = ca0_d[column].fillna(c0_avg[column].iloc[0])

In [28]:
# Replace each NaN in the class-1 copy with the class-1 average of its column
# (taken from the single row of c1_avg).
for column in ca1_d.columns:
    ca1_d[column] = ca1_d[column].fillna(c1_avg[column].iloc[0])

In [97]:
# Preview the first three rows of the class-1 copy.
display(ca1_d.head(3))

Unnamed: 0,CNNs,CNNs.1,CNNs.2,CNNs.3,CNNs.4,CNNs.5,CNNs.6,CNNs.7,CNNs.8,CNNs.9,...,GIST.502,GIST.503,GIST.504,GIST.505,GIST.506,GIST.507,GIST.508,GIST.509,GIST.510,GIST.511
0,0.103416,1.7263,0.023226,0.0,0.0,0.799,0.0,0.0,0.0,0.0,...,0.027248,0.021864,0.022648,0.043312,0.025739,0.02818,0.032245,0.036754,0.029104,0.024447
2,0.103416,0.181478,0.0,0.0,0.080345,0.0,0.312417,0.71016,0.5472,0.48071,...,0.012577,0.008683,0.021611,0.015751,0.005463,0.005901,0.018561,0.02534,0.005856,0.003299
3,0.103416,0.58965,0.023226,0.0,0.25197,0.0,0.80329,0.0,0.23879,0.272855,...,0.028806,0.028226,0.004367,0.013654,0.0367,0.029672,0.009894,0.005218,0.037673,0.031507


## Fill AdditionalTrainingDF NaN values with Averages from Non-Confident 0 and 1 predictions

In [39]:
# Feature-only copies of the non-confident rows for each class (ID and
# prediction removed), together with copies of the matching column averages.
n_cal0_d = non_confident_all_0_df.drop(['ID', 'prediction'], axis=1).copy()
n_cal0_avg = non_confident_all_0_avg.copy()

n_cal1_d = non_confident_all_1_df.drop(['ID', 'prediction'], axis=1).copy()
n_cal1_avg = non_confident_all_1_avg.copy()

# Before: a slice of the class-0 rows and the values that will be used to fill.
display(n_cal0_d[500:503])
display(n_cal0_avg)

# The fill step was missing from the saved cell, so the second preview could
# only repeat the first. Replace every NaN with the class-0 average of its
# column; fillna with a Series matches fill values to columns by name.
n_cal0_d = n_cal0_d.fillna(n_cal0_avg.iloc[0])

# After: the same slice with the NaNs replaced.
display(n_cal0_d[500:503])

Unnamed: 0,CNNs,CNNs.1,CNNs.2,CNNs.3,CNNs.4,CNNs.5,CNNs.6,CNNs.7,CNNs.8,CNNs.9,...,GIST.502,GIST.503,GIST.504,GIST.505,GIST.506,GIST.507,GIST.508,GIST.509,GIST.510,GIST.511
2086,,0.0,0.24943,0.0,1.8954,0.37354,0.0,0.52736,0.0,0.89157,...,0.070134,0.030339,0.038942,0.068479,0.02999,,0.056815,0.044255,0.0146,0.010884
2087,0.0,0.46494,0.0,0.0,,0.0,,0.0,0.0,0.87029,...,,0.019798,0.048874,0.043055,0.043357,0.038329,0.042956,0.047676,0.035714,0.024152
2089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.048797,0.023553,,0.04387,0.053008,0.035608,0.03666,0.052871,0.063575,0.027157


Unnamed: 0,CNNs,CNNs.1,CNNs.2,CNNs.3,CNNs.4,CNNs.5,CNNs.6,CNNs.7,CNNs.8,CNNs.9,...,GIST.502,GIST.503,GIST.504,GIST.505,GIST.506,GIST.507,GIST.508,GIST.509,GIST.510,GIST.511
0,0.261457,0.335707,0.197927,0.146898,0.227588,0.120929,0.178128,0.100659,0.077675,0.418158,...,0.036161,0.027122,0.027339,0.036665,0.036806,0.028447,0.02951,0.03688,0.037241,0.029349


Unnamed: 0,CNNs,CNNs.1,CNNs.2,CNNs.3,CNNs.4,CNNs.5,CNNs.6,CNNs.7,CNNs.8,CNNs.9,...,GIST.502,GIST.503,GIST.504,GIST.505,GIST.506,GIST.507,GIST.508,GIST.509,GIST.510,GIST.511
2086,0.261457,0.0,0.24943,0.0,1.8954,0.37354,0.0,0.52736,0.0,0.89157,...,0.070134,0.030339,0.038942,0.068479,0.02999,0.028447,0.056815,0.044255,0.0146,0.010884
2087,0.0,0.46494,0.0,0.0,0.227588,0.0,0.178128,0.0,0.0,0.87029,...,0.036161,0.019798,0.048874,0.043055,0.043357,0.038329,0.042956,0.047676,0.035714,0.024152
2089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.048797,0.023553,0.027339,0.04387,0.053008,0.035608,0.03666,0.052871,0.063575,0.027157
