In [33]:
import numpy as np 
import matplotlib.pyplot as plt
import time
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler


In [34]:
# Opening Data With Pandas
conf_rate = pd.read_csv('annotation_confidence.csv')
# Reverting to NumPy for splitting traning data from additonal data  
conf_rate=np.array(conf_rate)
ytraining=conf_rate[:, 1][:657]
yadditional=conf_rate[:, 1][657:] 

In [35]:
# Summary of the confidence rates on the training data 
yt1=ytraining[ytraining==1.0].sum()
yt2=ytraining[ytraining==0.66].sum()/0.66
print("Training Data")
print(f"Data with 100% confidence:{yt1}")
print(f"Data with 66% confidence: {yt2}")
print(f"Total: {yt1+yt2}")

Training Data
Data with 100% confidence:186.0
Data with 66% confidence: 471.0
Total: 657.0


In [36]:
# Summary of the confidence rates on the Additional Data
ya1=yadditional[yadditional==1.0].sum()
ya2=yadditional[yadditional==0.66].sum()/0.66
print("Additional Data")
print(f"Data with 100% confidence:{ya1}")
print(f"Data with 66% confidence: {ya2}")
print(f"Total: {ya1+ya2}")

Additional Data
Data with 100% confidence:1567.0
Data with 66% confidence: 4342.0
Total: 5909.0


In [37]:
# Training Dataset
train_csv = pd.read_csv("training.csv")
X_train = np.array(train_csv)[:, 1:-1]
y_train = np.array(train_csv)[:,-1]
print('training:', X_train.shape, y_train.shape)

training: (657, 4608) (657,)


In [38]:
# Additional Data 
additional_data  = pd.read_csv("additional_training.csv")
additional_data=np.array(additional_data)

# Filling in the blank for the additional data 
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
additional_data = imputer.fit_transform(additional_data)

# Separating training data from labels
Xa=additional_data[:, 1:-1]
ya=additional_data[:,-1]
print('additional:', Xa.shape)

additional: (5909, 4608)


In [39]:
# Test Dataset
test_csv = pd.read_csv("testing.csv")
X_test = np.array(test_csv)[:, 1:]
print(y_train[0:10])

[0. 0. 0. 1. 1. 1. 1. 1. 1. 1.]


In [40]:
# Splitting data based on confidence scores
idx1=ytraining==1.0
idx66=ytraining==0.66
# Data with 100% confidence
X_train_1=X_train[idx1]
y_train_1=y_train[idx1]
# Data with 66% confidence
X_train_66=X_train[idx66]
y_train_66=y_train[idx66]

print(f"Data with 100% Confidence: {X_train_1.shape}")
print(f"Data with 66% Confidence: {X_train_66.shape}")
print(f"Total: {X_train_1.shape[0]+X_train_66.shape[0]}")

Data with 100% Confidence: (186, 4608)
Data with 66% Confidence: (471, 4608)
Total: 657


In [41]:
# Splitting data based on confidence scores
idx1a=  yadditional==1.0 
idx66a= yadditional==0.66
# Data with 100% confidence
X_train_1a=Xa[idx1a]
y_train_1a=ya[idx1a]
# Data with 66% confidence
X_train_66a=Xa[idx66a]
y_train_66a=ya[idx66a]

print(f"Data with 100% Confidence: {X_train_1a.shape}")
print(f"Data with 66% Confidence: {X_train_66a.shape}")
print(f"Total: {X_train_1a.shape[0]+X_train_66a.shape[0]}")

Data with 100% Confidence: (1567, 4608)
Data with 66% Confidence: (4342, 4608)
Total: 5909


In [10]:
# All 100% Confidence 
# Support Vecotor Machine 90%
X_a1cg1 = np.vstack([X_train_1a, X_train_1])
Y_a1cg1 = np.hstack([y_train_1a, y_train_1])
print(X_a1cg1.shape)

# Support Vector Machine
# K Best 2000
K_Best = SelectKBest(chi2, k=2000)
K_Best.fit(X_a1cg1, Y_a1cg1)
X_current2000 = K_Best.transform(X_a1cg1)
# Binarization
binarization = preprocessing.binarize(X_current2000)
SVM_Model = svm.SVC(kernel='rbf', C=1)
SVM_Model.fit(binarization, Y_a1cg1)
scores = cross_val_score(SVM_Model, binarization, Y_a1cg1, cv=5)
print(f"SVM - For combined data where confidence is 100% accuracy is {scores.mean()}")

# Random Forest
RandomForest_Model = RandomForestClassifier(random_state=0, n_estimators=500)
RandomForest_Model.fit(X_a1cg1, Y_a1cg1)
scores = cross_val_score(RandomForest_Model, X_a1cg1, Y_a1cg1, cv=5)
print(f"Random Forest - For combined data where confidence is 100% accuracy is {scores.mean()}")

# Both Combined
estimators = [('rf', RandomForestClassifier(n_estimators=500, random_state=42)),
             ('svr', make_pipeline(MinMaxScaler(), LinearSVC(random_state=42)))]

combined = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
combined.fit(X_a1cg1, Y_a1cg1)
scores = cross_val_score(combined, X_a1cg1, Y_a1cg1, cv=5)
print(f"Combined - For combined data where confidence is 100% accuracy is {scores.mean()}")

(1753, 4608)
SVM - For combined data where confidence is 100% accuracy is 0.9007456247456247
Random Forest - For combined data where confidence is 100% accuracy is 0.8961953601953601
Combined - For combined data where confidence is 100% accuracy is 0.9115750915750915


In [42]:
# Support Vector Machine
# K Best 2000
K_Best = SelectKBest(chi2, k=1500)
K_Best.fit(X_a1cg1, Y_a1cg1)
X_current = K_Best.transform(X_a1cg1)
# Both Combined
estimators = [('rf', RandomForestClassifier(n_estimators=500, random_state=42)),
             ('svr', make_pipeline(MinMaxScaler(), LinearSVC(random_state=42)))]

combined = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
combined.fit(X_current, Y_a1cg1)
scores = cross_val_score(combined, X_current, Y_a1cg1, cv=5)
print(f"Combined - For combined data where confidence is 100% accuracy is {scores.mean()}")

Combined - For combined data where confidence is 100% accuracy is 0.9053170533170534


In [45]:
# Prediction
X_current = K_Best.transform(X_test)
predictions = combined.predict(X_current)

In [47]:
# CSV Setup
datas='ID,prediction\n'
for i, yy in enumerate(predictions):
    data=str(i+1)+','+str( int(yy) )
    datas+=data+'\n'
    print(yy)

# Create CSV
with open('Final Prediction F.csv', 'w') as pf:
    pf.write(datas)

1.0
1.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
0.0
1.0
1.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
1.0
0.0
0.0
1.0
1.0
1.0
1.0
1.0
0.0
1.0
0.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
0.0
1.0
0.0
0.0
1.0
1.0
0.0
1.0
0.0
1.0
1.0
0.0
0.0
0.0
1.0
1.0
0.0
1.0
1.0
0.0
0.0
1.0
1.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
1.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
0.0
1.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
0.0
1.0
0.0
1.0
0.0
0.0
1.0
0.0
1.0
1.0
0.0
1.0
1.0
0.0
1.0
0.0
1.0
1.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
1.0
1.0
1.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
1.0
1.0
0.0
1.0
0.0
1.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
0.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
0.0
1.0
0.0
1.0
1.0
0.0
0.0
1.0
0.0
1.0
0.0
0.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0


In [12]:
# All 100% Confidence GIST
xGist_1 = X_a1cg1[:, 4096:]
yGist_1 = Y_a1cg1
print(yGist_1.shape)

# Support Vector Machine
# K Best 400
K_Best = SelectKBest(chi2, k=400)
K_Best.fit(xGist_1, yGist_1)
X_current2000 = K_Best.transform(xGist_1)
# Binarization
binarization = preprocessing.binarize(X_current2000)
SVM_Model = svm.SVC(kernel='rbf', C=1)
SVM_Model.fit(binarization, yGist_1)
scores = cross_val_score(SVM_Model, binarization, yGist_1, cv=5)
print(f"SVM - For combined GIST data where confidence is 100% accuracy is {scores.mean()}")

# Random Forest
RandomForest_Model = RandomForestClassifier(random_state=0, n_estimators=500)
RandomForest_Model.fit(xGist_1, yGist_1)
scores = cross_val_score(RandomForest_Model, xGist_1, yGist_1, cv=5)
print(f"Random Forest - For combined GIST data where confidence is 100% accuracy is {scores.mean()}")

(1753,)
SVM - For combined GIST data where confidence is 100% accuracy is 0.7404444444444446
Random Forest - For combined GIST data where confidence is 100% accuracy is 0.8174603174603174


In [13]:
# CNN + GIST 66% Confidence
# Additional Data 100% Confidence 
X_a1cg6 = np.vstack([X_train_1a, X_train])
Y_a1cg6 = np.hstack([y_train_1a, y_train])
print(X_a1cg6.shape)

# Support Vector Machine
# K Best 2000
K_Best = SelectKBest(chi2, k=2000)
K_Best.fit(X_a1cg6, Y_a1cg6)
X_current2000 = K_Best.transform(X_a1cg6)
# Binarization
binarization = preprocessing.binarize(X_current2000)
SVM_binarize = svm.SVC(kernel='rbf', C=1)
SVM_binarize.fit(binarization, Y_a1cg6)
scores = cross_val_score(SVM_binarize, binarization, Y_a1cg6, cv=5)
print(f"SVM - When additional data has 100% confidence accuracy is {scores.mean()}")

# Random Forest
RandomForest_Model = RandomForestClassifier(random_state=0, n_estimators=500)
RandomForest_Model.fit(X_a1cg6, Y_a1cg6)
scores = cross_val_score(RandomForest_Model, X_a1cg6, Y_a1cg6, cv=5)
print(f"Random Forest - When additional data has 100% confidence accuracy is {scores.mean()}")

(2224, 4608)
SVM - When additional data has 100% confidence accuracy is 0.855630124506529
Random Forest - When additional data has 100% confidence accuracy is 0.8511377669804636


In [14]:
# CNN + GIST Only
X_CnnGist = X_train
Y_CnnGist = y_train
print(X_CnnGist.shape)

# Support Vector Machine
# K Best 2000
K_Best = SelectKBest(chi2, k=2000)
K_Best.fit(X_CnnGist, Y_CnnGist)
X_current2000 = K_Best.transform(X_CnnGist)
# Binarization
binarization = preprocessing.binarize(X_current2000)
SVM_Model = svm.SVC(kernel='rbf', C=1)
SVM_Model.fit(binarization, Y_CnnGist)
scores = cross_val_score(SVM_Model, binarization, Y_CnnGist, cv=5)
print(f"SVM - The accuracy for the GIST+CNN data is {scores.mean()}")

# Random Forest
RandomForest_Model = RandomForestClassifier(random_state=0, n_estimators=500)
RandomForest_Model.fit(X_CnnGist, Y_CnnGist)
scores = cross_val_score(RandomForest_Model, X_CnnGist, Y_CnnGist, cv=5)
print(f"Random Forest - The accuracy for the GIST+CNN data is {scores.mean()}")

(657, 4608)
SVM - The accuracy for the GIST+CNN data is 0.7670714781401804
Random Forest - The accuracy for the GIST+CNN data is 0.7610108720795744


In [15]:
# CNN + Additional
# Support Vector Machine 54%
xTarin = X_train[:, :4096]
xxTrain = Xa[:, :4096]
X_Acnn = np.vstack([xxTrain, xTarin])
Y_Acnn = np.hstack([ya, y_train])
print(Y_Acnn.shape)

# Support Vector Machine
# K Best 2000
K_Best = SelectKBest(chi2, k=2000)
K_Best.fit(X_Acnn, Y_Acnn)
X_current2000 = K_Best.transform(X_Acnn)
# Binarization
binarization = preprocessing.binarize(X_current2000)
SVM_Model = svm.SVC(kernel='rbf', C=1)
SVM_Model.fit(binarization, Y_Acnn)
scores = cross_val_score(SVM_Model, binarization, Y_Acnn, cv=5)
print(f"SVM - The accuracy for all the data combined (including additional data and GIST) is {scores.mean()}")

# Random Forest
RandomForest_Model = RandomForestClassifier(random_state=0, n_estimators=500)
RandomForest_Model.fit(X_Acnn, Y_Acnn)
scores = cross_val_score(RandomForest_Model, X_Acnn, Y_Acnn, cv=5)
print(f"Random Forest - The accuracy for all the data combined (including additional data and GIST) is {scores.mean()}")

(6566,)
SVM - The accuracy for all the data combined (including additional data and GIST) is 0.7488570564116476
Random Forest - The accuracy for all the data combined (including additional data and GIST) is 0.7482483443286373


In [16]:
# GIST + Additional
xTrain = X_train[:, 4096:]
xxTrain = Xa[:, 4096:]
X_Agist = np.vstack([xxTrain, xTrain])
Y_Agist = np.hstack([ya, y_train])
print(X_Agist.shape)

# Support Vector Machine
# K Best 400
K_Best = SelectKBest(chi2, k=400)
K_Best.fit(X_Agist, Y_Agist)
X_current2000 = K_Best.transform(X_Agist)

# Binarization
binarization = preprocessing.binarize(X_current2000)
SVM_Model = svm.SVC(kernel='rbf', C=1)
SVM_Model.fit(binarization, Y_Agist)
scores = cross_val_score(SVM_Model, binarization, Y_Agist, cv=5)
print(f"SVM - The accuracy for all GIST features (including additional data) is {scores.mean()}")

# Random Forest
RandomForest_Model = RandomForestClassifier(random_state=0, n_estimators=500)
RandomForest_Model.fit(X_Agist, Y_Agist)
scores = cross_val_score(RandomForest_Model, X_Agist, Y_Agist, cv=5)
print(f"Random Forest - The accuracy for all GIST features (including additional data) is {scores.mean()}")


(6566, 512)
SVM - The accuracy for all GIST features (including additional data) is 0.5388364336960567
Random Forest - The accuracy for all GIST features (including additional data) is 0.6815389020461582


In [17]:
# CNN Only
X_CNN = X_train[:, :4096]
Y_CNN = y_train[:4096]
Test_CNN = X_test[:, :4096]
print(X_CNN.shape)

# Support Vector Machine
# K Best 400
K_Best = SelectKBest(chi2, k=400)
K_Best.fit(X_CNN, Y_CNN)
X_current2000 = K_Best.transform(X_CNN)

# Binarization
binarization = preprocessing.binarize(X_current2000)
SVM_Model = svm.SVC(kernel='rbf', C=1)
SVM_Model.fit(binarization, Y_CNN)
scores = cross_val_score(SVM_Model, binarization, Y_CNN, cv=5)
print(f"The accuracy for all CNN features only is {scores.mean()}")

# Random Forest
RandomForest_Model = RandomForestClassifier(random_state=0, n_estimators=500)
RandomForest_Model.fit(X_CNN, Y_CNN)
scores = cross_val_score(RandomForest_Model, X_CNN, Y_CNN, cv=5)
print(f"Random Forest - The accuracy for all GIST features (including additional data) is {scores.mean()}")


(657, 4096)
The accuracy for all CNN features only is 0.7579227388387693
Random Forest - The accuracy for all GIST features (including additional data) is 0.7518390006939626


In [18]:
# GIST Only
X_GIST = X_train[:, 4096:]
Y_GIST = y_train
Test_GIST = X_test[:, 4096:]
print(X_GIST.shape)

# Support Vector Machine
# K Best 400
K_Best = SelectKBest(chi2, k=400)
K_Best.fit(X_GIST, Y_GIST)
X_current2000 = K_Best.transform(X_GIST)

# Binarization
binarization = preprocessing.binarize(X_current2000)
SVM_Model = svm.SVC(kernel='rbf', C=1)
SVM_Model.fit(binarization, Y_GIST)
scores = cross_val_score(SVM_Model, binarization, Y_GIST, cv=5)
print(f"The accuracy for all GIST features only is {scores.mean()}")

(657, 512)
The accuracy for all GIST features only is 0.535762202174416


In [19]:
# Additional Only
X_add = Xa
Y_add = ya
print(X_add.shape)

# Support Vector Machine
# K Best 400
K_Best = SelectKBest(chi2, k=400)
K_Best.fit(X_add, Y_add)
X_current2000 = K_Best.transform(X_add)

# Binarization
binarization = preprocessing.binarize(X_current2000)
SVM_Model = svm.SVC(kernel='rbf', C=1)
SVM_Model.fit(binarization, Y_add)
scores = cross_val_score(SVM_Model, binarization, Y_add, cv=5)
print(f"The accuracy for the additional data only is {scores.mean()}")

(5909, 4608)
The accuracy for the additional data only is 0.7442880864677759


In [20]:
# Experiment 1 - Finding the best parameters for SVM

# C = 1
SVM_param1 = svm.SVC(kernel='rbf', C=1)
SVM_param1.fit(X_CnnGist, Y_CnnGist)
scores = cross_val_score(SVM_param1, X_CnnGist, Y_CnnGist, cv=5)
print(f"When C = 1 accuracy is {scores.mean()}")

# C = 5
SVM_param5 = svm.SVC(kernel='rbf', C=5)
SVM_param5.fit(X_CnnGist, Y_CnnGist)
scores = cross_val_score(SVM_param5, X_CnnGist, Y_CnnGist, cv=5)
print(f"When C = 5 accuracy is {scores.mean()}")

# C = 10
SVM_param10 = svm.SVC(kernel='rbf', C=50)
SVM_param10.fit(X_CnnGist, Y_CnnGist)
scores = cross_val_score(SVM_param10, X_CnnGist, Y_CnnGist, cv=5)
print(f"When C = 10 accuracy is {scores.mean()}")

When C = 1 accuracy is 0.7426786953504511
When C = 5 accuracy is 0.7122484385843164
When C = 10 accuracy is 0.7122484385843164


In [21]:
# Experiment 2 - Finding the best kernel for SVM

# RBF Kernel
SVM_paramrbf = svm.SVC(kernel='rbf', C=1)
SVM_paramrbf.fit(X_CnnGist, Y_CnnGist)
scores = cross_val_score(SVM_paramrbf, X_CnnGist, Y_CnnGist, cv=5)
print(f"When the kernel is 'rbf', accuracy is {scores.mean()}")

# Polynomial Kernel
SVM_parampoly = svm.SVC(kernel='poly', C=1)
SVM_parampoly.fit(X_CnnGist, Y_CnnGist)
scores = cross_val_score(SVM_parampoly, X_CnnGist, Y_CnnGist, cv=5)
print(f"When the kernel is 'poly', accuracy is {scores.mean()}")

# Linear Kernel
SVM_paramlinear = svm.SVC(kernel='linear', C=1)
SVM_paramlinear.fit(X_CnnGist, Y_CnnGist)
scores = cross_val_score(SVM_paramlinear, X_CnnGist, Y_CnnGist, cv=5)
print(f"When the kernel is 'linear', accuracy is {scores.mean()}")

When the kernel is 'rbf', accuracy is 0.7426786953504511
When the kernel is 'poly', accuracy is 0.7259426324311822
When the kernel is 'linear', accuracy is 0.6650821188989128


In [25]:
# Experiment 3 - Feature selection


# K Best 2000
K_Best = SelectKBest(chi2, k=2000)
K_Best.fit(X_CnnGist, Y_CnnGist)
X_current2000 = K_Best.transform(X_CnnGist)
Test_Current2000 = K_Best.transform(X_test)

SVM_paramk2000 = svm.SVC(kernel='rbf', C=1)
SVM_paramk2000.fit(X_current2000, Y_CnnGist)
scores = cross_val_score(SVM_paramk2000, X_current2000, Y_CnnGist, cv=5)
print(f"When k best is 2000, accuracy is {scores.mean()}")


# K Best 2500
K_Best = SelectKBest(chi2, k=2500)
K_Best.fit(X_CnnGist, Y_CnnGist)
X_current2500 = K_Best.transform(X_CnnGist)
Test_Current2500 = K_Best.transform(X_test)

SVM_paramk2500 = svm.SVC(kernel='rbf', C=1)
SVM_paramk2500.fit(X_current2500, Y_CnnGist)
scores = cross_val_score(SVM_paramk2500, X_current2500, Y_CnnGist, cv=5)
print(f"When k best is 2500, accuracy is {scores.mean()}")

# K Best 3000
K_Best = SelectKBest(chi2, k=3000)
K_Best.fit(X_CnnGist, Y_CnnGist)
X_current3000 = K_Best.transform(X_CnnGist)
Test_Current3000 = K_Best.transform(X_test)

SVM_paramk3000 = svm.SVC(kernel='rbf', C=1)
SVM_paramk3000.fit(X_current3000, Y_CnnGist)
scores = cross_val_score(SVM_paramk3000, X_current3000, Y_CnnGist, cv=5)
print(f"When k best is 3000, accuracy is {scores.mean()}")

When k best is 2000, accuracy is 0.7503238491788109
When k best is 2500, accuracy is 0.7472588480222068
When k best is 3000, accuracy is 0.7426902613925515


In [26]:
# Experiment 4 - Preprocessing


# Standardization
standardisation = preprocessing.StandardScaler()
standardize = standardisation.fit_transform(X_current2500)
SVM_standardize = svm.SVC(kernel='rbf', C=1)
SVM_standardize.fit(standardize, Y_CnnGist)
scores = cross_val_score(SVM_standardize, standardize, Y_CnnGist, cv=5)
print(f"When using Standardization, accuracy is {scores.mean()}")

# Min Max
scaling = MinMaxScaler()
minmax = scaling.fit_transform(X_current2500)
SVM_minmax = svm.SVC(kernel='rbf', C=1)
SVM_minmax.fit(minmax, Y_CnnGist)
scores = cross_val_score(SVM_minmax, minmax, Y_CnnGist, cv=5)
print(f"When using MinMax Scaling, accuracy is {scores.mean()}")


# Binarization
binarization = preprocessing.binarize(X_current2500)
SVM_binarize = svm.SVC(kernel='rbf', C=1)
SVM_binarize.fit(binarization, Y_CnnGist)
scores = cross_val_score(SVM_binarize, binarization, Y_CnnGist, cv=5)
print(f"When using binarization, accuracy is {scores.mean()}")



When using Standardization, accuracy is 0.7518158686097617
When using MinMax Scaling, accuracy is 0.7518274346518621
When using binarization, accuracy is 0.7640180430256766
