In [2]:
import numpy as np
import pandas as pd
import os
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.svm import SVC

from sklearn.preprocessing import MinMaxScaler

#from dfply import arrange

from sklearn.utils import resample
from sklearn.metrics import cohen_kappa_score, make_scorer

In [3]:
#path1 = "D:/tunc_oz/apply_model"
path1 = "/home/rsrg9/Documents/tunc_oz/apply_model"
os.chdir(path1)

# Second path
path2 = "csv_data_r_import/cologne/scale"
os.chdir(path2)

In [141]:
def svm_fit(x, y, index_train=None):
    # Expand coarse grid
    coarse_grid = {'C': 2.0 ** np.arange(-4, 13, 2),
                   'gamma': 2.0 ** np.arange(-5, 4, 2)}
    
    kappa_scorer = make_scorer(cohen_kappa_score)

    # Coarse grid search
    svm_coarse = SVC(kernel='rbf')
    svm_coarse_cv = GridSearchCV(svm_coarse, param_grid=coarse_grid, scoring=kappa_scorer)#, cv=index_train)
    svm_coarse_cv.fit(x, y)
    
    # Get best coarse grid parameters
    best_c = svm_coarse_cv.best_params_['C']
    best_gamma = svm_coarse_cv.best_params_['gamma']
    
    # Define narrow grid borders
    a_gamma = np.log2(best_gamma) - 2
    b_gamma = np.log2(best_gamma) + 2
    a_c = np.log2(best_c) - 2
    b_c = np.log2(best_c) + 2
    
    # Expand narrow grid
    narrow_grid = {'C': 2.0 ** np.arange(a_c, b_c, 0.5),
                   'gamma': 2.0 ** np.arange(a_gamma, b_gamma, 0.5)}
    
    # Narrow grid search
    svm_narrow = SVC(kernel='rbf')
    svm_narrow_cv = GridSearchCV(svm_narrow, param_grid=narrow_grid, scoring=kappa_scorer)#, cv=index_train)
    svm_narrow_cv.fit(x, y)
    
    return svm_narrow_cv

# Usage example:
# svm_model = svm_fit(X_train, y_train, StratifiedKFold(n_splits=10, shuffle=True))

# Euclidean Distance between two points lying in the input space
def euc_dis(a, b):
    temp = 0
    for ii in range(len(a)):
        temp += (a[ii] - b[ii])**2
    return np.sqrt(temp)

# Evaluate the distance between Virtual Support Vectors and Support Vectors lying in the input space
def rem_extrem(org, VSV1, a):
    distance = pd.DataFrame(index=range(len(org)), columns=['label', 'distance'])
    distanceSVC1 = []
    distanceSVC2 = []
    
    for l in range(len(org)):
        distance.loc[l, 'label'] = str(org.iloc[l, -1])
        distance.loc[l, 'distance'] = euc_dis(org.iloc[l, :-1], VSV1.iloc[l, :-1])
    
    SVClass1 = org[org['REF'] == org['REF'].unique()[0]]
    SVClass2 = org[org['REF'] == org['REF'].unique()[1]]
    
    if len(SVClass1) > 0:
        for n in range(len(SVClass1) - 1):
            for nn in range(n, len(SVClass1) - 1):
                distanceSVC1.append(euc_dis(SVClass1.iloc[n, :-1], SVClass1.iloc[n + nn, :-1]))
        disClass1median = np.mean(distanceSVC1)
        boundClass1 = disClass1median * a
    
    if len(SVClass2) > 0:
        for n in range(len(SVClass2) - 1):
            for nn in range(n, len(SVClass2) - 1):
                distanceSVC2.append(euc_dis(SVClass2.iloc[n, :-1], SVClass2.iloc[n + nn, :-1]))
        disClass2median = np.mean(distanceSVC2)
        boundClass2 = disClass2median * a
    
    for k in range(len(org)):
        if np.isnan(distance.loc[k, 'distance']):
            VSV1.iloc[k, :] = np.nan
        else:
            if boundClass1 is not None:
                if distance.loc[k, 'label'] == org['REF'].unique()[0]:
                    if distance.loc[k, 'distance'] > boundClass1:
                        VSV1.iloc[k, :] = np.nan
            else:
                if boundClass2 is not None:
                    if distance.loc[k, 'label'] == org['REF'].unique()[1]:
                        if distance.loc[k, 'distance'] > boundClass2:
                            VSV1.iloc[k, :] = np.nan
    return VSV1

# Kernel distance between two points lying in the hyperspace
def kern_dis(a, b, kernelfunc):
    a = np.array(a).flatten()
    b = np.array(b).flatten()
    dk = np.sqrt(kernelfunc(a, a) + kernelfunc(b, b) - 2 * kernelfunc(a, b))
    return dk

# Evaluate the distance between Virtual Support Vectors and Support Vectors lying in the hyperspace
def rem_extrem_kerneldist(org, VSV1, a, kernelfunc):
    distance = pd.DataFrame(index=range(len(org)), columns=['label', 'distance'])
    distanceSVC1 = []
    distanceSVC2 = []
    
    for l in range(len(org)):
        distance.loc[l, 'label'] = str(org.iloc[l, -1])
        distance.loc[l, 'distance'] = kern_dis(org.iloc[l, :-1], VSV1.iloc[l, :-1], kernelfunc)
    
    SVClass1 = org[org['REF'] == org['REF'].unique()[0]]
    SVClass2 = org[org['REF'] == org['REF'].unique()[1]]
    
    if len(SVClass1) > 0:
        for n in range(len(SVClass1) - 1):
            for nn in range(n, len(SVClass1) - 1):
                distanceSVC1.append(kern_dis(SVClass1.iloc[n, :-1], SVClass1.iloc[n + nn, :-1], kernelfunc))
        disClass1median = np.mean(distanceSVC1)
        boundClass1 = disClass1median * a
    
    if len(SVClass2) > 0:
        for n in range(len(SVClass2) - 1):
            for nn in range(n, len(SVClass2) - 1):
                distanceSVC2.append(kern_dis(SVClass2.iloc[n, :-1], SVClass2.iloc[n + nn, :-1], kernelfunc))
        disClass2median = np.mean(distanceSVC2)
        boundClass2 = disClass2median * a
    
    for k in range(len(org)):
        if np.isnan(distance.loc[k, 'distance']):
            VSV1.iloc[k, :] = np.nan
        else:
            if boundClass1 is not None:
                if distance.loc[k, 'label'] == org['REF'].unique()[0]:
                    if distance.loc[k, 'distance'] > boundClass1:
                        VSV1.iloc[k, :] = np.nan
            else:
                if boundClass2 is not None:
                    if distance.loc[k, 'label'] == org['REF'].unique()[1]:
                        if distance.loc[k, 'distance'] > boundClass2:
                            VSV1.iloc[k, :] = np.nan
    return VSV1

def pred_one(model, data_point):
    # Extract necessary components from the SVM model
    support_vectors = model.n_support_
    kernel_function = model.kernel
    coefficients = model.dual_coef_.ravel()
    intercept = model.intercept_
    
    # Initialize prediction variable
    prediction = 0
    
    # Iterate over each support vector
    for j in range(len(support_vectors)):
        # Compute kernel function value between the j-th support vector and the data point
        kernel_value = kernel_function(data_point.reshape(1, -1), model.support_vectors_[j, :].reshape(1, -1))
        
        # Multiply kernel value by the corresponding coefficient and add to prediction
        weighted_value = kernel_value * coefficients[j]
        prediction += weighted_value
    
    # Subtract intercept to get the final prediction
    final_prediction = prediction - intercept
    
    return final_prediction

def uncertainty_dist_v2_2(org, samp):
    distance = pd.DataFrame(columns=['control_label', 'distance'], index=range(len(samp)))
    
    for k in range(len(samp)):
        distance.loc[k, 'distance'] = np.sign(pred_one(org.finalModel, samp.iloc[k, :-1])) * \
                                      np.where(pred_one(org.finalModel, samp.iloc[k, :-1]) > 0, 1, -1)
    
    # Normalize distance
    preProc = preprocessing.MinMaxScaler()
    preProc.fit(distance[['distance']])
    normdistance = preProc.transform(distance[['distance']])
    
    samp['normdistance'] = normdistance
    
    return samp

def alter_labels(distance_data, ref):
    # Merge features and original labels
    ref_added = pd.concat([distance_data, ref], axis=1)
    # Order by most uncertain samples
    ref_added_or = ref_added.sort_values(by='distance')
    # Re-label most uncertain n number of samples
    ref_added_or.iloc[:250, -1] = ref_added_or.iloc[:250, -2]
    ref_added_or.iloc[:250, -2] = 1.0
    # Re-order dataset by its index
    ref_added_or['index'] = range(len(ref_added_or))
    ref_added_reor = ref_added_or.sort_values(by='index')
    
    # Extract labels for prediction
    labels = ref_added_reor.iloc[:, -5]
    return labels

def ExCsvMSD(datadase, filename=None):
    # Convert to numpy array
    datadase = np.array(datadase)
    n = datadase.shape[1]
    MSDdata = np.empty((2, n), dtype=float)
    
    MSDdata[0, :] = np.mean(datadase, axis=0)
    MSDdata[1, :] = np.std(datadase, axis=0)
    
    MSDdata_final = np.vstack((datadase, MSDdata))
    
    # Export final mean and standard deviation to .csv-file
    if filename is not None:
        pd.DataFrame(MSDdata_final).to_csv(filename, index=False, header=False)
    
    return MSDdata_final

In [152]:
inputPath = "cologne_res_100_L2-L13.csv"
sMax = 1000
bound = [0.3, 0.6, 0.9]
boundMargin = [1.5, 1.0, 0.5]
sampleSizesPor = [40, 25, 16, 12, 10, 8, 6, 4, 3, 2, 1]
colheader = ["40", "25", "16", "12", "10", "8", "6", "4", "3", "2", "1"]
sindexSVMDATA = 36
numFeat = 18
eindexSVMDATA = sindexSVMDATA + numFeat - 1
objInfoNames = ["Lx_g_comp", "Lx_g_elfi", "Lx_g_refi", "Lx_g_roun", "Lx_g_shin",
                "Lx_m_bl", "Lx_m_gr", "Lx_m_ndvi", "Lx_m_nir", "Lx_m_re",
                "Lx_sd_bl", "Lx_sd_gr", "Lx_sd_ndvi", "Lx_sd_nir", "Lx_sd_re",
                "Lx_t_diss", "Lx_t_hom", "Lx_t_mean",
                "label"]
columnClass = [None] * 217 + ["factor", "integer"]

# Import data
preproc_DataPool = pd.read_csv(inputPath, header=0, sep=";", dtype=str, na_values=None)

tmp_DataPool = preproc_DataPool.iloc[:, :-2]

generalDataPool_columns = tmp_DataPool.columns

converters = {col: lambda x: float(x.replace(',', '.')) for col in generalDataPool_columns}
generalDataPool = pd.read_csv(inputPath, header=0, sep=";", na_values=None, converters=converters)


generalDataPool.dropna(subset=["REF"], inplace=True)  # Remove rows with missing REF values
generalDataPool["REF"] = pd.Categorical(generalDataPool["REF"])

# Transform to 2-Class-Case "Bushes Trees" VS rest
first_label_class = generalDataPool["REF"].cat.categories[0]  # Note that the first record is of class "bushes trees"
generalDataPool["REF"] = generalDataPool["REF"].apply(lambda x: first_label_class if x == first_label_class else "other")
generalDataPool["REF"] = pd.Categorical(generalDataPool["REF"])

data = generalDataPool.iloc[:, sindexSVMDATA:eindexSVMDATA + 1]
REF = generalDataPool.iloc[:, -1]
data_with_label = pd.concat([data, REF], axis=1)
data_label = data_with_label.iloc[:, -1]

In [7]:
generalDataPool.dtypes

L02_G_COMP     float64
L02_G_EFIT     float64
L02_G_RFIT     float64
L02_G_ROUN     float64
L02_G_SHIN     float64
                ...   
L13_T_DISS     float64
L13_T_HOM      float64
L13_T_MEA      float64
REF           category
USE              int64
Length: 218, dtype: object

In [18]:
#normalizedFeat['L02_G_COMP'] = normalizedFeat['L02_G_COMP'].replace(',', '.', regex=True)

#normalizedFeat['L02_G_COMP'] = normalizedFeat['L02_G_COMP'].astype(float)

In [None]:
#normalizedFeat['L02_G_EFIT'] = normalizedFeat['L02_G_EFIT'].apply(lambda x: pd.to_numeric(x.str.replace(',', '.'), errors='coerce'))

In [153]:
normalizedFeat = generalDataPool.iloc[:, :-2]
normalizedLabelUSE = generalDataPool.iloc[:, -2:]

# Scaling
preProc = MinMaxScaler()
normalizedFeatBase = pd.DataFrame(preProc.fit_transform(normalizedFeat.iloc[:, sindexSVMDATA:eindexSVMDATA + 1]), columns=objInfoNames[:-1])

# Apply range of basemodel to all levels
normalizedFeat2 = pd.DataFrame(preProc.fit_transform(normalizedFeat.iloc[:, :numFeat]), columns=objInfoNames[:-1])
normalizedFeat3 = pd.DataFrame(preProc.fit_transform(normalizedFeat.iloc[:, numFeat:(2 * numFeat)]), columns=objInfoNames[:-1])
normalizedFeat5 = pd.DataFrame(preProc.fit_transform(normalizedFeat.iloc[:, (3 * numFeat):(4 * numFeat)]), columns=objInfoNames[:-1])
normalizedFeat6 = pd.DataFrame(preProc.fit_transform(normalizedFeat.iloc[:, (4 * numFeat):(5 * numFeat)]), columns=objInfoNames[:-1])
normalizedFeat7 = pd.DataFrame(preProc.fit_transform(normalizedFeat.iloc[:, (5 * numFeat):(6 * numFeat)]), columns=objInfoNames[:-1])
normalizedFeat8 = pd.DataFrame(preProc.fit_transform(normalizedFeat.iloc[:, (6 * numFeat):(7 * numFeat)]), columns=objInfoNames[:-1])
normalizedFeat9 = pd.DataFrame(preProc.fit_transform(normalizedFeat.iloc[:, (7 * numFeat):(8 * numFeat)]), columns=objInfoNames[:-1])
normalizedFeat10 = pd.DataFrame(preProc.fit_transform(normalizedFeat.iloc[:, (8 * numFeat):(9 * numFeat)]), columns=objInfoNames[:-1])
normalizedFeat11 = pd.DataFrame(preProc.fit_transform(normalizedFeat.iloc[:, (9 * numFeat):(10 * numFeat)]), columns=objInfoNames[:-1])

# Recombine normalized sets to one data frame
normalizedDataPoolAllLev = pd.concat([normalizedFeat2, normalizedFeat3, normalizedFeatBase, normalizedFeat5,
                                      normalizedFeat6, normalizedFeat7, normalizedFeat8, normalizedFeat9,
                                      normalizedFeat10, normalizedFeat11, normalizedLabelUSE], axis=1)

# Remove used temporary variables
del normalizedFeat, normalizedFeat2, normalizedFeat3, normalizedFeatBase, normalizedFeat5, normalizedFeat6
del normalizedFeat7, normalizedFeat8, normalizedFeat9, normalizedFeat10, normalizedFeat11


In [154]:
# Split data into test, train, and validate data
trainDataPoolAllLev, testDataAllLev, validateDataAllLev = [df for _, df in normalizedDataPoolAllLev.groupby('USE')]
trainDataPoolAllLev = trainDataPoolAllLev.iloc[:, :-1]
testDataAllLev = testDataAllLev.iloc[:, :-1]
validateFeatAllLev = validateDataAllLev.iloc[:, :-2]
validateLabels = validateDataAllLev.iloc[:, -2]

# Order train data pool by class label in alphabetical order
trainDataPoolAllLev = trainDataPoolAllLev.sort_values(by=trainDataPoolAllLev.columns[-1])

# Current training data-set, updated (refreshed) after each iteration
trainDataCur = trainDataPoolAllLev.copy()
testDataCur = testDataAllLev.copy()

# Set randomized seed for the random sampling procedure
seed = 5

# Initial seed value for randomized sampling
seed += np.random.randint(1, 101)

# Definition of apriori-probabilities
pA = pB = pC = pD = pE = pF = 1 / 6

# Definition of training sample set sizes S [% of max. sample size]
sCur = sMax * (sampleSizesPor[0] / 100)
# Definition of sample shares
nA, nB, nC, nD, nE, nF = [round(sCur * p) for p in [pA, pB, pC, pD, pE, pF]]
shares = np.array([nA, nB, nC, nD, nE, nF])

# Set randomized seed for the random sampling procedure
np.random.seed(seed)

In [116]:
validateDataAllLev.columns[-2]

'REF'

In [32]:
#stratSamp = trainDataCur.groupby('REF', observed=False)[trainDataCur.columns].apply(lambda x: x.sample(67, replace=False))

In [155]:
# Define the sampling function
def sample_within_group(group):
    # Add the original IDs as a new column
    group['ID_unit'] = group.index
    
    # Perform sampling within the group
    sampled_group = group.sample(min(len(group), 67), replace=False)
    
    return sampled_group

# Apply the sampling function to each group
stratSamp = trainDataCur.groupby('REF', observed=False).apply(sample_within_group)

# Reset the index to obtain a flat DataFrame with the original IDs preserved
stratSamp.reset_index(drop=True, inplace=True)

# Get samples of trainDataCur and set trainDataCur new
trainDataCurRemaining = trainDataCur.drop(stratSamp["ID_unit"])

# Split test feat from test label for later join with trainData
trainFeat = stratSamp.iloc[:, :len(trainDataPoolAllLev.columns)-1]
trainLabels = stratSamp.iloc[:, len(trainDataPoolAllLev.columns)-1]

  stratSamp = trainDataCur.groupby('REF', observed=False).apply(sample_within_group)


In [101]:
len(trainDataPoolAllLev.columns)

181

In [102]:
trainFeat.columns

Index(['Lx_g_comp', 'Lx_g_elfi', 'Lx_g_refi', 'Lx_g_roun', 'Lx_g_shin',
       'Lx_m_bl', 'Lx_m_gr', 'Lx_m_ndvi', 'Lx_m_nir', 'Lx_m_re',
       ...
       'Lx_m_nir', 'Lx_m_re', 'Lx_sd_bl', 'Lx_sd_gr', 'Lx_sd_ndvi',
       'Lx_sd_nir', 'Lx_sd_re', 'Lx_t_diss', 'Lx_t_hom', 'Lx_t_mean'],
      dtype='object', length=180)

In [104]:
stratSamp.columns[len(trainDataPoolAllLev.columns)-1]

'REF'

In [30]:
#stratSamp = trainDataCur.groupby('REF',observed=False)
#stratSamp = stratSamp.apply(lambda x: x.sample(67, replace=False))

In [None]:
# Definition of sampling configuration (strata: random sampling without replacement)
#stratSamp = trainDataCur.groupby('REF').apply(lambda x: x.sample(shares, replace=False)).reset_index(drop=True)

In [46]:
trainDataCur.columns[-1]

'Lx_t_diss'

In [58]:
stratSamp.iloc[:,181]

0      141133
1      311080
2       27290
3       25174
4        5752
        ...  
129    199495
130    348052
131     63271
132    104838
133     45255
Name: ID_unit, Length: 134, dtype: int64

In [57]:
stratSamp["ID_unit"]

0      141133
1      311080
2       27290
3       25174
4        5752
        ...  
129    199495
130    348052
131     63271
132    104838
133     45255
Name: ID_unit, Length: 134, dtype: int64

In [None]:
trainFeat.describe

In [156]:

# Subset for each outer iteration test data to speed up computing
testDataCur = testDataCur.sort_values(by=testDataCur.columns[-1])
# Apply the sampling function to each group
stratSamp = testDataCur.groupby('REF', observed=False).apply(sample_within_group)

# Split test feat from test label for later join with trainData
testFeat = stratSamp.iloc[:, :len(testDataCur.columns)-1]
testLabels = stratSamp.iloc[:, len(testDataCur.columns)-1]

# Subset on base level
testFeatsub = testFeat.iloc[:, sindexSVMDATA:eindexSVMDATA + 1]

# TrainData index to split between train and test in svmFit
countTrainData = trainFeat.shape[0]
indexTrainData = [list(range(1, countTrainData + 1))]

# SVM base for invariants

# Subset on L_4
trainFeat = trainFeat.iloc[:, sindexSVMDATA:eindexSVMDATA + 1]

# Join train and test data (separable through indexTrainData in svmFit)
tuneFeat = pd.concat([trainFeat, testFeatsub], axis=0)
tuneLabel = np.concatenate((trainLabels.values, testLabels.values))

validateFeatsub = validateFeatAllLev.iloc[:, sindexSVMDATA:eindexSVMDATA+1]

  stratSamp = testDataCur.groupby('REF', observed=False).apply(sample_within_group)


In [76]:
from sklearn import datasets
iris = datasets.load_iris()
# Expand coarse grid
coarse_grid = {'C': 2.0 ** np.arange(-4, 13, 2),
                'gamma': 2.0 ** np.arange(-5, 4, 2)}


# Coarse grid search
svm_coarse = SVC(kernel='rbf')
svm_coarse_cv = GridSearchCV(svm_coarse, param_grid=coarse_grid, scoring='accuracy')#, cv=indexTrainData)
svm_coarse_cv.fit(iris.data, iris.target)

In [107]:
tuneFeat.describe

<bound method NDFrame.describe of                  Lx_g_elfi  Lx_g_refi  Lx_g_roun  Lx_g_shin   Lx_m_bl  \
0                 0.881864   0.099390   0.033490   0.043741  0.041110   
1                 0.772282   0.222935   0.136538   0.036692  0.039930   
2                 0.839724   0.136943   0.070390   0.035988  0.034338   
3                 0.703653   0.346674   0.221069   0.037933  0.033814   
4                 0.763603   0.296652   0.222943   0.026393  0.019910   
...                    ...        ...        ...        ...       ...   
(other, 385192)   0.819644   0.159218   0.150555   0.155139  0.206465   
(other, 722141)   0.129779   0.623823   0.431590   0.067814  0.070626   
(other, 647980)   0.616562   0.336806   0.330787   0.114776  0.126091   
(other, 452074)   0.789957   0.383627   0.285932   0.039511  0.033419   
(other, 488555)   0.478919   0.686964   0.702629   0.035380  0.030130   

                  Lx_m_gr  Lx_m_ndvi  Lx_m_nir   Lx_m_re  Lx_sd_bl  Lx_sd_gr  \
0        

In [108]:
tuneLabel

array(['bushes_trees', 'bushes_trees', 'bushes_trees', 'bushes_trees',
       'bushes_trees', 'bushes_trees', 'bushes_trees', 'bushes_trees',
       'bushes_trees', 'bushes_trees', 'bushes_trees', 'bushes_trees',
       'bushes_trees', 'bushes_trees', 'bushes_trees', 'bushes_trees',
       'bushes_trees', 'bushes_trees', 'bushes_trees', 'bushes_trees',
       'bushes_trees', 'bushes_trees', 'bushes_trees', 'bushes_trees',
       'bushes_trees', 'bushes_trees', 'bushes_trees', 'bushes_trees',
       'bushes_trees', 'bushes_trees', 'bushes_trees', 'bushes_trees',
       'bushes_trees', 'bushes_trees', 'bushes_trees', 'bushes_trees',
       'bushes_trees', 'bushes_trees', 'bushes_trees', 'bushes_trees',
       'bushes_trees', 'bushes_trees', 'bushes_trees', 'bushes_trees',
       'bushes_trees', 'bushes_trees', 'bushes_trees', 'bushes_trees',
       'bushes_trees', 'bushes_trees', 'bushes_trees', 'bushes_trees',
       'bushes_trees', 'bushes_trees', 'bushes_trees', 'bushes_trees',
      

In [109]:
# Expand coarse grid
coarse_grid = {'C': 2.0 ** np.arange(-4, 13, 2),
                'gamma': 2.0 ** np.arange(-5, 4, 2)}

# Coarse grid search
svm_coarse = SVC(kernel='rbf')
svm_coarse_cv = GridSearchCV(svm_coarse, param_grid=coarse_grid, scoring='accuracy')#, cv=indexTrainData)
svm_coarse_cv.fit(tuneFeat, tuneLabel)

In [110]:
# Get best coarse grid parameters
best_c = svm_coarse_cv.best_params_['C']
best_gamma = svm_coarse_cv.best_params_['gamma']

# Define narrow grid borders
a_gamma = np.log2(best_gamma) - 2
b_gamma = np.log2(best_gamma) + 2
a_c = np.log2(best_c) - 2
b_c = np.log2(best_c) + 2

# Expand narrow grid
narrow_grid = {'C': 2.0 ** np.arange(a_c, b_c, 0.5),
                'gamma': 2.0 ** np.arange(a_gamma, b_gamma, 0.5)}

# Narrow grid search
svm_narrow = SVC(kernel='rbf')
svm_narrow_cv = GridSearchCV(svm_narrow, param_grid=narrow_grid, scoring='accuracy')#, cv=indexTrainData)
svm_narrow_cv.fit(tuneFeat, tuneLabel)


In [157]:
# SVM parameter tuning
svm_model=svm_fit(tuneFeat, tuneLabel, indexTrainData)

In [118]:
validateLabels

522473    bushes_trees
522474    bushes_trees
523472    bushes_trees
523473           other
524470    bushes_trees
              ...     
997994           other
997995           other
997996           other
997997           other
997998           other
Name: REF, Length: 333323, dtype: category
Categories (2, object): ['bushes_trees', 'other']

In [158]:

# Predict labels of test data
predLabelsSVM = svm_model.predict(validateFeatsub)

# Accuracy assessment
accSVM = accuracy_score(validateLabels, predLabelsSVM)
print(accSVM)

0.8594306423499128


In [175]:
# VSVM on all Level SV
SVindex = svm_model.best_estimator_.support_  # Indices of support vectors
SVtotal = trainDataCur.iloc[SVindex, sindexSVMDATA:eindexSVMDATA+1].reset_index(drop=True)  # Get support vectors

SVL2 = trainDataCur.iloc[SVindex, sindexSVMDATA - 2 * numFeat:sindexSVMDATA - numFeat ].reset_index(drop=True)
SVL3 = trainDataCur.iloc[SVindex, sindexSVMDATA - numFeat:sindexSVMDATA ].reset_index(drop=True)

SVL5 = trainDataCur.iloc[SVindex, sindexSVMDATA + numFeat:sindexSVMDATA + 2 * numFeat ].reset_index(drop=True)
SVL6 = trainDataCur.iloc[SVindex, sindexSVMDATA + 2 * numFeat:sindexSVMDATA + 3 * numFeat ].reset_index(drop=True)
SVL7 = trainDataCur.iloc[SVindex, sindexSVMDATA + 3 * numFeat:sindexSVMDATA + 4 * numFeat ].reset_index(drop=True)
SVL8 = trainDataCur.iloc[SVindex, sindexSVMDATA + 4 * numFeat:sindexSVMDATA + 5 * numFeat ].reset_index(drop=True)
SVL9 = trainDataCur.iloc[SVindex, sindexSVMDATA + 5 * numFeat:sindexSVMDATA + 6 * numFeat ].reset_index(drop=True)
SVL10 = trainDataCur.iloc[SVindex, sindexSVMDATA + 6 * numFeat:sindexSVMDATA + 7 * numFeat ].reset_index(drop=True)
SVL11 = trainDataCur.iloc[SVindex, sindexSVMDATA + 7 * numFeat:sindexSVMDATA + 8 * numFeat ].reset_index(drop=True)


# Bind original SV with modified to new train data set
SVinvar = pd.concat([SVtotal, SVL2, SVL3, SVL5, SVL6, SVL7, SVL8, SVL9, SVL10, SVL11],ignore_index=True)

# Split for training to feature and label
trainFeatVSVM = SVinvar.iloc[:, :-1]
trainLabelsVSVM = SVinvar.iloc[:, -1]

# Get list with index of train data to split between train and test in svmFit
countTrainData = SVinvar.shape[0]
indexTrainData = [list(range(1, countTrainData + 1))]

# Join of train and test test data (through indexTrainData in svmFit separable)
tuneFeatVSVM = pd.concat([trainFeatVSVM, testFeatsub], axis=0)
tuneLabelsVSVM = np.concatenate((trainLabelsVSVM.values, testLabels.values))

# VSVM parameter tuning
tunedVSVM = svm_fit(tuneFeatVSVM, tuneLabelsVSVM)

# Run classification and accuracy assessment for modified SV
predLabelsVSVM = tunedVSVM.predict(validateFeatsub)
accVSVM = accuracy_score(validateLabels, predLabelsVSVM)
print(accVSVM)


InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [176]:
SVtotal[0:10].reset_index(drop=True)

Unnamed: 0,Lx_g_comp,Lx_g_elfi,Lx_g_refi,Lx_g_roun,Lx_g_shin,Lx_m_bl,Lx_m_gr,Lx_m_ndvi,Lx_m_nir,Lx_m_re,Lx_sd_bl,Lx_sd_gr,Lx_sd_ndvi,Lx_sd_nir,Lx_sd_re,Lx_t_diss,Lx_t_hom,Lx_t_mean
0,0.050208,0.831325,0.858572,0.097416,0.048185,0.028829,0.029048,0.778228,0.137015,0.022815,0.028229,0.023671,0.250152,0.130107,0.030528,0.124826,0.04256,0.785337
1,0.050208,0.831325,0.858572,0.097416,0.048185,0.028829,0.029048,0.778228,0.137015,0.022815,0.028229,0.023671,0.250152,0.130107,0.030528,0.124826,0.04256,0.785337
2,0.094363,0.594595,0.789575,0.273995,0.186283,0.029562,0.038956,0.658483,0.12635,0.045296,0.087441,0.108569,0.624496,0.336826,0.159947,0.146974,0.050292,0.82642
3,0.094363,0.594595,0.789575,0.273995,0.186283,0.029562,0.038956,0.658483,0.12635,0.045296,0.087441,0.108569,0.624496,0.336826,0.159947,0.146974,0.050292,0.82642
4,0.073022,0.67033,0.798266,0.201985,0.077653,0.021294,0.014574,0.631935,0.06167,0.007359,0.026509,0.022434,0.316412,0.119451,0.020073,0.098026,0.060738,0.803718
5,0.052193,0.707317,0.819672,0.142156,0.04989,0.033915,0.035445,0.67832,0.109,0.029649,0.026247,0.020902,0.091693,0.051384,0.018004,0.213381,0.016879,0.770114
6,0.052193,0.707317,0.819672,0.142156,0.04989,0.033915,0.035445,0.67832,0.109,0.029649,0.026247,0.020902,0.091693,0.051384,0.018004,0.213381,0.016879,0.770114
7,0.063844,0.743396,0.851452,0.147286,0.074513,0.03759,0.032785,0.566912,0.081766,0.034202,0.053797,0.046509,0.229015,0.105233,0.037623,0.174866,0.026871,0.783304
8,0.063844,0.743396,0.851452,0.147286,0.074513,0.03759,0.032785,0.566912,0.081766,0.034202,0.053797,0.046509,0.229015,0.105233,0.037623,0.174866,0.026871,0.783304
9,0.06714,0.759791,0.842358,0.139249,0.106003,0.032682,0.034175,0.707802,0.126718,0.033567,0.053523,0.054488,0.388945,0.165385,0.0835,0.099998,0.060635,0.789355


In [177]:
SVL2[0:10].reset_index(drop=True)

Unnamed: 0,Lx_g_comp,Lx_g_elfi,Lx_g_refi,Lx_g_roun,Lx_g_shin,Lx_m_bl,Lx_m_gr,Lx_m_ndvi,Lx_m_nir,Lx_m_re,Lx_sd_bl,Lx_sd_gr,Lx_sd_ndvi,Lx_sd_nir,Lx_sd_re,Lx_t_diss,Lx_t_hom,Lx_t_mean
0,0.081117,0.831325,0.831181,0.131844,0.076223,0.034446,0.038443,0.783527,0.14082,0.040684,0.042728,0.037975,0.203268,0.188933,0.049257,0.083217,0.042629,0.683222
1,0.081117,0.831325,0.831181,0.131844,0.076223,0.034446,0.038443,0.783527,0.14082,0.040684,0.042728,0.037975,0.203268,0.188933,0.049257,0.083217,0.042629,0.683222
2,0.085631,0.647059,0.737209,0.248516,0.07816,0.030705,0.045201,0.649178,0.120411,0.058535,0.060847,0.119105,0.30051,0.184781,0.180655,0.14233,0.023999,0.726856
3,0.149065,0.666667,0.71581,0.129485,0.162943,0.027486,0.03147,0.653787,0.088553,0.036543,0.05898,0.046669,0.140672,0.093051,0.057494,0.176797,0.003772,0.748122
4,0.292604,0.037037,0.477876,0.597481,0.375123,0.030327,0.031084,0.709437,0.096326,0.031764,0.051962,0.038266,0.127531,0.114355,0.028668,0.178123,0.015826,0.702376
5,0.084326,0.707317,0.784747,0.192395,0.078921,0.038859,0.043656,0.686006,0.114641,0.046953,0.039727,0.033532,0.074507,0.074616,0.029051,0.142254,0.01695,0.669979
6,0.084326,0.707317,0.784747,0.192395,0.078921,0.038859,0.043656,0.686006,0.114641,0.046953,0.039727,0.033532,0.074507,0.074616,0.029051,0.142254,0.01695,0.669979
7,0.103148,0.743396,0.822682,0.199338,0.117872,0.042047,0.041488,0.57726,0.089192,0.051128,0.081426,0.074613,0.186093,0.152812,0.060707,0.116578,0.026941,0.681453
8,0.103148,0.743396,0.822682,0.199338,0.117872,0.042047,0.041488,0.57726,0.089192,0.051128,0.081426,0.074613,0.186093,0.152812,0.060707,0.116578,0.026941,0.681453
9,0.090983,0.6,0.750619,0.191248,0.12641,0.035457,0.043464,0.815578,0.158897,0.041787,0.042882,0.032771,0.107264,0.099906,0.049345,0.131846,0.039565,0.664842


In [179]:
pd.concat([SVtotal,SVL2,SVL5])

Unnamed: 0,Lx_g_comp,Lx_g_elfi,Lx_g_refi,Lx_g_roun,Lx_g_shin,Lx_m_bl,Lx_m_gr,Lx_m_ndvi,Lx_m_nir,Lx_m_re,Lx_sd_bl,Lx_sd_gr,Lx_sd_ndvi,Lx_sd_nir,Lx_sd_re,Lx_t_diss,Lx_t_hom,Lx_t_mean
0,0.050208,0.831325,0.858572,0.097416,0.048185,0.028829,0.029048,0.778228,0.137015,0.022815,0.028229,0.023671,0.250152,0.130107,0.030528,0.124826,0.042560,0.785337
1,0.050208,0.831325,0.858572,0.097416,0.048185,0.028829,0.029048,0.778228,0.137015,0.022815,0.028229,0.023671,0.250152,0.130107,0.030528,0.124826,0.042560,0.785337
2,0.094363,0.594595,0.789575,0.273995,0.186283,0.029562,0.038956,0.658483,0.126350,0.045296,0.087441,0.108569,0.624496,0.336826,0.159947,0.146974,0.050292,0.826420
3,0.094363,0.594595,0.789575,0.273995,0.186283,0.029562,0.038956,0.658483,0.126350,0.045296,0.087441,0.108569,0.624496,0.336826,0.159947,0.146974,0.050292,0.826420
4,0.073022,0.670330,0.798266,0.201985,0.077653,0.021294,0.014574,0.631935,0.061670,0.007359,0.026509,0.022434,0.316412,0.119451,0.020073,0.098026,0.060738,0.803718
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,0.128719,0.450644,0.696731,0.415129,0.209054,0.033694,0.046634,0.732685,0.160815,0.046254,0.042091,0.036228,0.225737,0.110220,0.048571,0.192499,0.038910,0.788608
85,0.048208,0.821429,0.893459,0.113375,0.126157,0.033787,0.043146,0.646950,0.127958,0.049292,0.047404,0.037481,0.187865,0.130733,0.036498,0.197834,0.042960,0.807497
86,0.048208,0.821429,0.893459,0.113375,0.126157,0.033787,0.043146,0.646950,0.127958,0.049292,0.047404,0.037481,0.187865,0.130733,0.036498,0.197834,0.042960,0.807497
87,0.048208,0.821429,0.893459,0.113375,0.126157,0.033787,0.043146,0.646950,0.127958,0.049292,0.047404,0.037481,0.187865,0.130733,0.036498,0.197834,0.042960,0.807497


In [None]:


# VSVM - EVALUATION of all Level VSV
actKappa = 0
bestFittingModel = None

# Iteration over bound to test different bound thresholds determining the radius of acceptance
for jj, bound_val in enumerate(bound, start=1):
    SVinvarRadi_list = []
    
    # Iterating over boundMargin to test different thresholds on margin distance
    for kk, bound_margin_val in enumerate(boundMargin, start=1):
        SVinvar_list = []
        
        # Iterate over SVinvarRadi and evaluate distance to hyperplane
        for m in range(len(SVinvarRadi)):
            signa = pred_one(tunedSVM.finalModel, SVinvarRadi[m, :-1])
            if SVinvarRadi[m, -1] == levels(generalDataPool.REF)[0]:
                if -bound_margin_val < signa < bound_margin_val:
                    SVinvar_list.append(SVinvarRadi[m, :])
            else:
                if -bound_margin_val < signa < bound_margin_val:
                    SVinvar_list.append(SVinvarRadi[m, :])

        SVinvar = pd.DataFrame(SVinvar_list, columns=objInfoNames)
        
        # Merge elected VSV with original SV
        SVinvar_org = pd.concat([SVtotal, SVinvar])

        # Split for training to feature and label
        trainFeatVSVM = SVinvar_org.iloc[:, :-1]
        trainLabelsVSVM = SVinvar_org.iloc[:, -1]

        # Get list with index of trainData to split between train and test in svmFit
        countTrainData = SVinvar_org.shape[0]
        indexTrainData = [list(range(1, countTrainData + 1))]

        # Join of train and test data (through indexTrainData in svmFit separable)
        names = objInfoNames[:-1]
        tuneFeatVSVM = pd.concat([trainFeatVSVM, testFeatsub], axis=0)
        tuneFeatVSVM.columns = names
        tuneLabelsVSVM = np.concatenate((trainLabelsVSVM.values, testLabels.values))

        ######################################## VSVM control parameter tuning ########################################
        tunedVSVM = SVC(kernel='linear')
        tunedVSVM.fit(tuneFeatVSVM, tuneLabelsVSVM)

        # Get the best fitting model based on Kappa
        if actKappa < tunedVSVM.resample.Kappa:
            bestFittingModel = tunedVSVM
            actKappa = tunedVSVM.resample.Kappa

# Run classification and accuracy assessment for the best bound setting
# Predict labels of test data
predLabelsVSVMsum = bestFittingModel.predict(validateFeatsub)

# Accuracy assessment
accVSVM_SL = accuracy_score(validateLabels, predLabelsVSVMsum)


In [None]:



# Balanced & Random unlabeled samples
# Balanced samples

# Definition of sampling configuration (strata: random sampling without replacement)
stratSampRemaining_b = resample(trainDataCurRemaining, n_samples=[b, b, b, b, b, b], replace=False)
samplesRemaining_b = trainDataCurRemaining.iloc[stratSampRemaining_b]

trainDataCurRemaining_b = samplesRemaining_b.iloc[:, :-1]
trainDataCurRemainingsub_b = trainDataCurRemaining_b.iloc[:, sindexSVMDATA:eindexSVMDATA]
REF_b = bestFittingModel.predict(trainDataCurRemainingsub_b)

SVindexUn_b = np.arange(1, len(trainDataCurRemainingsub_b) + 1)
SVtotalUn_b = trainDataCurRemaining_b.iloc[SVindexUn_b - 1, sindexSVMDATA:eindexSVMDATA]
SVtotalUn_b['REF'] = REF_b

SVL2Un_b = trainDataCurRemaining.iloc[SVindexUn_b - 1, sindexSVMDATA - 2*numFeat:sindexSVMDATA - numFeat - 1].copy()
SVL2Un_b['REF'] = REF_b
SVL3Un_b = trainDataCurRemaining.iloc[SVindexUn_b - 1, sindexSVMDATA - numFeat:sindexSVMDATA - 1].copy()
SVL3Un_b['REF'] = REF_b
SVL5Un_b = trainDataCurRemaining_b.iloc[SVindexUn_b - 1, sindexSVMDATA + numFeat:sindexSVMDATA + 2*numFeat - 1].copy()
SVL5Un_b['REF'] = REF_b
SVL6Un_b = trainDataCurRemaining_b.iloc[SVindexUn_b - 1, sindexSVMDATA + 2*numFeat:sindexSVMDATA + 3*numFeat - 1].copy()
SVL6Un_b['REF'] = REF_b
SVL7Un_b = trainDataCurRemaining_b.iloc[SVindexUn_b - 1, sindexSVMDATA + 3*numFeat:sindexSVMDATA + 4*numFeat - 1].copy()
SVL7Un_b['REF'] = REF_b
SVL8Un_b = trainDataCurRemaining_b.iloc[SVindexUn_b - 1, sindexSVMDATA + 4*numFeat:sindexSVMDATA + 5*numFeat - 1].copy()
SVL8Un_b['REF'] = REF_b
SVL9Un_b = trainDataCurRemaining_b.iloc[SVindexUn_b - 1, sindexSVMDATA + 5*numFeat:sindexSVMDATA + 6*numFeat - 1].copy()
SVL9Un_b['REF'] = REF_b
SVL10Un_b = trainDataCurRemaining_b.iloc[SVindexUn_b - 1, sindexSVMDATA + 6*numFeat:sindexSVMDATA + 7*numFeat - 1].copy()
SVL10Un_b['REF'] = REF_b
SVL11Un_b = trainDataCurRemaining_b.iloc[SVindexUn_b - 1, sindexSVMDATA + 7*numFeat:sindexSVMDATA + 8*numFeat - 1].copy()
SVL11Un_b['REF'] = REF_b

SVinvarUn_b = pd.concat([SVtotalUn_b, SVL2Un_b, SVL3Un_b, SVL5Un_b, SVL6Un_b, SVL7Un_b, SVL8Un_b, SVL9Un_b, SVL10Un_b, SVL11Un_b])

# Balanced Unlabeled samples

actKappa = 0

for jj in range(len(bound)):
    SVinvarRadiUn_b_list = []
    for m in range(len(SVinvarRadiUn_b)):
        signa = pred_one(tunedSVM.finalModel, SVinvarRadiUn_b.iloc[m, :-1])
        if SVinvarRadiUn_b.iloc[m, -1] == levels(generalDataPool.REF)[0]:
            if -bound_margin_val < signa < bound_margin_val:
                SVinvarRadiUn_b_list.append(SVinvarRadiUn_b.iloc[m, :])
        else:
            if -bound_margin_val < signa < bound_margin_val:
                SVinvarRadiUn_b_list.append(SVinvarRadiUn_b.iloc[m, :])

    SVinvarUn_b = pd.DataFrame(SVinvarRadiUn_b_list, columns=objInfoNames)
    
    SVinvar_orgUn_b = pd.concat([SVtotal, SVinvarUn_b])

    trainFeatVSVMUn_b = SVinvar_orgUn_b.iloc[:, :-1]
    trainLabelsVSVMUn_b = SVinvar_orgUn_b.iloc[:, -1]

    countTrainDataUn_b = SVinvar_orgUn_b.shape[0]
    indexTrainDataUn_b = [list(range(1, countTrainDataUn_b + 1))]

    names = objInfoNames[:-1]
    tuneFeatVSVMUn_b = pd.concat([trainFeatVSVMUn_b, testFeatsub], axis=0)
    tuneFeatVSVMUn_b.columns = names
    tuneLabelsVSVMUn_b = np.concatenate((trainLabelsVSVMUn_b.values, testLabels.values))

    tunedVSVMUn_b = SVC(kernel='linear')
    tunedVSVMUn_b.fit(tuneFeatVSVMUn_b, tuneLabelsVSVMUn_b)

    if actKappa < tunedVSVMUn_b.resample.Kappa:
        bestFittingModelUn_b = tunedVSVMUn_b
        actKappa = tunedVSVMUn_b.resample.Kappa

# Run classification and accuracy assessment for the best bound setting
predLabelsVSVMsumUn_b = bestFittingModelUn_b.predict(validateFeatsub)

# Accuracy assessment
accVSVM_SL_Un_b = accuracy_score(validateLabels, predLabelsVSVMsumUn_b)


In [None]:
# Add predicted labels to the features data set
predLabelsVSVMsumUn_unc = pd.concat([validateFeatsub, pd.DataFrame(predLabelsVSVMsumUn_b, columns=["Predicted_Labels"])], axis=1)
predLabelsVSVMsumUn_unc.columns = objInfoNames

# Calculate uncertainty of the samples by selecting SV's and data set
normdistvsvm_sl_un = uncertainty_dist_v2_2(bestFittingModelUn_b, predLabelsVSVMsumUn_unc)

# Alter labels
predlabels_vsvm_Slu = alter_labels(normdistvsvm_sl_un, validateLabels)

# Accuracy assessment
accVSVM_SL_Un_b_ad = accuracy_score(validateLabels, predlabels_vsvm_Slu)
