# CPLEX

In [4]:
import cplex
import numpy as np
from synthetic_datasets import ClusterDataset
import pandas as pd

In [15]:
seeds = [
    85287339, # 412
    20200621154912, # 378
    20200623170005 # 433
]

In [16]:
np.random.seed(seeds[0])

In [17]:
def generate_random_unit_vector(dim):
    vec = np.random.randn(dim)
    return vec / np.linalg.norm(vec)

In [18]:
def choose_random_point_in_hypercube(dim):
    return np.random.uniform(0, 1, dim)

In [19]:
def separating_hyperplane(P, N, eps_P, eps_N, eps_R, theta, lamb, num_trials=100):
    """
    Finds the initial separating hyperplane using the provided algorithm.
    
    Args:
        P (list): Set of positive samples (numpy arrays).
        N (list): Set of negative samples (numpy arrays).
        eps_P (float): Parameter for positive samples.
        eps_N (float): Parameter for negative samples.
        eps_R (float): Regularization parameter.
        theta (float): Scaling parameter.
        num_trials (int): Number of random trials.
        
    Returns:
        tuple: Optimal hyperplane (w, c).
    """
    dim = len(P[0])  # Dimension of the feature space
    L = -float('inf')
    best_h = None

    for _ in range(num_trials):
        # Choose a random unit vector w
        w = generate_random_unit_vector(dim)
        # Choose a random point c in the unit hypercube
        c = choose_random_point_in_hypercube(dim)
        c = -np.dot(c, w)

        # Compute x_tilde and y_tilde arrays
        x_tilde = np.array([0 if np.dot(w, s) < eps_P else 1 for s in P])
        y_tilde = np.array([1 if np.dot(w, s) > -eps_N else 0 for s in N])

        # Compute V_tilde
        V_tilde = max(0, (theta - 1) * np.sum(x_tilde) +
                      theta * np.sum(y_tilde) + theta * eps_R)

        # Compute L_tilde
        L_tilde = np.sum(x_tilde) - V_tilde * lamb

        # Update L and h
        if L_tilde > L:
            L = L_tilde
            best_h = (w, c)

    return best_h

In [20]:
# P = np.array([[1, 1], [1, 0], [0, 1]])  
# N = np.array([[0, 0]])  
theta0 = 99
theta1 = 100
theta = theta0 / theta1
eps_R = 3e-3
eps_P = 1e-3
eps_N = 2e-3
n = 10000
lamb = 100 * (n + 1)

# Breast Cancer

In [14]:
from sklearn.datasets import load_breast_cancer

In [15]:
d = load_breast_cancer()

In [16]:
X,y = d['data'], d['target']

In [18]:
P = np.array([x for idx, x in enumerate(X) if y[idx] == 1])
N = np.array([x for idx, x in enumerate(X) if y[idx] == 0])

In [28]:
x_tilde = np.array([0 if np.dot(w, s) < eps_P else 1 for s in P])


In [29]:
np.sum(x_tilde)

223

# Wine Quality Red

In [37]:
wq_red_df = pd.read_csv('./data/wine-quality/winequality-red.csv', delimiter=';')

In [38]:
wq_red_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [43]:
X, y = wq_red_df.drop(columns=['quality']).to_numpy() , wq_red_df['quality'].to_numpy()     

In [45]:
y = y >= 8

In [46]:
y = y.astype(int)

In [49]:
theta = 0.04

In [50]:
P = np.array([x for idx, x in enumerate(X) if y[idx] == 1])
N = np.array([x for idx, x in enumerate(X) if y[idx] == 0])

In [51]:
w, c = separating_hyperplane(P, N, eps_P, eps_N, eps_R, theta, lamb, num_trials=10000)

In [53]:
x_tilde = np.array([0 if np.dot(w, s) < eps_P else 1 for s in P])


In [54]:
np.sum(x_tilde)

9

# Wine Quality White

In [56]:
wq_white_df = pd.read_csv('./data/wine-quality/winequality-white.csv', delimiter=';')

In [57]:
wq_white_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [58]:
X, y = wq_white_df.drop(columns=['quality']).to_numpy() , wq_white_df['quality'].to_numpy()     

In [59]:
y = y >= 8

In [60]:
y = y.astype(int)

In [61]:
theta = 0.1

In [62]:
P = np.array([x for idx, x in enumerate(X) if y[idx] == 1])
N = np.array([x for idx, x in enumerate(X) if y[idx] == 0])

In [63]:
w, c = separating_hyperplane(P, N, eps_P, eps_N, eps_R, theta, lamb, num_trials=10000)

In [64]:
x_tilde = np.array([0 if np.dot(w, s) < eps_P else 1 for s in P])


In [65]:
np.sum(x_tilde)

32

# South German Credit

In [5]:
cols_english = ["running account", "term", "morality", "expenditure", "amount", "savings account", "possession", "installment", "family status", "guarantor", "length of residence", "assets", "age", "long-term credit", "residence", "previous credit", "occupation", "personal", "telephone", "guest worker", "credit"]

In [6]:
south_german_credit_df = pd.read_csv('./data/south-german-credit/SouthGermanCredit.asc', delimiter=' ')

In [7]:
south_german_credit_df.head()

Unnamed: 0,laufkont,laufzeit,moral,verw,hoehe,sparkont,beszeit,rate,famges,buerge,...,verm,alter,weitkred,wohn,bishkred,beruf,pers,telef,gastarb,kredit
0,1,18,4,2,1049,1,2,4,2,1,...,2,21,3,1,1,3,2,1,2,1
1,1,9,4,0,2799,1,3,2,3,1,...,1,36,3,1,2,3,1,1,2,1
2,2,12,2,9,841,2,4,2,2,1,...,1,23,3,1,1,2,2,1,2,1
3,1,12,4,0,2122,1,3,3,3,1,...,1,39,3,1,2,2,1,1,1,1
4,1,12,4,0,2171,1,3,4,3,1,...,2,38,1,2,2,2,2,1,1,1


In [8]:
south_german_credit_df.columns = cols_english

In [9]:
south_german_credit_df.head()

Unnamed: 0,running account,term,morality,expenditure,amount,savings account,possession,installment,family status,guarantor,...,assets,age,long-term credit,residence,previous credit,occupation,personal,telephone,guest worker,credit
0,1,18,4,2,1049,1,2,4,2,1,...,2,21,3,1,1,3,2,1,2,1
1,1,9,4,0,2799,1,3,2,3,1,...,1,36,3,1,2,3,1,1,2,1
2,2,12,2,9,841,2,4,2,2,1,...,1,23,3,1,1,2,2,1,2,1
3,1,12,4,0,2122,1,3,3,3,1,...,1,39,3,1,2,2,1,1,1,1
4,1,12,4,0,2171,1,3,4,3,1,...,2,38,1,2,2,2,2,1,1,1


In [12]:
X, y = south_german_credit_df.drop(columns=['credit']).to_numpy() , south_german_credit_df['credit'].to_numpy()     

In [21]:
theta = 0.9

In [23]:
P = np.array([x for idx, x in enumerate(X) if y[idx] == 1])
N = np.array([x for idx, x in enumerate(X) if y[idx] == 0])

In [24]:
w, c = separating_hyperplane(P, N, eps_P, eps_N, eps_R, theta, lamb, num_trials=10000)

In [25]:
x_tilde = np.array([0 if np.dot(w, s) < eps_P else 1 for s in P])


In [26]:
np.sum(x_tilde)

50

# Crop Mapping Dataset

In [33]:
crop_df = pd.read_csv('./data/crops_new/WinnipegDataset.txt', delimiter=',')

In [34]:
crop_df.head()

Unnamed: 0,label,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f165,f166,f167,f168,f169,f170,f171,f172,f173,f174
0,1,-13.559,-21.407,-11.404,-15.248,-11.923,-15.291,-2.1548,-7.8474,-10.002,...,0.18519,0.72602,5.3333,6.0,0.29489,9.7778,2.4444,1.677,0.20988,0.65422
1,1,-12.802,-20.335,-10.399,-14.132,-11.096,-14.361,-2.4039,-7.533,-9.9369,...,0.33333,-0.48751,2.1111,0.098765,0.83333,0.33333,0.33333,0.84869,0.50617,-0.18898
2,1,-12.431,-19.902,-10.074,-13.598,-10.829,-14.048,-2.3566,-7.4717,-9.8283,...,0.25926,0.25298,2.2222,0.17284,0.68889,0.88889,0.66667,1.273,0.30864,0.10483
3,1,-12.689,-19.529,-10.028,-13.35,-11.056,-14.014,-2.6611,-6.8396,-9.5006,...,0.16049,0.4375,4.1111,0.32099,0.83333,0.33333,0.33333,1.1491,0.38272,0.41603
4,1,-12.686,-19.278,-9.8185,-13.108,-10.932,-13.939,-2.8675,-6.5919,-9.4594,...,0.18519,0.35,4.0,0.44444,0.68889,0.88889,0.66667,1.5811,0.20988,0.5


In [36]:
X, y = crop_df.drop(columns=['label']).to_numpy() , crop_df['label'].to_numpy()     

In [43]:
idx = np.random.choice(np.arange(len(X)), int(len(X)/10), replace=False)


In [44]:
X = X[idx]
y = y[idx]

In [45]:
y = (y == 6).astype(int)

In [46]:
theta = 0.99

In [47]:
P = np.array([x for idx, x in enumerate(X) if y[idx] == 1])
N = np.array([x for idx, x in enumerate(X) if y[idx] == 0])

In [48]:
w, c = separating_hyperplane(P, N, eps_P, eps_N, eps_R, theta, lamb, num_trials=10000)

In [49]:
x_tilde = np.array([0 if np.dot(w, s) < eps_P else 1 for s in P])


In [50]:
np.sum(x_tilde)

1267