**Contrastive learning**
---

In [3]:
import pandas as pd
import numpy as np
import random
import pickle
import matplotlib.pyplot as plt

from sklearn.preprocessing import normalize
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
#from Data_challenge_fairness_2024.evaluator import *
from evaluator import *

In [4]:
#####################################################################
#
#   CONTRASTIVE LEARNING FUNCTIONS (PAIRS)
#
#   1/ generate_random_sample_pairs()
#   2/ get_pair_sample(df,df_sample_pairs_indices)
#
######################################################################

# Function to generate a random sample of pairs (K positive pairs (if possible) and K negative pairs)

def _draw_positive_pairs(label_indices, K):
    '''Draw up to K distinct unordered pairs among the indices of a single label.'''
    n_indices = len(label_indices)
    # Number of possible pairs is n * (n - 1) / 2; it is 0 when the label has < 2 rows.
    target = min(K, n_indices * (n_indices - 1) // 2)

    pairs = []
    seen = set()  # frozenset keys make the duplicate check O(1)
    while len(pairs) < target:
        # One pass = a random permutation cut into consecutive couples, so every
        # index is used at most once per pass (pairs are spread over the rows).
        shuffled = random.sample(label_indices, n_indices)
        for first, second in zip(shuffled[::2], shuffled[1::2]):
            key = frozenset((first, second))
            if key in seen:
                continue
            seen.add(key)
            pairs.append(set(key))
            if len(pairs) >= target:
                break
    return pairs


def _draw_negative_pairs(label_indices, other_indices, K, used_pairs, partner_counts):
    '''Draw up to K new pairs joining one index of the label with one index of another label.

    used_pairs (set of frozensets) and partner_counts (index -> number of pairs in
    which it was the "other label" member) are shared across labels and updated in place.
    '''
    # Pairs touching this label that earlier labels already produced cannot be reused.
    already_used = sum(partner_counts.get(index, 0) for index in label_indices)
    available = len(label_indices) * len(other_indices) - already_used
    # Capping by `available` guarantees that the loop below can always finish.
    target = min(K, len(other_indices), available)
    per_pass = min(len(label_indices), len(other_indices))

    pairs = []
    while len(pairs) < target:
        # One pass couples distinct rows of the label with distinct rows of other labels.
        anchors = random.sample(label_indices, per_pass)
        partners = random.sample(other_indices, per_pass)
        for anchor, partner in zip(anchors, partners):
            key = frozenset((anchor, partner))
            if key in used_pairs:
                continue
            used_pairs.add(key)
            partner_counts[partner] = partner_counts.get(partner, 0) + 1
            pairs.append(set(key))
            if len(pairs) >= target:
                break
    return pairs


def generate_random_sample_pairs(df,K=6):
    '''GENERATES A SAMPLE OF RANDOM POSITIVE AND NEGATIVE PAIRS FOR CONTRASTIVE LEARNING

    Input :
    - df : a pandas DataFrame with a column 'label' (index values are assumed to be unique,
      they are used as the identifiers stored in the pairs)
    - K : an integer, the number of positive pairs and of negative pairs requested for each label;
      a maximum of 2*K pairs will be generated per label.
    Note : maximum number of positive pairs generated for a label = n_label * (n_label - 1) / 2
    with n_label the number of lines for this label.
    if label has 1 line => 0 pair, 2 lines => 1 pair, 3 lines => 3 pairs, 4 lines => 6 pairs, etc...
    The number of negative pairs for a label is limited to min(K, number of lines with another
    label) and to the number of cross-label pairs not already generated for a previous label.

    Output : a dataframe with 2 columns 'pair' and 'value' (positive pairs first, then negative)
    - pair : a set (unordered) of 2 indices
    - value : 1 if positive pair (with same label) / 0 if negative pair (with different labels)
    '''
    label_column = df['label']
    labels = sorted(label_column.unique())

    sample_positive_pairs = []   # positive pairs of all labels
    sample_negative_pairs = []   # negative pairs of all labels
    used_negative_pairs = set()  # frozenset version of the above, for O(1) duplicate checks
    partner_counts = {}          # index -> number of negative pairs where it is the "other" row

    for l, label in enumerate(labels):
        print('label',label)
        if l%1000==0:print("generating pair for label:",l)

        same_label = label_column == label
        label_indices = label_column.index[same_label].tolist()
        different_label_indices = label_column.index[~same_label].tolist()

        sample_positive_pairs.extend(_draw_positive_pairs(label_indices, K))
        sample_negative_pairs.extend(
            _draw_negative_pairs(
                label_indices,
                different_label_indices,
                K,
                used_negative_pairs,
                partner_counts,
            )
        )

    n_positive = len(sample_positive_pairs)
    n_negative = len(sample_negative_pairs)
    sample_pairs_indices = pd.DataFrame({
        'pair': sample_positive_pairs + sample_negative_pairs,
        'value': [1] * n_positive + [0] * n_negative,
    })
    print('number of pairs in sample:',len(sample_pairs_indices),'(',n_positive,'positives and',n_negative,'negatives)')

    return sample_pairs_indices


def generate_optimized_sample_pairs(df,K=6):
    '''PLACEHOLDER: "OPTIMIZED" PAIR SAMPLING FOR CONTRASTIVE LEARNING (NOT IMPLEMENTED YET)

    Intended behaviour (positive pairs are far from each other, negative pairs are close):

    Input :
    - df : a pandas DataFrame with a column 'label'
    - K : an integer, the number of positive pairs and of negative pairs to be generated for each label;
      a maximum of 2*K pairs would be generated per label.
    Note : maximum number of positive pairs generated for a label = n_label * (n_label - 1) / 2
    with n_label the number of lines for this label.
    if label has 1 line => 0 pair, 2 lines => 1 pair, 3 lines => 3 pairs, 4 lines => 6 pairs, etc...

    Output : a dataframe with 2 columns 'pair' and 'value'
    - pair : a set (unordered) of 2 indices
    - value : 1 if positive pair (with same label) / 0 if negative pair (with different labels)

    Raises : NotImplementedError, always. The function has no body yet; failing loudly
    avoids handing a silent None to code that expects the DataFrame described above.
    '''
    raise NotImplementedError(
        'generate_optimized_sample_pairs is not implemented; '
        'use generate_random_sample_pairs instead'
    )
    
def get_pair_sample(df,df_sample_pairs_indices,dim=300):
    '''Returns a dataset containing the 2 embeddings and the label of each pair

    Input :
    - df : a pandas DataFrame with a column 'average_embedding' holding one sequence per line,
      indexed by the values stored in the pairs
    - df_sample_pairs_indices : a DataFrame whose 1st column holds the pairs (2 indices each)
      and whose 2nd column holds the pair value (0 or 1)
    - dim : expected length of one embedding (default 300, the previously hard-coded size)

    Output : a DataFrame (object dtype) with the same index as df_sample_pairs_indices and
    2*dim+1 columns : p1_0..p1_{dim-1}, p2_0..p2_{dim-1}, label.
    An embedding whose length is not dim is left as NaN so the line can be dropped with dropna().
    '''
    n_pairs = df_sample_pairs_indices.shape[0]
    columns = [f'p1_{i}' for i in range(dim)]+[f'p2_{i}' for i in range(dim)]+['label']

    # Fill a plain numpy buffer and build the DataFrame once: writing a DataFrame
    # cell by cell is far slower, and positional writes stay correct whatever
    # the index of df_sample_pairs_indices is.
    values = np.full((n_pairs, 2 * dim + 1), np.nan, dtype=object)
    embeddings = df['average_embedding']

    for i in range(n_pairs):
        if i%25000==0: print(i)

        # Indices of the 1st and 2nd member of the pair
        p1 , p2 = df_sample_pairs_indices.iloc[i,0]

        # Copy each embedding only when it has the expected size
        first_embedding = embeddings.loc[p1]
        if len(first_embedding)==dim :
            values[i, :dim] = first_embedding
        second_embedding = embeddings.loc[p2]
        if len(second_embedding)==dim :
            values[i, dim:2 * dim] = second_embedding

        # Pair value (0 or 1) goes in the last column
        values[i, 2 * dim] = df_sample_pairs_indices.iloc[i,1]

    sample = pd.DataFrame(values, index=df_sample_pairs_indices.index, columns=columns)

    # Lines containing NaN (one of the embeddings was missing) are kept; the caller drops them.
    print('number of pairs of indices provided:',df_sample_pairs_indices.shape[0])
    return sample



In [5]:
#####################################################
# Load the challenge pickle file and unpack its arrays
#####################################################

# NOTE(review): unpickling can execute arbitrary code - only load this file from a trusted source.
with open('data-challenge-student.pickle', 'rb') as handle:
    # dat = pickle.load(handle)
    dat = pd.read_pickle(handle)
 
# Check keys()
print(dat.keys())
X = dat['X_train']
Y = dat['Y']
S = dat['S_train']

# Challenge test set (no 'Y' entry is read for it)
X_test_true = dat['X_test']
S_test_true = dat['S_test']

# Combined target: Y shifted by 28 for each unit of S, so that Y = Y56 % 28 and S = Y56 // 28
# (presumably Y is in 0..27 and S in {0, 1} - TODO confirm)
Y56= Y + 28*S
#X, X_test,Y,S, S_test = dat[1]

print(X.shape,Y.shape,S.shape,X_test_true.shape,S_test_true.shape)

dict_keys(['X_train', 'X_test', 'Y', 'S_train', 'S_test'])
(27749, 768) (27749,) (27749,) (11893, 768) (11893,)


In [6]:
##############################################################
# train_test_split on the combined target Y56 (np.arrays)
##############################################################

# Hold out 20% of the lines; the fixed seed keeps the split reproducible
X_train, X_test, Y56_train, Y56_test = train_test_split(
    X, Y56, random_state=42, test_size=0.2
)

# Decode Y56 = Y + 28*S: the quotient is S (protected attribute), the remainder
# is the original class Y, e.g. 33 -> S = 33 // 28 = 1 and Y = 33 % 28 = 5
S_train, Y_train = np.divmod(Y56_train, 28)
S_test, Y_test = np.divmod(Y56_test, 28)

# Print the dimensions
print('train:',X_train.shape,Y_train.shape,S_train.shape)
print('test:',X_test.shape,Y_test.shape, S_test.shape)

train: (22199, 768) (22199,) (22199,)
test: (5550, 768) (5550,) (5550,)


In [7]:
##############################################################
# pair generation for contrastive learning
##############################################################
# Folder prefix used by the cells below when pickling pair indices and samples
path_pkl='pkl/'

In [8]:
# Pair generation (K=4000) with Y (28 classes)
# The label frames get a fresh 0..n-1 index, so the indices stored in the pairs
# are line positions in Y_train / Y_test.

K=4000
Y_train_df=pd.DataFrame({'label':Y_train})
pairs_train_4000 = generate_random_sample_pairs(Y_train_df,K)

K_test=1000 # a quarter of the train K, following the 20 / 80 % test / train ratio
Y_test_df=pd.DataFrame({'label':Y_test})
pairs_test_1000 = generate_random_sample_pairs(Y_test_df,K_test)

label 0
generating pair for label: 0
label 1
label 2
label 3
label 4
label 5
label 6
label 7
label 8
label 9
label 10
label 11
label 12
label 13
label 14
label 15
label 16
label 17
label 18
label 19
label 20
label 21
label 22
label 23
label 24
label 25
label 26
label 27
number of pairs in sample: 219736 ( 107736 positives and 112000 negatives)
label 0
generating pair for label: 0
label 1
label 2
label 3
label 4
label 5
label 6
label 7
label 8
label 9
label 10
label 11
label 12
label 13
label 14
label 15
label 16
label 17
label 18
label 19
label 20
label 21
label 22
label 23
label 24
label 25
label 26
label 27
number of pairs in sample: 50249 ( 22249 positives and 28000 negatives)


In [9]:
# Pair generation (K=8000) with Y (28 classes)
# may be too large... (the output recorded for this cell stops at label 15)

K=8000
Y_train_df=pd.DataFrame({'label':Y_train})
pairs_train_8000 = generate_random_sample_pairs(Y_train_df,K)

K_test=2000 # a quarter of the train K, following the 20 / 80 % test / train ratio
Y_test_df=pd.DataFrame({'label':Y_test})
pairs_test_2000 = generate_random_sample_pairs(Y_test_df,K_test)

label 0
generating pair for label: 0
label 1
label 2
label 3
label 4
label 5
label 6
label 7
label 8
label 9
label 10
label 11
label 12
label 13
label 14
label 15


In [None]:
# Pair generation with Y56 (56 classes = Y x S = 28 x 2)
# The frames must be built from a dict ({'label': values}); the previous
# {'label', values} was a set literal and raised TypeError (unhashable ndarray).

K = 200
Y56_train_df=pd.DataFrame({'label':Y56_train})
pairs56_train_200 = generate_random_sample_pairs(Y56_train_df,K)

K_test = 50 # a quarter of the train K, following the 20 / 80 % test / train ratio
Y56_test_df=pd.DataFrame({'label':Y56_test})
pairs56_test_50 = generate_random_sample_pairs(Y56_test_df,K_test)

In [None]:
# Pair generation with Y56 (56 classes = Y x S = 28 x 2)
# The frames must be built from a dict ({'label': values}); the previous
# {'label', values} was a set literal and raised TypeError (unhashable ndarray).

K = 400
Y56_train_df=pd.DataFrame({'label':Y56_train})
pairs56_train_400 = generate_random_sample_pairs(Y56_train_df,K)

K_test = 100 # a quarter of the train K, following the 20 / 80 % test / train ratio
Y56_test_df=pd.DataFrame({'label':Y56_test})
pairs56_test_100 = generate_random_sample_pairs(Y56_test_df,K_test)

In [None]:
# SAVE FILES
# File names must match the entries of the `Pairs` list used when re-loading:
# train pairs go to '*_train_*' files and test pairs to '*_test_*' files.

# with Y (28 labels)

with open(path_pkl+'sample_pair_indices_train_8000.pkl', 'wb') as f:
    pickle.dump(pairs_train_8000, f)
with open(path_pkl+'sample_pair_indices_test_2000.pkl', 'wb') as f:
    pickle.dump(pairs_test_2000, f)

with open(path_pkl+'sample_pair_indices_train_4000.pkl', 'wb') as f:
    pickle.dump(pairs_train_4000, f)
with open(path_pkl+'sample_pair_indices_test_1000.pkl', 'wb') as f:
    pickle.dump(pairs_test_1000, f)

# with Y56 (Y = 28 labels x S = 2 labels)

# Previously the train pairs were written to '..._test_400.pkl' / '..._test_200.pkl',
# so the '..._train_...' files expected by the re-load cell were never created.
with open(path_pkl+'sample_pair56_indices_train_400.pkl', 'wb') as f:
    pickle.dump(pairs56_train_400, f)
with open(path_pkl+'sample_pair56_indices_test_100.pkl', 'wb') as f:
    pickle.dump(pairs56_test_100, f)

with open(path_pkl+'sample_pair56_indices_train_200.pkl', 'wb') as f:
    pickle.dump(pairs56_train_200, f)
with open(path_pkl+'sample_pair56_indices_test_50.pkl', 'wb') as f:
    pickle.dump(pairs56_test_50, f)

In [None]:
######################################################
#        GENERATE SAMPLES (size = 1537 = 2x768 + 1)
######################################################

#------------------------------------------------------
#-      RE-LOAD SAMPLE PAIRS INDICES
#            (if necessary)
#------------------------------------------------------

# One [train file, test file] entry per pair-generation run
Pairs =[['sample_pair_indices_train_8000.pkl','sample_pair_indices_test_2000.pkl'],
        ['sample_pair_indices_train_4000.pkl','sample_pair_indices_test_1000.pkl'],
        ['sample_pair56_indices_train_400.pkl','sample_pair56_indices_test_100.pkl'],
        ['sample_pair56_indices_train_200.pkl','sample_pair56_indices_test_50.pkl'],
        ]


path_pkl = 'pkl/'
i = 0  # which entry of Pairs to load
# Fixes: the list is named `Pairs` (the lowercase name was undefined), the test
# indices come from the 2nd file of the entry, and the stray unary '+' is removed.
sample_pairs_indices_train = pd.read_pickle(path_pkl+Pairs[i][0])
sample_pairs_indices_test = pd.read_pickle(path_pkl+Pairs[i][1])
print(sample_pairs_indices_train.shape,sample_pairs_indices_test.shape)

NameError: name 'pd' is not defined

In [None]:
#------------------------------------------------------
#-      GENERATE SAMPLES
#------------------------------------------------------
#
# Generate the sample of pairs for train
# sample dim (n,1537) with 1537 = 1st embedding (dim=768) + 2nd embedding  (dim=768) + label (dim=1)
# NOTE(review): this comment describes 768-dimensional embeddings, while get_pair_sample and the
# prediction cell slice 300-dimensional halves (601 columns) - TODO confirm which one is intended.
# NOTE(review): `train` and `test` are not defined in any earlier visible cell; they are presumably
# DataFrames with an 'average_embedding' column indexed like the pairs - verify before running.
pair_sample_train=get_pair_sample(train,sample_pairs_indices_train)#.iloc[:10000,:])
print(pair_sample_train.shape)

# Generate the sample of pairs for test
pair_sample_test=get_pair_sample(test,sample_pairs_indices_test)#.iloc[:10000,:])
print(pair_sample_test.shape)



# REMOVE NaN VALUES (when 'tokens_to_embed'=[] i.e. no words were embedded )
# get_pair_sample leaves NaN where an embedding did not have the expected length.

# train
pair_sample_train=pair_sample_train.dropna()
initial=sample_pairs_indices_train.shape[0]
final=pair_sample_train.shape[0]
print('size of pair_sample:',pair_sample_train.shape,f'\n{initial-final} lines dropped due to NaN values')

# test
pair_sample_test=pair_sample_test.dropna()
initial_test=sample_pairs_indices_test.shape[0]
final_test=pair_sample_test.shape[0]
print('size of pair_sample (test):',pair_sample_test.shape,f'\n{initial_test-final_test} lines dropped due to NaN values')

#------------------------------------------------------
#-      SAVE SAMPLES
#------------------------------------------------------

# SAVE PAIR SAMPLE (train)
with open(path_pkl+'pair_sample_train.pkl', 'wb') as f:
    pickle.dump(pair_sample_train, f)
    
# SAVE PAIR SAMPLE (test)
with open(path_pkl+'pair_sample_test.pkl', 'wb') as f:
    pickle.dump(pair_sample_test, f)

In [None]:
###################################################
#   3. PREDICTIONS
###################################################

#--------------------------------
# RELOAD THE PAIR SAMPLES
#---------------------------------
# Each sample holds 600 feature columns (two 300-value embeddings) followed by
# the pair label in column 600. X_train / X_test are rebound to pair features here.

path_pkl = 'pkl/'

# train sample of pairs
train = pd.read_pickle(f'{path_pkl}pair_sample_train.pkl')
print('train shape',train.shape)
X_train = train.iloc[:,:600]
y_train = train.iloc[:,600].astype('int')

# test sample of pairs
test = pd.read_pickle(f'{path_pkl}pair_sample_test.pkl')
print('test shape',test.shape)
X_test = test.iloc[:,:600]
y_test = test.iloc[:,600].astype('int')

#-----------------------------------------------
# SCALING (disabled)
#-----------------------------------------------
#from sklearn import preprocessing
#scaler = preprocessing.StandardScaler().fit(X_train)
#X_train = scaler.transform(X_train)
#X_test = scaler.transform(X_test)


#-----------------------------------------------
# TRAIN LOGISTIC REGRESSION
#-----------------------------------------------

# Binary classifier: is the pair positive (1) or negative (0)?
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

In [None]:
#--------------------------------
# EVALUATE
#---------------------------------
# NOTE(review): accuracy_score has no explicit import in this notebook; it presumably comes
# from `from evaluator import *` - confirm, or import it from sklearn.metrics.

# Calculate base rate (not implemented yet)
# N pairs / N positive
# target rate >> N_positive / N_total

# Calculate accuracy on the train pairs
y_train_pred = classifier.predict(X_train)
accuracy_train = accuracy_score(y_train, y_train_pred)
print('train accuracy',accuracy_train)

# Make predictions on the test pairs
y_pred = classifier.predict(X_test)

# Calculate accuracy on the test pairs
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy (test):", accuracy)