In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from scipy.stats import skew

# --- Missing values overview ---
# Lift pandas' display limits so that wide / long outputs are shown in full.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Load the raw table and turn the 'Unavailable' / 'Unknown' placeholders into
# pd.NA so that isna() counts them as missing.
df = (
    pd.read_csv('df_3.csv')
    .replace('Unavailable', pd.NA)
    .replace('Unknown', pd.NA)
)

# Missing entries per column, followed by the number of columns inspected.
null_counts = df.isna().sum()
print(null_counts)
print(len(null_counts))


FileNotFoundError: [Errno 2] No such file or directory: 'df_3.csv'

In [None]:

# Number of rows; denominator of the per-column missing-value ratio
total_rows = len(df)

# Columns in which fewer than 2% of the rows are missing
columns_with_few_nulls = null_counts[null_counts / total_rows < 0.02].index
print(len(columns_with_few_nulls))

# Keep only those columns. The explicit .copy() makes new_df an independent
# frame, so the column assignments performed on new_df further down the
# notebook never act on a selection of df (no SettingWithCopyWarning, and df
# itself stays untouched). Unneeded columns are dropped in a separate step.
new_df = df[columns_with_few_nulls].copy()

# Remaining missing values per retained column
new_null_counts = new_df.isna().sum()
print(new_null_counts)



In [None]:
# Columns that are excluded from the working table
columns_to_drop = ["Country Iso Code", "Region", "Year", "Summary Paragraph", "Assessment_Level"]

# Remove them, then persist the reduced table (row index not written)
new_df = new_df.drop(labels=columns_to_drop, axis=1)
new_df.to_csv('new_df.csv', index=False)


In [None]:
# Print a concise summary of new_df (column dtypes and non-null counts)
new_df.info()

In [None]:
target_column = 'Percent Of Working Children'

# Coerce the column to numbers; entries that cannot be parsed become NaN
new_df[target_column] = pd.to_numeric(new_df[target_column], errors='coerce')

# Only the observed (non-missing) values are used to study the distribution
cleaned_data = new_df[target_column].dropna()

# Histogram with a KDE overlay to inspect the shape of the distribution
plt.figure(figsize=(10, 6))
sns.histplot(cleaned_data, kde=True)
plt.title('Distribution of Percent Of Working Children (excluding missing values)')
plt.xlabel('Percent Of Working Children')
plt.ylabel('Frequency')
plt.show()

# Candidate fill values
mean_value, median_value = cleaned_data.mean(), cleaned_data.median()

print(f"Mean: {mean_value}")
print(f"Median: {median_value}")

# Rule used here: |skewness| above 1 -> impute with the median, otherwise
# impute with the mean
if abs(cleaned_data.skew()) > 1:
    print("The distribution is significantly skewed. Consider using median for imputation.")
    imputed_value = median_value
else:
    print("The distribution is not significantly skewed. Consider using mean for imputation.")
    imputed_value = mean_value

# Fill the missing entries with the chosen value
new_df[target_column] = new_df[target_column].fillna(imputed_value)



In [None]:
# Collect the distinct values of every column of new_df, keyed by column name
unique_values_in_columns = {}
for column in new_df.columns:
    unique_values_in_columns[column] = new_df[column].unique()
unique_values_in_columns


In [None]:
# Columns that hold exactly one distinct value (.unique() counts NaN as a value)
columns_with_single_value = [
    column for column in new_df.columns if len(new_df[column].unique()) == 1
]
print(columns_with_single_value)

# Remove those columns from new_df in place
new_df.drop(columns=columns_with_single_value, inplace=True)

# Show the first rows to confirm that the columns are gone
new_df.head()


In [None]:
# Special handling for some anomalous data
columns_to_clean = ['Enforcement Labor Authorized Assess Penalties', 'Coordination']
for column in columns_to_clean:
    # Trim surrounding whitespace first, then treat both missing values and
    # empty strings as 'No'
    stripped_values = new_df[column].str.strip()
    new_df[column] = stripped_values.fillna('No').replace({'': 'No'})

# '13-Dec' is treated as a recording error for '13'; afterwards the whole
# column is cast to integer.
# NOTE(review): '13-Dec' looks like a spreadsheet date auto-format (possibly of
# '12-13') - confirm that 13 is the intended value.
age_column = 'Compulsory Education Age'
new_df[age_column] = new_df[age_column].replace('13-Dec', '13').astype(int)

# Distinct values per column after the clean-up
unique_values_in_columns = {}
for column in new_df.columns:
    unique_values_in_columns[column] = new_df[column].unique()
unique_values_in_columns


In [None]:
# Columns that are stored as (nullable) integers
integer_columns = [
    'Country Numeric Iso Code',
    'Year Added',
    'Region Id',
    'Assessment Level Id',
    'Minimum Work Age',
    'Minimum Hazardous Work Age',
    'Minumum Voluntary Military Age',
    'Compulsory Education Age',
]

for column in integer_columns:
    # Step 1: parse as numbers, turning non-numeric entries into NaN.
    numeric_values = pd.to_numeric(new_df[column], errors='coerce')
    # Step 2: nullable 'Int64' keeps those missing entries as <NA>.
    new_df[column] = numeric_values.astype('Int64')

# Distinct values per column after the conversion
unique_values_in_columns = {}
for column in new_df.columns:
    unique_values_in_columns[column] = new_df[column].unique()
unique_values_in_columns


In [None]:
# Yes/No columns that are encoded as 1/0
binary_columns = [
    'Crc Csec Ratified',
    'Crc Ac Ratified',
    'Palermo Ratified',
    'Minimum Work Conforms Standard',
    'Minimum Hazardous Work Conforms Standard',
    'Identification Of Hazardous Occupations List',
    'Prohibition Forced Labor Conforms Standard',
    'Prohibition Child Trafficking Conforms Standard',
    'Prohibition Csec Conforms Standard',
    'Prohibition Illicit Activities Conforms Standard',
    'Compulsory Education Conforms Standard',
    'Free Public Education Conforms Standard',
    'Enforcement Labor Authorized Assess Penalties',
    'Coordination',
    'Policy',
    'Program',
]

yes_no_codes = {'Yes': 1, 'No': 0}
for column in binary_columns:
    # Substitute the codes, then cast the column to integer
    new_df[column] = new_df[column].replace(yes_no_codes).astype(int)

# Distinct values per column after the binary encoding
unique_values_in_columns = {}
for column in new_df.columns:
    unique_values_in_columns[column] = new_df[column].unique()
unique_values_in_columns


In [None]:
# Age ranges are encoded by their lower bound; values that are not listed in
# the mapping become NaN
age_range_mapping = {'5 to 14': 5, '6 to 14': 6, '7 to 14': 7, '10 to 14': 10}
for column in (
    'Age Range Of Children Attending School',
    'Age Range Of Children Working And Studying',
    'Age Range Of Working Children',
):
    new_df[column] = new_df[column].map(age_range_mapping)

# Exploitation type: higher numbers are assumed to indicate more severe
# exploitation.
# NOTE(review): 'FL' shares code 0 with 'DER' - confirm this is intended.
exploitation_type_mapping = {'DER': 0, 'CL': 1, 'FL': 0, 'FCL': 2}
new_df['Exploitation_Type'] = new_df['Exploitation_Type'].map(exploitation_type_mapping)

# Due diligence response: higher numbers are assumed to indicate a higher
# level of response
due_diligence_response_mapping = {'Basic': 1, 'Heightened': 2, 'Enhanced': 3}
new_df['Due_Diligence_Reponse'] = new_df['Due_Diligence_Reponse'].map(due_diligence_response_mapping)

# Show the first rows to confirm the changes
new_df.head()


In [None]:
# Perform one-hot encoding on categorical features
categorical_columns = ['Sector', 'Good']
new_df = pd.get_dummies(new_df, columns=categorical_columns)

# One 'Good_A... Berries' dummy carries accented characters that arrive
# mis-encoded (presumably "Acai Berries" written with accents - confirm).
# Matching on the ASCII prefix/suffix plus "contains non-ASCII characters"
# instead of on one exact garbled spelling keeps the rename working no matter
# how the CSV bytes were decoded; a hard-coded garbled key silently renames
# nothing as soon as the decoding differs.
garbled_berries_columns = [
    name
    for name in new_df.columns
    if isinstance(name, str)
    and name.startswith('Good_A')
    and name.endswith(' Berries')
    and not name.isascii()
]
new_df = new_df.rename(columns=dict.fromkeys(garbled_berries_columns, 'Good_Berries'))

# Display the first few rows of the DataFrame to confirm changes
new_df.head()


In [None]:
# Distinct values per column after the one-hot encoding
unique_values_in_columns = {}
for column in new_df.columns:
    unique_values_in_columns[column] = new_df[column].unique()
unique_values_in_columns

In [None]:
# Count the missing values per column and print only the columns that still
# contain at least one
null_values = new_df.isnull().sum()
has_missing = null_values > 0
print(null_values[has_missing])


In [None]:
# Encoding True/False Binary Columns
# Every column after the first one is cast to int, EXCEPT columns that already
# have a float dtype: a blanket astype('int') would truncate their fractional
# part (e.g. 12.7 -> 12), which is not what an encoding step for True/False
# dummies should do. Boolean dummies and nullable-integer columns are still
# converted exactly as before.
dtypes_after_first = new_df.dtypes.iloc[1:]
columns_to_cast = [
    name
    for name, dtype in dtypes_after_first.items()
    if not pd.api.types.is_float_dtype(dtype)
]
if columns_to_cast:
    new_df[columns_to_cast] = new_df[columns_to_cast].astype('int')

new_df.head()

In [None]:
from scipy.stats import chi2_contingency, f_oneway

# Features treated as continuous (tested with ANOVA)
continuous_features = [
    'Index_score',
    'Legal_framework_score',
    'Enforcement_score',
    'Outcome_score',
    'Percent Of Working Children',
    'Percent Of Working Children Agriculture',
    'Percent Of Working Children Industry',
    'Percent Of Working Children Services',
]

# Every other column, apart from the target and the country label, is treated
# as categorical (tested with chi-square)
non_feature_columns = ['Exploitation_Type', 'Country']
categorical_features = [
    name
    for name in new_df.drop(columns=non_feature_columns).columns
    if name not in continuous_features
]

target = new_df['Exploitation_Type']

# Names of the features whose test p-value is below 0.1
significant_columns = []

# Categorical feature selection: chi-square test on the feature-vs-target
# contingency table
for column in categorical_features:
    contingency_table = pd.crosstab(new_df[column], target)
    _, p_val, _, _ = chi2_contingency(contingency_table)

    # Keep the column only when the association is significant
    if p_val < 0.1:
        significant_columns.append(column)

# Continuous feature selection: one-way ANOVA across the three target classes
for column in continuous_features:
    samples_per_class = [new_df.loc[target == level, column] for level in (0, 1, 2)]
    _, p_val_anova = f_oneway(*samples_per_class)

    # Keep the column only when the class means differ significantly
    if p_val_anova < 0.1:
        significant_columns.append(column)


In [None]:
# Display the features that passed the p < 0.1 significance filter
significant_columns

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical

# Feature matrix (significant columns only) and encoded target
x = new_df[significant_columns]
y = new_df['Exploitation_Type']

# Hold out 20% of the rows as the test set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Oversample the training portion only; the test set is left as sampled
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

# Classical models. Note that the model named "ordinal logistic" is configured
# as a multinomial logistic regression (multi_class='multinomial').
ordinal_logistic = LogisticRegression(solver='saga', max_iter=5000, C=1, multi_class='multinomial')
svm = SVC(kernel='rbf', C=0.1, probability=True)
rf = RandomForestClassifier(n_estimators=50, max_depth=20, random_state=0)
ada = AdaBoostClassifier(n_estimators=100, learning_rate=0.01, random_state=0)

# Fit them one after another on the oversampled training data
for classifier in (ordinal_logistic, svm, rf, ada):
    classifier.fit(x_train_resampled, y_train_resampled)

# Neural network: the targets are one-hot encoded over the three classes
y_train_resampled_onehot = to_categorical(y_train_resampled, num_classes=3)
y_test_onehot = to_categorical(y_test, num_classes=3)

# Six hidden layers of 100 units with alternating sigmoid / relu activations,
# followed by a 3-way softmax output
nn = Sequential([
    Dense(100, activation='sigmoid', input_dim=x_train_resampled.shape[1]),
    Dense(100, activation='relu'),
    Dense(100, activation='sigmoid'),
    Dense(100, activation='relu'),
    Dense(100, activation='sigmoid'),
    Dense(100, activation='relu'),
    Dense(3, activation='softmax'),
])
nn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train silently; the held-out test set doubles as the validation data here
history = nn.fit(
    x_train_resampled,
    y_train_resampled_onehot,
    epochs=200,
    batch_size=16,
    validation_data=(x_test, y_test_onehot),
    verbose=0,
)

# Accuracies recorded by Keras for the final epoch
final_epoch = history.history
train_accuracy = final_epoch['accuracy'][-1]
val_accuracy = final_epoch['val_accuracy'][-1]

# Test the model
# Class predictions of the scikit-learn models on the held-out test set and on
# the original (non-resampled) training split
y_pred_log = ordinal_logistic.predict(x_test)
y_tpred_log = ordinal_logistic.predict(x_train)

y_pred_svm = svm.predict(x_test)
y_tpred_svm = svm.predict(x_train)

y_pred_rf = rf.predict(x_test)
y_tpred_rf = rf.predict(x_train)

y_pred_ada = ada.predict(x_test)
y_tpred_ada = ada.predict(x_train)

# The network outputs one probability per class; argmax turns them into class
# labels. Each prediction is computed once and reused below.
y_pred_nn = nn.predict(x_test)
y_tpred_nn = nn.predict(x_train)
y_pred_probs = y_pred_nn
y_pred_conv = np.argmax(y_pred_probs, axis=1)
y_tpred_conv = np.argmax(y_tpred_nn, axis=1)

# Accuracies. All five models are scored on the same two data sets (x_test and
# the original x_train), so the train/test comparison is like for like. The
# network is scored from its predictions rather than from the Keras training
# history, whose training accuracy refers to the oversampled data.
acc_test1 = accuracy_score(y_test, y_pred_log)
acc_train1 = accuracy_score(y_train, y_tpred_log)

acc_test2 = accuracy_score(y_test, y_pred_svm)
acc_train2 = accuracy_score(y_train, y_tpred_svm)

acc_test3 = accuracy_score(y_test, y_pred_rf)
acc_train3 = accuracy_score(y_train, y_tpred_rf)

acc_test4 = accuracy_score(y_test, y_pred_ada)
acc_train4 = accuracy_score(y_train, y_tpred_ada)

acc_test5 = accuracy_score(y_test, y_pred_conv)
acc_train5 = accuracy_score(y_train, y_tpred_conv)

# Evaluation: weighted F1 on the test set ('weighted' accounts for label
# imbalance)
f1_reports = [
    ("Ordinal Logistic Regression Test F1:", y_pred_log),
    ("SVM Test F1:", y_pred_svm),
    ("Random Forest Test F1:", y_pred_rf),
    ("AdaBoost Test F1:", y_pred_ada),
    ("Neural Network Test F1:", y_pred_conv),
]
for report_label, predictions in f1_reports:
    f1_test = f1_score(y_test, predictions, average='weighted')
    print(report_label, f1_test)

print("\n----------------------\n")
## Accuracy
print("Ordinal Logistic Regression Test Accuracy:", acc_test1)
print("SVM Test Accuracy:", acc_test2)
print("Random Forest Test Accuracy:", acc_test3)
print("AdaBoost Test Accuracy:", acc_test4)
print("Neural Network Test Accuracy:", acc_test5)

# Use an ensemble model
from sklearn.ensemble import VotingClassifier

# Hard (majority) voting over the four scikit-learn classifiers
voting_members = [('lr', ordinal_logistic), ('svm', svm), ('rf', rf), ('ada', ada)]
ensemble = VotingClassifier(estimators=voting_members, voting='hard')

# Fit on the oversampled training data and predict the held-out test set
ensemble.fit(x_train_resampled, y_train_resampled)
y_pred_ens = ensemble.predict(x_test)

# Test metrics of the ensemble
acc_test_ens = accuracy_score(y_test, y_pred_ens)
f1_test_ens = f1_score(y_test, y_pred_ens, average='weighted')

print("Ensemble Test F1:", f1_test_ens)
print("Ensemble Test Accuracy:", acc_test_ens)


# Plot training and test accuracy of the five individual models side by side
import matplotlib.pyplot as plt

# Model names, in the same order as the accuracy lists below
model_names = ['Ordinal Logistic Regression', 'SVM', 'Random Forest', 'AdaBoost', 'Neural Network']

# Train and test accuracies per model
train_accuracies = [acc_train1, acc_train2, acc_train3, acc_train4, acc_train5]
test_accuracies = [acc_test1, acc_test2, acc_test3, acc_test4, acc_test5]

# Create a figure and axis object
fig, ax = plt.subplots(figsize=(10, 6))

# Tick positions, one per model. Deliberately NOT called `x`: that name holds
# the feature matrix, which later cells still pass to predict_proba, and
# rebinding it here would break them.
bar_positions = np.arange(len(model_names))
ax.set_xticks(bar_positions)
ax.set_xticklabels(model_names, rotation=45, ha='right')

# Grouped bars: train on the left, test on the right of each tick
bar_width = 0.35
rects1 = ax.bar(bar_positions - bar_width / 2, train_accuracies, bar_width, label='Train Accuracy')
rects2 = ax.bar(bar_positions + bar_width / 2, test_accuracies, bar_width, label='Test Accuracy')

# Add labels and legend
ax.set_ylabel('Accuracy')
ax.set_title('Train vs Test Accuracy Comparison')
ax.legend()

# Write each bar's value (two decimals) just above the bar
for rect1, rect2 in zip(rects1, rects2):
    for rect in (rect1, rect2):
        ax.text(rect.get_x() + rect.get_width() / 2, rect.get_height() + 0.01, f'{rect.get_height():.2f}', ha='center', va='bottom')

# Adjust layout and display the plot
plt.tight_layout()
plt.show()

In [None]:
# Print the predicted class labels and how often each occurs: first for the
# five models' test predictions, then for the true test labels
for labels in (y_pred_log, y_pred_svm, y_pred_rf, y_pred_ada, y_pred_conv, y_test):
    unique_values, counts = np.unique(labels, return_counts=True)
    print(unique_values, counts)


In [None]:
# Fitted parameters of the logistic regression
coefficients = ordinal_logistic.coef_
print(coefficients)

bias = ordinal_logistic.intercept_
print(bias)

classes = ordinal_logistic.classes_
print(classes)

features = ordinal_logistic.feature_names_in_
print(features)

# Save them in a txt file: for every feature one line per class (0, 1, 2),
# followed by a final line holding the three intercepts
with open('ordinal_logistic_coefficients.txt', 'w') as f:
    for feature_index, feature in enumerate(features):
        for class_index in range(3):
            f.write('Class ' + str(class_index) + ' - ' + feature + ':' + str(coefficients[class_index][feature_index]) + '\n')
    f.write('biases:' + ', '.join(str(bias[class_index]) for class_index in range(3)))

# For each class, report the five coefficients with the largest absolute value
for class_index, class_label in enumerate(classes):
    class_coefficients = coefficients[class_index]
    # Absolute weights of this class and their total
    abs_coef = [abs(weight) for weight in class_coefficients]
    total_sum = sum(abs_coef)
    # Indexes of the five largest absolute weights, largest first
    top_5_indexes = np.argsort(abs_coef)[-5:][::-1]
    top_sum = sum(abs(weight) for weight in class_coefficients[top_5_indexes])
    # Class label, top-5 feature names, their weights, and both sums
    print(class_label, features[top_5_indexes], class_coefficients[top_5_indexes], top_sum, total_sum)






In [None]:
# Each Country Risk Score

# Class probabilities for every row of the full feature table. The features are
# selected from new_df explicitly instead of through a short notebook-level
# alias such as `x`, which other cells may rebind to something else.
country_risk_probabilities = ordinal_logistic.predict_proba(new_df[significant_columns])

# One-hot 'Good_*' dummy columns of new_df
pattern = r'^Good_'
selected_cols_goods = new_df.filter(regex=pattern)
goods_cols = selected_cols_goods.columns.tolist()

# Recover the good of every row: the first dummy column set to 1, with the
# 'Good_' prefix removed; NaN when no dummy is set (should not happen)
good_names = []
for _, row in selected_cols_goods.iterrows():
    active_columns = row.index[row == 1]
    if len(active_columns):
        good_names.append(active_columns[0].replace('Good_', ''))
    else:
        good_names.append(np.nan)

# Single-column frame (column 0) holding the good name of every row
selected_cols_goods = pd.DataFrame(good_names)

print(selected_cols_goods.shape)
print(new_df.shape)

# Expected class label per row. The columns of predict_proba follow
# classes_, so weighting by classes_ (0, 1, 2 for this target encoding) stays
# correct even if a class were absent from the training data.
country_risk_score = np.dot(country_risk_probabilities, ordinal_logistic.classes_)

# (country, value, good) triples, paired up by row position
probabilities = list(zip(new_df['Country'], country_risk_probabilities, selected_cols_goods[0]))
risks = list(zip(new_df['Country'], country_risk_score, selected_cols_goods[0]))

# Sort both lists by country name
probability_sorted = sorted(probabilities, key=lambda entry: entry[0])
risk_sorted = sorted(risks, key=lambda entry: entry[0])

print("Probabilities: \n", probability_sorted)

print("Risks: \n", risk_sorted)

In [None]:
# One line per row: the country followed by its class probabilities
for country, class_probabilities, _ in probability_sorted:
    print(country + ':', class_probabilities)

In [None]:
# One line per row: the country followed by its risk score
for country, risk, _ in risk_sorted:
    print(country + ':', risk)

In [None]:
# Saving Risks and Probabilities
import csv

# Risks: one (country, risk, good) row per entry
with open('risks.csv', 'w', newline='') as csvfile:
    risk_writer = csv.writer(csvfile)
    risk_writer.writerow(['Country', 'Risk', 'Good'])
    risk_writer.writerows(risk_sorted)

# Probabilities: the per-class probabilities are spread over three columns
with open('probabilities.csv', 'w', newline='') as csvfile:
    probability_writer = csv.writer(csvfile)
    probability_writer.writerow(['Country', 'Probability of no child labour', 'Probability of child labour', 'Probability of forced child labour', 'Good'])
    for country, class_probabilities, good in probability_sorted:
        probability_writer.writerow([country] + list(class_probabilities) + [good])



## Influence Functions
### (Extra - has some limitations, since the influence of most instances is very evenly distributed and, at times, the inverse Hessian is not computable unless the partial inverse is used.)
It was seen that, by removing 60 - 80 influential points, the model achieved slightly higher F1 and accuracy scores (from 64 to 68 in some cases). However, this process needs to be checked by an expert in the field of forced labour to determine why these influential points are so important and whether they should be removed, tweaked or left untouched.

In [None]:
# import numpy as np
# import pandas as pd

# ## Testing Influence Functions
# # Only for Log Regression with sigmoid loss function
# class InfluenceFunction(object):
#     H_inv = None
#     theta_hat = None
#     X = None
#     y = None
#     z = None
#     influence_params = []
#     influence_loss = []

#     def __init__(self, model, X, y):
#         self.model = model
#         self.theta_hat = model.coef_[0]
#         H = self.hessian(X.values, self.theta_hat)
#         self.H_inv = np.linalg.inv(H)
#         self.X = X
#         self.y = y

#     def hessian(self, X, theta):
#         p = 1 / (1 + np.exp(-X @ theta))
#         return (X.T * p * (1 - p)) @ X

#     def grad_loss(self, x, y, theta):
#         p = 1 / (1 + np.exp(-x @ theta))
#         return (p - y) * x

#     # I_up,params(z) = -H^(-1) ∇_θ L(z, θ^)
#     def influence_function(self):
#         self.influence_params = []
#         for i in range(len(self.X)):
#             x, y_i = self.X.values[i], self.y.iloc[i]
#             grad = self.grad_loss(x, y_i, self.theta_hat)
#             influence = -self.H_inv @ grad
#             self.influence_params.append(influence)
#         return self.influence_params

#     # I_up,loss(z, z_test) = -∇_θ L(z_test, θ^)^T H^(-1) ∇_θ L(z, θ^)
#     def influence_function_test(self, z):
#         # Compute the loss gradient at the test point z
#         # ---------------
#         # # NOTES:
#         # # *Positive* influence values indicate that the corresponding training
#         # # points have a negative impact on the model's prediction for the test point.
#         # # ----
#         # # *Negative* influence values contribute to reducing the loss
#         # # or improving the model's performance for the given test point.
#         # ---------------
#         # ∇_θ L(z_test, θ^)^T
#         self.z = z
#         test_grad = self.grad_loss(z[0], z[1], self.theta_hat).T

#         # Compute the influence of each training point
#         self.influence_loss = []
#         for i in range(len(self.X)):
#             x, y_i = self.X.values[i], self.y.iloc[i]
#             # ∇_θ L(z, θ^)
#             train_grad = self.grad_loss(x, y_i, self.theta_hat)
#             influence = -test_grad @ self.H_inv @ train_grad
#             self.influence_loss.append(influence)

#         return self.influence_loss

#     def get_most_influetial_points(self, n=2, asc=False):
#         # Append 'Influence_' to each self.X.columns
#         ip = pd.DataFrame(self.influence_params, columns=['Influence_' + str(i+1) for i in range(len(self.X.columns))])
#         total_n = len(ip)

#         # Add columns for the training points
#         for i in range(len(self.X.columns)):
#             ip[f'X_{i+1}'] = self.X[self.X.columns[i]]

#         ip['influence_sum'] = ip[['Influence_' + str(i+1) for i in range(len(self.X.columns))]].abs().sum(axis=1)

#         # Calculate Aprox_param_i = θ^ - I_up,params(z) / n
#         # ip = pd.merge(ip, columns=['Aprox_param_' + self.X.columns[i] for i in range(len(self.X.columns))])
#         for i in range(len(self.X.columns)):
#             ip[f'Aprox_param_{i+1}'] = self.theta_hat[i] - (ip["Influence_" + str(i+1)] / total_n)


#         ip.sort_values(by='influence_sum', ascending=asc, inplace=True)
#         return ip.head(n)

#     def get_most_influetial_points_test(self, n=2, positives=True):
#         ip = pd.DataFrame(self.influence_loss, columns=['loss'])
#         # Add columns for the training points
#         for i in range(len(self.X.columns)):
#             ip[f'X_{i+1}'] = self.X[self.X.columns[i]]

#         ip.sort_values(by='loss', ascending=positives, inplace=True)
#         return ( ip.head(n), self.z )

In [None]:
# ifunc = InfluenceFunction(ordinal_logistic, x_train_resampled, y_train_resampled)
# inf = ifunc.influence_function()
# # test_point = x_train.iloc[13]
# # test_label = y_train[13]
# # inf_z = ifunc.influence_function_test((test_point, test_label))
# # print(pd.DataFrame(inf, columns=x_train.columns))
# # print('---------------')
# # print(pd.DataFrame(inf_z, columns=['Influence_Loss_zTest']))
# mip = ifunc.get_most_influetial_points(n=20)
# mip

In [None]:
# mip = ifunc.get_most_influetial_points(n=60)
# indexes = mip.index.values

# # remove the most influential points (indexes) from x_train
# x_train_2 = x_train_resampled.drop(indexes)
# y_train_2 = y_train_resampled.drop(indexes)


# # Train the ordinal logistic regression model
# ordinal_logistic = LogisticRegression(solver='saga', max_iter=5000, C=1, multi_class='multinomial')
# ordinal_logistic.fit(x_train_2, y_train_2)


# # Test the model
# y_pred_log = ordinal_logistic.predict(x_test)
# y_tpred_log = ordinal_logistic.predict(x_train)
# acc_test1 = accuracy_score(y_test, y_pred_log)
# acc_train1 = accuracy_score(y_train, y_tpred_log)

# f1_test = f1_score(y_test, y_pred_log, average='weighted')
# print("Ordinal Logistic Regression Test F1:", f1_test)

# ## Accuracy
# print("Ordinal Logistic Regression Test Accuracy:", acc_test1)

# # plot training and test accuracy
# import matplotlib.pyplot as plt

# # Create a list of model names
# model_names = ['Ordinal Logistic Regression', 'SVM', 'Random Forest', 'AdaBoost', 'Neural Network']

# # Create lists for train and test accuracies
# train_accuracies = [acc_train1, acc_train2, acc_train3, acc_train4, acc_train5]
# test_accuracies = [acc_test1, acc_test2, acc_test3, acc_test4, acc_test5]

# # Create a figure and axis object
# fig, ax = plt.subplots(figsize=(10, 6))

# # Set the x-axis tick positions and labels
# x = np.arange(len(model_names))
# ax.set_xticks(x)
# ax.set_xticklabels(model_names, rotation=45, ha='right')

# # Plot the train and test accuracies
# bar_width = 0.35
# rects1 = ax.bar(x - bar_width / 2, train_accuracies, bar_width, label='Train Accuracy')
# rects2 = ax.bar(x + bar_width / 2, test_accuracies, bar_width, label='Test Accuracy')

# # Add labels and legend
# ax.set_ylabel('Accuracy')
# ax.set_title('Train vs Test Accuracy Comparison')
# ax.legend()

# # Add value labels on top of the bars
# for rect1, rect2, model in zip(rects1, rects2, model_names):
#     ax.text(rect1.get_x() + rect1.get_width() / 2, rect1.get_height() + 0.01, f'{rect1.get_height():.2f}', ha='center', va='bottom')
#     ax.text(rect2.get_x() + rect2.get_width() / 2, rect2.get_height() + 0.01, f'{rect2.get_height():.2f}', ha='center', va='bottom')

# # Adjust layout and display the plot
# plt.tight_layout()
# plt.show()

In [None]:
# unique_values, counts = np.unique(y_pred_log, return_counts=True)
# print(unique_values, counts)

# unique_values, counts = np.unique(y_test, return_counts=True)
# print(unique_values, counts)