# ML Model to Predict NTRU Hyperparamters

Data Preparation

In [2]:
import pandas as pd

# Load the data
df = pd.read_csv('ntru_performance_data.csv')

# Inspect the first few rows
print(df.head())

     N    q  dF  dg  dr  mLen  encryption_time  decryption_time
0  107  128  50  50  50   100               32               20
1  107  128  50  50  50   200               16               12
2  107  128  50  50  50   300               24               15
3  107  128  50  50  50   400               12               11
4  107  128  50  50  50   500               15               11


In [3]:
# Check for missing values
print(df.isnull().sum())

# Handle missing values if necessary
df = df.dropna()


N                  0
q                  0
dF                 0
dg                 0
dr                 0
mLen               0
encryption_time    0
decryption_time    0
dtype: int64


Mean Squared Error and R-squared for the model

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the data
df = pd.read_csv('ntru_performance_data.csv')

# Define features and target variables
X = df[['N', 'q', 'dF', 'dg', 'dr', 'mLen']]
y_encryption = df['encryption_time']
y_decryption = df['decryption_time']

# Split the data into training and testing sets
X_train, X_test, y_train_enc, y_test_enc = train_test_split(X, y_encryption, test_size=0.2, random_state=42)
X_train, X_test, y_train_dec, y_test_dec = train_test_split(X, y_decryption, test_size=0.2, random_state=42)

# Initialize and train the model for encryption time
model_enc = RandomForestRegressor(n_estimators=100, random_state=42)
model_enc.fit(X_train, y_train_enc)

# Predict and evaluate the model
y_pred_enc = model_enc.predict(X_test)
print('Encryption Time Mean Squared Error:', mean_squared_error(y_test_enc, y_pred_enc))
print('Encryption Time R-squared:', r2_score(y_test_enc, y_pred_enc))

# Initialize and train the model for decryption time
model_dec = RandomForestRegressor(n_estimators=100, random_state=42)
model_dec.fit(X_train, y_train_dec)

# Predict and evaluate the model
y_pred_dec = model_dec.predict(X_test)
print('Decryption Time Mean Squared Error:', mean_squared_error(y_test_dec, y_pred_dec))
print('Decryption Time R-squared:', r2_score(y_test_dec, y_pred_dec))


Encryption Time Mean Squared Error: 21.706591306666667
Encryption Time R-squared: 0.024132729119405005
Decryption Time Mean Squared Error: 14.502749013333332
Decryption Time R-squared: -0.06221208880558504


Only using a given set of common parameters to predict the best parameters overall.

In [11]:
import numpy as np
import warnings
warnings.filterwarnings("ignore", message="X does not have valid feature names")
# Define the range of NTRU parameters
N_values = [107, 167, 251, 347, 503]
q_values = [128, 256, 512]
dF_values = [50, 60, 70, 80, 90]
dg_values = [50, 60, 70, 80, 90]
dr_values = [50, 60, 70, 80, 90]
mLen_values = [100, 200, 300, 400, 500]

# Generate all combinations of parameters
param_combinations = [(N, q, dF, dg, dr, mLen) for N in N_values for q in q_values for dF in dF_values for dg in dg_values for dr in dr_values for mLen in mLen_values]

# Predict encryption and decryption times for all combinations
predictions = []
for params in param_combinations:
    N, q, dF, dg, dr, mLen = params
    X_new = np.array([[N, q, dF, dg, dr, mLen]])
    enc_time = model_enc.predict(X_new)[0]
    dec_time = model_dec.predict(X_new)[0]
    predictions.append((params, enc_time, dec_time))

# Find the best parameters based on the predictions
best_params_enc = min(predictions, key=lambda x: x[1])
best_params_dec = min(predictions, key=lambda x: x[2])

print('Best parameters for encryption time:', best_params_enc)
print('Best parameters for decryption time:', best_params_dec)


Best parameters for encryption time: ((503, 128, 80, 90, 90, 500), 9.1, 10.45)
Best parameters for decryption time: ((503, 512, 90, 50, 90, 300), 10.3, 8.89)


Predicting best parameters for lengths between 1 and 500 using the set of common parameters only.

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib

# Step 1: Read and Clean the CSV Data
input_file = 'best_p.csv'
cleaned_lines = []

with open(input_file, 'r') as file:
    for line in file:
        cleaned_line = line.strip().rstrip(',')  # Remove trailing comma and any extra whitespace
        cleaned_lines.append(cleaned_line)
        print(cleaned_line)

# Step 2: Save the Cleaned Data
# Convert cleaned lines to a list of lists
data_list = [list(map(int, line.split(','))) for line in cleaned_lines[1:]]

# Load the cleaned data into a DataFrame
columns = ['mLen', 'N', 'q', 'dF', 'dg', 'dr', 'encryption_time', 'decryption_time']
df = pd.DataFrame(data_list, columns=columns)

# Step 3: Split Data into Features and Targets
X = df[['mLen']]
y = df[['N', 'q', 'dF', 'dg', 'dr']]

# Step 4: Train a Machine Learning Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(max_depth=10, max_features="sqrt", min_samples_leaf=1,min_samples_split=10,n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Predict optimal parameters for a new message length
new_message_length = pd.DataFrame({'mLen': [200]})
predicted_params = model.predict(new_message_length)
print(f'Predicted Parameters for message length 200: {predicted_params}')

# Save the model for future use
joblib.dump(model, 'ntru_hyperparameter_tuning_model.pkl')


mLen,N,q,dF,dg,dr,actual_encryption_time,actual_decryption_time
510,649,1442,14,14,4,24,20
42,455,2033,11,9,17,17,22
1004,1066,2299,1,4,17,17,22
43,602,580,14,2,10,19,16
1501,445,2048,8,1,8,21,18
44,897,2388,19,6,3,18,32
45,361,1523,7,7,17,18,19
511,555,2020,12,4,14,25,23
46,257,2285,19,11,18,25,21
47,278,967,8,18,16,23,19
48,302,1156,16,18,13,16,16
512,992,1628,6,15,3,27,23
49,1146,1603,7,20,18,18,18
1005,716,2012,11,16,20,19,20
2001,441,674,4,1,1,21,21
50,1276,795,18,6,3,21,22
51,531,2086,12,9,12,20,16
1502,394,2019,20,13,13,19,22
52,678,2552,10,2,11,22,20
513,487,1232,15,4,2,19,25
53,298,1515,14,20,4,17,21
54,275,891,9,20,2,16,17
55,1051,2057,12,2,19,19,23
1006,528,1670,5,10,5,21,19
2501,744,1646,20,6,16,21,22
514,519,1994,2,6,6,17,15
56,418,2088,6,13,18,20,21
57,1054,2241,3,4,7,22,21
58,627,2491,2,18,6,24,26
515,859,1972,18,19,14,34,28
59,929,1138,12,20,4,33,27
60,429,1213,4,14,19,23,21
2002,1081,1413,1,10,6,26,20
1503,1037,1244,16,12,5,19,19
61,961,2063,14,3,13,19,21
1007,613,2240

['ntru_hyperparameter_tuning_model.pkl']

In [3]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the GridSearchCV object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f'Best parameters: {best_params}')

# Train the model with the best parameters
best_model = grid_search.best_estimator_

# Evaluate the tuned model
y_pred_tuned = best_model.predict(X_test)
mse_tuned = mean_squared_error(y_test, y_pred_tuned)
print(f'Tuned Mean Squared Error: {mse_tuned}')

# Save the tuned model
joblib.dump(best_model, 'ntru_hyperparameter_tuning_model_tuned.pkl')


540 fits failed out of a total of 1620.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
184 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 96, in validate_parameter_constraints
    raise InvalidParameterError(
s

Best parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Tuned Mean Squared Error: 99380.30088693571


['ntru_hyperparameter_tuning_model_tuned.pkl']

In [9]:
import pandas as pd
import numpy as np
import joblib

# Load the trained model
model = joblib.load('ntru_hyperparameter_tuning_model.pkl')

# Generate 500 random message lengths between 1 and 1000
random_lengths = np.random.randint(1, 1001, size=500)

# Predict best parameters for each message length
predicted_params = model.predict(pd.DataFrame({'mLen': random_lengths}))

# Combine message lengths and predicted parameters into a DataFrame
results_df = pd.DataFrame(predicted_params, columns=['N', 'q', 'dF', 'dg', 'dr'])
results_df['mLen'] = random_lengths

# Reorder columns to have 'mLen' as the first column
results_df = results_df[['mLen', 'N', 'q', 'dF', 'dg', 'dr']]

# Save the results to a CSV file
output_file = 'predicted_parameters.csv'
results_df.to_csv(output_file, index=False)

print(f'Results saved to {output_file}')


Results saved to predicted_parameters.csv
