# Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load Data

In [None]:
# Load the raw CSV and drop every row whose source ISP is Rogers.
file_path = r'C:\Users\File.csv'
data = pd.read_csv(file_path)

# Take an explicit copy of the filtered rows. The frame is modified later in
# the file (a new column is assigned and it is sorted in place); doing that on
# a bare boolean-mask selection can make pandas emit SettingWithCopyWarning.
data = data[data['SourceISP'] != 'Rogers'].copy()

# Data Mapping

In [None]:
# Give every distinct SourceIP an integer ServerID (0, 1, 2, ... in the order
# the addresses are returned by Series.unique) and store it as a new column.
unique_servers = data['SourceIP'].unique()
server_mapping = dict(zip(unique_servers, range(len(unique_servers))))
data['ServerID'] = data['SourceIP'].map(server_mapping)


# LSTM Sort & Sequence

In [None]:
# Sort data by user and timestamp so each user's rows are in time order
data.sort_values(by=['User', 'AdjustedTime'], inplace=True)

# Feature columns, declared once so that scaling and window extraction
# always operate on exactly the same columns in the same order.
feature_columns = [
    'DataLength', 'ARTT',
    'SourceLongitude', 'SourceLatitude',
    'DestinationLongitude', 'DestinationLatitude',
]

# Extract features and target
features = data[feature_columns]
labels = data['ServerID']

# Standardise the features. This overwrites the raw columns in `data`, so any
# later read of these columns sees standardised values, not the originals.
# NOTE(review): the scaler is fit on every row before the train/test split,
# so held-out rows contribute to the scaling statistics - confirm this is
# acceptable for the reported test accuracy.
scaler = StandardScaler()
data[feature_columns] = scaler.fit_transform(features)

# Group by user to create sequences
user_groups = data.groupby('User')
X_sequences = []
y_sequences = []

sequence_length = 60  # Window size in rows; adjust this as needed

for user, group in user_groups:
    user_features = group[feature_columns].values
    user_labels = group['ServerID'].values

    # Rolling windows with stride 1. A user with fewer than `sequence_length`
    # rows yields an empty range and therefore contributes no windows.
    for i in range(len(user_features) - sequence_length + 1):
        X_sequences.append(user_features[i:i + sequence_length])
        y_sequences.append(user_labels[i + sequence_length - 1])  # Target is last ID in the sequence

# Fail here with a clear message instead of letting an empty array surface as
# an obscure error in a later step.
if not X_sequences:
    raise ValueError(
        f"No user has at least {sequence_length} rows; cannot build any sequences."
    )

# Every window already has exactly `sequence_length` rows, so no padding is
# required: stack the windows directly into one float32 array.
X_sequences = np.asarray(X_sequences, dtype='float32')
y_sequences = np.array(y_sequences)

print("Shape of X_sequences:", X_sequences.shape)
print("Shape of y_sequences:", y_sequences.shape)

# Train-Test Split

In [None]:
# Hold out 15% of the windows as a test set (fixed seed for repeatability).
# NOTE(review): windows are shuffled individually, and consecutive windows of
# one user overlap heavily, so near-identical windows can end up on both sides
# of the split - confirm the test score is meant to be read that way.
X_train, X_test, y_train, y_test = train_test_split(
    X_sequences,
    y_sequences,
    test_size=0.15,
    random_state=42,
)

# Report how many windows landed in each partition
print("Number of rows in X_train:", len(X_train))
print("Number of rows in X_test:", len(X_test))
print("Number of rows in y_train:", len(y_train))
print("Number of rows in y_test:", len(y_test))

# Per-sample shape (axes 1 and 2 of X_train) that the model will receive
print('Input shape:', X_train.shape[1:3])


# LSTM Model Definition

In [None]:
# Assemble the classifier layer by layer:
# LSTM encoder -> dropout -> dense hidden layer -> softmax output.
model = tf.keras.Sequential()
model.add(
    tf.keras.layers.LSTM(
        64,
        activation='tanh',
        input_shape=(X_train.shape[1], X_train.shape[2]),
    )
)
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(32, activation='tanh'))
# One output unit per entry in unique_servers
model.add(tf.keras.layers.Dense(len(unique_servers), activation='softmax'))

# Sparse categorical cross-entropy with Adam; accuracy is tracked as a metric
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Stop once val_loss has not improved for 10 epochs and roll back to the best weights
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# Training

In [None]:
# Fit for at most 100 epochs in batches of 32. A 0.176 fraction of the training
# arrays is held out for validation, and the early-stopping callback may end
# training before the epoch limit is reached.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=100,
    validation_split=0.176,
    callbacks=[early_stopping],
    verbose=1,
)

# Evaluate

In [None]:
# Score the trained model on the held-out test arrays without progress output
loss, accuracy = model.evaluate(
    X_test,
    y_test,
    verbose=0,
)
print("Model accuracy on test set:", accuracy)

# Predict and decode

In [None]:
# Softmax outputs for the test windows, reduced to the highest-scoring class index per window
y_pred = model.predict(X_test)
y_pred_labels = y_pred.argmax(axis=1)

In [None]:
# Run the model over every sequence (not only the test split) and keep the
# highest-scoring class index for each one.
y_all_pred = model.predict(X_sequences)
y_all_pred_labels = np.argmax(y_all_pred, axis=1)


# NOTE(review): `serverid_to_isp`, `isp_to_num` and `isp_labels_num` are not
# defined in the visible part of this file; they must already exist when this
# cell runs.
def _isp_codes(server_ids):
    """Translate ServerIDs to numeric ISP labels via the ISP name ('Unknown ISP' if unmapped)."""
    return [isp_to_num[serverid_to_isp.get(sid, 'Unknown ISP')] for sid in server_ids]


# True and predicted ServerIDs expressed as numeric ISP labels
y_all_isp_num = _isp_codes(y_sequences)
y_all_pred_isp_num = _isp_codes(y_all_pred_labels)

# Confusion matrix at ISP level, with rows/columns ordered by isp_labels_num
conf_matrix_all = confusion_matrix(
    y_all_isp_num,
    y_all_pred_isp_num,
    labels=isp_labels_num,
)

# Share of sequences whose predicted ISP equals the true ISP
isp_matches = np.array(y_all_isp_num) == np.array(y_all_pred_isp_num)
accuracy_all = np.sum(isp_matches) / len(y_all_isp_num)
print("Overall Accuracy on All Data:", accuracy_all)

# Draw the matrix on a compact figure, without a colorbar.
# NOTE(review): tick labels are the sorted ISP names, while the matrix is
# ordered by isp_labels_num - this presumably assumes both orders agree; confirm.
plt.figure(figsize=(8, 6))
disp_all = ConfusionMatrixDisplay(
    confusion_matrix=conf_matrix_all,
    display_labels=sorted(isp_to_num.keys()),
)
disp_all.plot(cmap=plt.cm.Blues, ax=plt.gca(), colorbar=False)

# Axis titles and tick label sizes
plt.xlabel("Assigned Server", fontsize=12)
plt.ylabel("Best Server", fontsize=12)
plt.xticks(fontsize=10, rotation=45)
plt.yticks(fontsize=10)

# Use the same font size for the counts printed inside the cells
for cell_text in disp_all.text_.ravel():
    cell_text.set_fontsize(10)

plt.tight_layout()  # Keep labels inside the figure bounds
plt.show()


In [None]:
import pandas as pd
import numpy as np
from haversine import haversine, Unit  # Ensure this is installed
import matplotlib.pyplot as plt

# Save predictions and true labels
output_file = r'C:\Users\LSTMPredictions.csv'

# Lookup table with one row per ServerID: the first row of `data` carrying that
# ID. Building it once replaces a full boolean scan of `data` for every
# sequence and every coordinate column with a single label-based lookup.
coordinate_columns = [
    'SourceLatitude', 'SourceLongitude',
    'DestinationLatitude', 'DestinationLongitude',
]
server_first_rows = (
    data.drop_duplicates(subset='ServerID')  # keeps the first occurrence of each ID
        .set_index('ServerID')[coordinate_columns]
)

# Coordinates per sequence, for the true and for the predicted server
# (.loc raises KeyError if an ID is not present in `data`).
true_rows = server_first_rows.loc[y_sequences]
pred_rows = server_first_rows.loc[y_all_pred_labels]

# Create a DataFrame to save true labels, predicted labels, and related info.
# NOTE(review): the coordinate columns of `data` were overwritten with
# StandardScaler output earlier in the file, so the values written here are
# standardised, not degrees - confirm before using them for distances.
# NOTE(review): the destination columns come from the first row of the *true
# server*, not from the row that ends each sequence - confirm this is intended.
results_df = pd.DataFrame({
    'True ServerID': y_sequences,
    'Predicted ServerID': y_all_pred_labels,
    'True ISP': [serverid_to_isp.get(server_id, 'Unknown ISP') for server_id in y_sequences],
    'Predicted ISP': [serverid_to_isp.get(server_id, 'Unknown ISP') for server_id in y_all_pred_labels],
    'True Latitude': true_rows['SourceLatitude'].to_numpy(),
    'True Longitude': true_rows['SourceLongitude'].to_numpy(),
    'Predicted Latitude': pred_rows['SourceLatitude'].to_numpy(),
    'Predicted Longitude': pred_rows['SourceLongitude'].to_numpy(),
    'Destination Latitude': true_rows['DestinationLatitude'].to_numpy(),
    'Destination Longitude': true_rows['DestinationLongitude'].to_numpy(),
})

# Save to CSV
results_df.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")
