In [None]:
# Import toolkits
import pandas as pd
import numpy as np
from sklearn . manifold import TSNE
import os
from sklearn . cluster import DBSCAN
import matplotlib . pyplot as plt
from sklearn . cluster import KMeans
from sklearn . metrics import silhouette_samples , silhouette_score
from sklearn . preprocessing import MinMaxScaler
import csv
from sklearn . neighbors import NearestNeighbors

In [None]:
# Load the transformed trajectory table from CSV.
input_file = "path/to/file" # Input file directory
df = pd.read_csv(input_file)

# Build one flat feature vector per flight: the (lat_rad, lon_rad) pairs of all
# rows sharing a flight_id, interleaved as [lat0, lon0, lat1, lon1, ...].
flight_feature_vectors = {
    fid: rows[["lat_rad", "lon_rad"]].to_numpy().flatten()
    for fid, rows in df.groupby("flight_id")
}

# Stack the per-flight vectors row-wise, in the dictionary's insertion order.
# NOTE(review): this only yields a 2-D matrix when every flight has the same
# number of rows — confirm that the upstream resampling guarantees this.
all_feature_vectors = np.array(list(flight_feature_vectors.values()))
print(all_feature_vectors)

In [None]:
# Embed every flight's trajectory vector into two dimensions with t-SNE
# (random_state is fixed for repeatability).
tsne = TSNE(n_components=2, random_state=42)
reduced_feature_vectors = tsne.fit_transform(all_feature_vectors)

# Rescale both embedding axes with a MinMaxScaler fitted on the embedding itself.
scaler = MinMaxScaler().fit(reduced_feature_vectors)
normalized_feature_vectors = scaler.transform(reduced_feature_vectors)

# Print the normalized 2-D coordinates of each flight. The two values are the
# scaled t-SNE components, paired with flights in dictionary order.
for flight_id, (comp_x, comp_y) in zip(flight_feature_vectors, normalized_feature_vectors):
    print(f"{flight_id}_feature_vector = [{comp_x}, {comp_y}]")

# Elbow method: fit K-Means once per candidate K on the normalized embedding
# and record the within-cluster sum of squares (WSS), which scikit-learn
# exposes as `inertia_`.
# range() excludes its upper bound, so 11 is required to include K = 10.
K_values = range(1, 11)  # Test K from 1 to 10
wss_values = []
for K in K_values:
    kmeans = KMeans(n_clusters=K, random_state=42)
    kmeans.fit(normalized_feature_vectors)
    wss_values.append(kmeans.inertia_)

# Plot the WSS values against different values of K; the "elbow" where the
# curve flattens suggests a reasonable number of clusters.
plt.figure(figsize=(10, 6))
plt.plot(K_values, wss_values, marker='o')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Within - Cluster Sum of Squares (WSS )')
plt.title('Elbow Method for Optimal K in K- Means')
plt.grid(True)
plt.show()

In [None]:
# Silhouette analysis: for every candidate K, cluster the normalized embedding
# and record the mean silhouette score of the resulting partition.
K_values = range(2, 11) # Test K from 2 to 10
silhouette_scores = []
for K in K_values:
    kmeans = KMeans(n_clusters=K, random_state=42)
    cluster_labels = kmeans.fit_predict(normalized_feature_vectors)
    silhouette_avg = silhouette_score(normalized_feature_vectors, cluster_labels)
    silhouette_scores += [silhouette_avg]
    print(f" Silhouette Score for K={K}: {silhouette_avg}")

# Draw the score curve over the tested K values.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(K_values, silhouette_scores, marker='o')
ax.set_xlabel('Number of Clusters (K)')
ax.set_ylabel('Silhouette Score')
ax.set_title('Silhouette Method for Optimal K in K- Means ')
ax.grid(True)
plt.show()

In [None]:
# Scatter the normalized 2-D embedding, one marker per flight. Each flight is
# drawn with its own scatter call and labelled with its flight_id.
fig, ax = plt.subplots(figsize=(10, 6))
for flight_id, (x_val, y_val) in zip(flight_feature_vectors, normalized_feature_vectors):
    ax.scatter(x_val, y_val, label=flight_id)

# Axis labels keep the notebook's naming; the plotted values are the scaled
# t-SNE components of each flight.
ax.set_xlabel("Reduced & Normalized Lat_rad")
ax.set_ylabel("Reduced & Normalized Lon_rad")
ax.set_title("t-SNE Visualization of Flight Trajectories")
ax.grid(True)
plt.show()

# Cluster Values
# K is the number of clusters used by the final K-Means fit and `iterations`
# is passed to that fit as max_iter. Both are meant to be tuned by hand after
# inspecting the elbow and silhouette plots.
K = 6 #Adjust and change according to WSS and Silhouette scores
iterations = 500 #Adjust and tune accordingly

In [None]:
# Apply K-means clustering with the chosen K and iteration cap.
# NOTE(review): the elbow and silhouette sweeps were run on
# normalized_feature_vectors, whereas this fit uses the un-normalized t-SNE
# embedding (reduced_feature_vectors) — confirm that this is intended.
kmeans = KMeans(n_clusters=K, max_iter=iterations, random_state=42)
cluster_labels = kmeans.fit_predict(reduced_feature_vectors)

# Calculate silhouette scores for each data point
silhouette_scores = silhouette_samples(reduced_feature_vectors, cluster_labels)

# Calculate the average silhouette score for the entire clustering
avg_silhouette_score = silhouette_score(reduced_feature_vectors, cluster_labels)

# Print the average silhouette score
print(f" Average Silhouette Score for K={K}: {avg_silhouette_score}")

# Define colors for each cluster; the list is indexed by cluster label.
# The hand-picked palette is kept when it has enough entries. K is tunable and
# the silhouette sweep evaluates K up to 10, so for a larger K the colors are
# drawn from a qualitative colormap instead; otherwise colors[label] would
# raise IndexError for labels >= 7.
palette = ['red', 'green', 'navy', 'brown', 'orange', 'purple', 'black']
if K <= len(palette):
    colors = palette
else:
    cmap = plt.get_cmap('tab20')
    # Wrap around the colormap if K exceeds the number of distinct entries.
    colors = [cmap(idx % cmap.N) for idx in range(K)]

# Get unique cluster labels
unique_labels = np.unique(cluster_labels)

In [None]:
# Plot the clustered flights in normalized embedding space. One scatter call
# is issued per cluster (selected with a boolean mask), so each cluster
# contributes exactly one legend entry and the legend stays readable.
fig, ax = plt.subplots(figsize=(10, 6))
for label in unique_labels:
    members = cluster_labels == label
    ax.scatter(
        normalized_feature_vectors[members, 0],
        normalized_feature_vectors[members, 1],
        color=colors[label],
        label=f" Cluster { label }",
    )
ax.set_xlabel("Lat_rad")
ax.set_ylabel("Lon_rad")
ax.set_title("Normalized Flight Data Clustering Result")
ax.legend()
ax.grid(True)

# Capture the axis limits before plt.show(): once the figure has been shown,
# later pyplot calls operate on a fresh figure and can no longer read them.
cluster_xlim = ax.get_xlim()
cluster_ylim = ax.get_ylim()
plt.show()

# Plot the centroids of the clusters. The model was fitted on the
# un-normalized embedding, so the centroids are passed through the same scaler
# to place them in the coordinate system of the plot above.
centroids = kmeans.cluster_centers_
normalized_centroids = scaler.transform(centroids)
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(normalized_centroids[:, 0], normalized_centroids[:, 1], marker='x', color='red', s=200, label='Centroids')
ax.set_xlabel("Reduced Lat_rad")
ax.set_ylabel("Reduced Lon_rad")
ax.set_title(f"K- means Clustering Centroid with K={K}")
ax.legend()
ax.grid(True)

# Set the same limits as the previous plot so both figures are directly comparable
ax.set_xlim(cluster_xlim)
ax.set_ylim(cluster_ylim)
plt.show()

# Get the cluster labels and data points
# cluster_labels is re-read from the fitted model. data_points is another name
# for the un-normalized t-SNE embedding (plain assignment, no copy), i.e. the
# same array the final K-Means model was fitted on.
cluster_labels = kmeans.labels_
data_points = reduced_feature_vectors

In [None]:
# Calculate the variance of each cluster
def _mean_squared_spread(points):
    """Return the mean squared Euclidean distance of `points` from their mean."""
    centre = np.mean(points, axis=0)
    return np.mean(np.sum((points - centre) ** 2, axis=1))


# One value per entry of unique_labels, in the same order.
cluster_variances = [
    _mean_squared_spread(data_points[cluster_labels == label])
    for label in unique_labels
]

In [None]:
# Bar chart of the per-cluster variance, one bar per cluster label.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(unique_labels, cluster_variances)
ax.set_xlabel("Cluster Label")
ax.set_ylabel("Variance")
ax.set_title(f"Variance of each cluster with K={K}")
ax.grid(True)
plt.show()

# Prepare one (initially empty) list of flight IDs per cluster.
cluster_info = {f" Cluster_ {i}": [] for i in range(K)}

# Flights and labels share the same ordering, so walk them side by side and
# file every flight ID under its cluster's key.
for flight_id, assigned_label in zip(flight_feature_vectors, cluster_labels):
    cluster_info[f" Cluster_ {assigned_label}"].append(flight_id)

# A DataFrame needs columns of equal length: pad the shorter lists in place
# with empty strings up to the size of the largest cluster.
max_length = max(map(len, cluster_info.values()))
for ids in cluster_info.values():
    ids += [''] * (max_length - len(ids))

# One column per cluster, one flight ID (or padding) per cell.
cluster_df = pd.DataFrame(cluster_info)

# Save the cluster information to a CSV file.
# The original line had no right-hand side (a SyntaxError that stopped the
# whole cell from parsing); a placeholder path is used here, mirroring
# input_file. Replace it with the real destination before running.
output_file = "path/to/output.csv"  # Output CSV file path
cluster_df.to_csv(output_file, index=False)