In [None]:
pip install exifread pandas numpy scikit-learn folium

: 

In [None]:
import os
import exifread
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import folium

# ---------------------
# Helper: Convert GPS to decimal
# ---------------------
def _convert_to_degrees(value):
    d = float(value[0].num) / float(value[0].den)
    m = float(value[1].num) / float(value[1].den)
    s = float(value[2].num) / float(value[2].den)
    return d + (m / 60.0) + (s / 3600.0)

# ---------------------
# Extract GPS from image
# ---------------------
def get_lat_lon(img_path):
    """Read the GPS position stored in an image's EXIF metadata.

    Args:
        img_path: Path of the image file to inspect.

    Returns:
        tuple: ``(latitude, longitude)`` in signed decimal degrees, or
        ``(None, None)`` when any of the four GPS tags (latitude, longitude
        and their hemisphere references) is missing or the coordinate values
        are malformed. A latitude whose reference is not ``N`` and a
        longitude whose reference is not ``E`` are returned as negative.
    """
    with open(img_path, 'rb') as f:
        # NOTE(review): exifread appears to compare stop_tag against the bare
        # tag name (e.g. 'GPSLongitude'), so this prefixed value may never
        # match and the whole file is parsed — confirm before relying on it.
        tags = exifread.process_file(f, stop_tag='GPS GPSLongitude')

    # All four tags are required: without the hemisphere reference the sign of
    # the coordinate is unknown, so the position cannot be trusted.
    required = (
        'GPS GPSLatitude', 'GPS GPSLatitudeRef',
        'GPS GPSLongitude', 'GPS GPSLongitudeRef',
    )
    if any(key not in tags for key in required):
        return None, None

    try:
        lat = _convert_to_degrees(tags['GPS GPSLatitude'].values)
        lon = _convert_to_degrees(tags['GPS GPSLongitude'].values)
    except (ZeroDivisionError, IndexError, AttributeError):
        # Malformed coordinate (zero denominator, fewer than three
        # components, or non-rational values): report "no position" instead
        # of aborting the whole scan on one bad file.
        return None, None

    # Normalise case and surrounding whitespace before checking direction
    # (N/S, E/W) so that e.g. 'n' or 'N ' is still recognised as north.
    lat_ref = tags['GPS GPSLatitudeRef'].printable.strip().upper()
    lon_ref = tags['GPS GPSLongitudeRef'].printable.strip().upper()

    if lat_ref != 'N':
        lat = -lat
    if lon_ref != 'E':
        lon = -lon
    return lat, lon

# ---------------------
# Process all images
# ---------------------
image_folder = 'archive'
data = []

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the resulting DataFrame reproducible from run to run.
for filename in sorted(os.listdir(image_folder)):
    if not filename.lower().endswith(('.jpg', '.jpeg')):
        continue
    path = os.path.join(image_folder, filename)
    lat, lon = get_lat_lon(path)
    # Compare against None rather than truthiness: 0.0 is a valid latitude
    # (equator) and longitude (prime meridian) and must not be discarded.
    if lat is not None and lon is not None:
        data.append({'filename': filename, 'Latitude': lat, 'Longitude': lon})

# Convert to DataFrame
df = pd.DataFrame(data)

if df.empty:
    print("No valid geotagged images found.")
    # exit() is a helper meant for interactive shells; inside a Jupyter kernel
    # it shuts the kernel down. Raising SystemExit stops execution here
    # without that side effect and behaves the same in a plain script.
    raise SystemExit

# ---------------------
# DBSCAN Clustering
# ---------------------
# Pull the two coordinate columns out as a plain array and standardise them.
coords = df[['Latitude', 'Longitude']].values
scaler = StandardScaler().fit(coords)
coords_scaled = scaler.transform(coords)

# Fit DBSCAN on the standardised coordinates and attach one label per row
# (-1 is the label DBSCAN gives to noise points).
db = DBSCAN(eps=0.3, min_samples=3)
db.fit(coords_scaled)
df['Cluster'] = db.labels_

# ---------------------
# Show a preview and persist the labelled table
# ---------------------
print(df.head())
df.to_csv(path_or_buf='clustered_locations.csv', index=False)

# ---------------------
# Visualize with Folium
# ---------------------
# Centre the map on the mean position of all images.
map_center = [df['Latitude'].mean(), df['Longitude'].mean()]
m = folium.Map(location=map_center, zoom_start=13)

# Noise points (label -1) are drawn in red; every other label cycles through
# this list of colours.
colors = ['blue', 'green', 'purple', 'orange', 'darkred', 'cadetblue']
marker_rows = zip(df['filename'], df['Latitude'], df['Longitude'], df['Cluster'])
for img_name, img_lat, img_lon, label in marker_rows:
    if label == -1:
        cluster_color = 'red'
    else:
        cluster_color = colors[label % len(colors)]
    marker = folium.CircleMarker(
        location=[img_lat, img_lon],
        radius=5,
        color=cluster_color,
        fill=True,
        fill_opacity=0.6,
        tooltip=f"Image: {img_name} | Cluster: {label}"
    )
    marker.add_to(m)

m.save('geotagged_clusters_map.html')
print("Map saved as 'geotagged_clusters_map.html'")

In [None]:
# List every image together with its coordinates and assigned cluster label
print("\n📌 Clustering Results:")
result_rows = zip(df['filename'], df['Latitude'], df['Longitude'], df['Cluster'])
for img_name, img_lat, img_lon, label in result_rows:
    print(f"Image: {img_name}, Latitude: {img_lat:.6f}, Longitude: {img_lon:.6f}, Cluster: {label}")
    

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# 📊 Matplotlib-based scatter plot
# One colour per distinct cluster label present in the table.
palette = sns.color_palette('tab10', len(set(df['Cluster'])))

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='Longitude',
    y='Latitude',
    hue='Cluster',
    palette=palette,
    s=80,
    edgecolor='k',
    ax=ax,
)
ax.set_title('DBSCAN Clustering of Geotagged Images')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.legend(title='Cluster')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Reload the labelled table from the CSV file
df = pd.read_csv("clustered_locations.csv")

# Number of images per cluster label, ordered by label
cluster_counts = df['Cluster'].value_counts().sort_index()

# Two-column frame for plotting; the labels are cast to strings
cluster_labels = cluster_counts.index.astype(str)
plot_df = pd.DataFrame({'Cluster': cluster_labels, 'Count': cluster_counts.values})

# Bar chart; x is also passed as hue (with the legend disabled) to avoid the
# seaborn palette warning
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(
    data=plot_df,
    x='Cluster',
    y='Count',
    hue='Cluster',
    palette='muted',
    legend=False,
    ax=ax,
)
ax.set_title('Number of Images per Cluster')
ax.set_xlabel('Cluster Label')
ax.set_ylabel('Number of Images')
fig.tight_layout()
plt.show()


In [None]:
pip install ydata-profiling

In [None]:
from ydata_profiling import ProfileReport

# Build the profiling report for the current table and write it out as HTML
profile = ProfileReport(
    df,
    title="EDA Report - Clustered Image Data",
    explorative=True,
)
profile.to_file("eda_report.html")

In [None]:
# Share of images falling into each cluster, as a pie chart
fig, ax = plt.subplots(figsize=(7, 7))
ax.pie(
    cluster_counts,
    labels=cluster_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=sns.color_palette('pastel'),
)
ax.set_title('Distribution of Images by Cluster')
ax.axis('equal')  # Equal scaling on both axes keeps the pie circular
plt.show()


In [None]:
# Image positions, coloured by their cluster label
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df,
    x='Longitude',
    y='Latitude',
    hue='Cluster',
    palette='tab10',
    ax=ax,
)
ax.set_title('Image Locations Colored by Cluster')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.legend(title='Cluster')
plt.show()

In [None]:
# Spread of latitude values within each cluster
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=df, x='Cluster', y='Latitude', ax=ax)
ax.set_title('Latitude Distribution per Cluster')
plt.show()

In [None]:
# Pairwise correlation between the coordinates and the cluster label
heatmap_columns = ['Latitude', 'Longitude', 'Cluster']
corr = df[heatmap_columns].corr()

fig, ax = plt.subplots()
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()


In [None]:
from mpl_toolkits.mplot3d import Axes3D

# 3D view: longitude and latitude on the horizontal axes, cluster label on z
fig, ax = plt.subplots(figsize=(20, 8), subplot_kw={'projection': '3d'})
sc = ax.scatter(
    df['Longitude'],
    df['Latitude'],
    df['Cluster'],
    c=df['Cluster'],
    cmap='tab10',
)
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_zlabel('Cluster')
ax.set_title('3D Scatter Plot of Locations and Clusters')
fig.colorbar(sc, ax=ax)
plt.show()
