## Data Mining project: Discover and describe areas of interest and events from geo-located data

In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans, kmeans_plusplus



In [35]:
data_path="flickr_data2.csv"
df=pd.read_csv(data_path)
df.head(n=5)

  df=pd.read_csv(data_path)


Unnamed: 0,id,user,lat,long,tags,title,date_taken_minute,date_taken_hour,date_taken_day,date_taken_month,date_taken_year,date_upload_minute,date_upload_hour,date_upload_day,date_upload_month,date_upload_year,Unnamed: 16,Unnamed: 17,Unnamed: 18
0,4395181099,30624617@N03,45.754858,4.82171,"chair,lyon,rhône,chaise,rhônealpes",Chaises avec vue,11.0,15,28,2,2010,23,20,28.0,2,2010.0,,,
1,4394748717,35853470@N00,45.75327,4.862953,,,51.0,17,28,2,2010,52,17,28.0,2,2010.0,,,
2,4394694699,11817998@N05,45.760655,4.846564,"365,iphone",59/365 - R46 V103 B163,29.0,17,28,2,2010,33,17,28.0,2,2010.0,,,
3,4394803790,11545749@N06,45.784,4.874072,"nin,nineinchnails,gift,screening,toiou,avott",2010-01-29 Toiou Avott Lyon,15.0,20,28,1,2010,38,12,28.0,2,2010.0,,,
4,4394803554,11545749@N06,45.784,4.874072,"lyon,nin,nineinchnails,gift,screening,toiou,avott",2010-01-28 Toiou Avott Lyon,10.0,20,28,1,2010,38,12,28.0,2,2010.0,,,


## Discover the data

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420240 entries, 0 to 420239
Data columns (total 19 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   420240 non-null  int64  
 1    user                420240 non-null  object 
 2    lat                 420240 non-null  float64
 3    long                420240 non-null  float64
 4    tags                316730 non-null  object 
 5    title               381911 non-null  object 
 6    date_taken_minute   420239 non-null  float64
 7    date_taken_hour     420240 non-null  int64  
 8    date_taken_day      420240 non-null  int64  
 9    date_taken_month    420240 non-null  int64  
 10   date_taken_year     420240 non-null  int64  
 11   date_upload_minute  420228 non-null  object 
 12   date_upload_hour    420238 non-null  object 
 13   date_upload_day     420238 non-null  float64
 14   date_upload_month   420240 non-null  int64  
 15   date_upload_year

We notice that the Unnamed columns are all almost empty ,but are filled when data is incoherent . Finally , we decided to just eraze the rows where Unnamed columns are defined , and then we dropped the three columns .

In [37]:
df = df.drop_duplicates(subset=['id'], keep='first')
df = df[df[['Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18']].isnull().all(axis=1)]


df.drop(columns=['Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18'],inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 168050 entries, 0 to 419137
Data columns (total 16 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   168050 non-null  int64  
 1    user                168050 non-null  object 
 2    lat                 168050 non-null  float64
 3    long                168050 non-null  float64
 4    tags                126069 non-null  object 
 5    title               152273 non-null  object 
 6    date_taken_minute   168050 non-null  float64
 7    date_taken_hour     168050 non-null  int64  
 8    date_taken_day      168050 non-null  int64  
 9    date_taken_month    168050 non-null  int64  
 10   date_taken_year     168050 non-null  int64  
 11   date_upload_minute  168050 non-null  object 
 12   date_upload_hour    168050 non-null  object 
 13   date_upload_day     168050 non-null  float64
 14   date_upload_month   168050 non-null  int64  
 15   date_upload_year    1

We also thought about working on data points located in the Lyon region in a defined radius . We define this zone by       

lat_min, lat_max = 45.65, 45.85


lon_min, lon_max = 4.75, 4.95

In [38]:


# Define the bounding box coordinates for the Lyon region
lat_min, lat_max = 45.65, 45.85
lon_min, lon_max = 4.75, 4.95

# Calculate the center of the bounding box
center_lat = (lat_min + lat_max) / 2
center_lon = (lon_min + lon_max) / 2

# Create a map centered on the Lyon region
lyon_map = folium.Map(location=[center_lat, center_lon], zoom_start=12)

# Define the coordinates for the bounding box
bounding_box_coords = [
    [lat_min, lon_min],  # Bottom-left
    [lat_min, lon_max],  # Bottom-right
    [lat_max, lon_max],  # Top-right
    [lat_max, lon_min],  # Top-left
    [lat_min, lon_min]   # Close the polygon
]

# Add the bounding box to the map
folium.PolyLine(bounding_box_coords, color='blue', weight=2).add_to(lyon_map)

# Display the map
lyon_map.save("lyon_region_map.html")
lyon_map  # This will render the map in a Jupyter Notebook environment if supported



In [39]:


# Filter the DataFrame based on latitude and longitude ranges
filtered_df = df[(df[' lat'] >= lat_min) & (df[' lat'] <= lat_max) &
                 (df[' long'] >= lon_min) & (df[' long'] <= lon_max)]

# Display the filtered DataFrame
filtered_df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 161654 entries, 0 to 419137
Data columns (total 16 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   161654 non-null  int64  
 1    user                161654 non-null  object 
 2    lat                 161654 non-null  float64
 3    long                161654 non-null  float64
 4    tags                121622 non-null  object 
 5    title               146375 non-null  object 
 6    date_taken_minute   161654 non-null  float64
 7    date_taken_hour     161654 non-null  int64  
 8    date_taken_day      161654 non-null  int64  
 9    date_taken_month    161654 non-null  int64  
 10   date_taken_year     161654 non-null  int64  
 11   date_upload_minute  161654 non-null  object 
 12   date_upload_hour    161654 non-null  object 
 13   date_upload_day     161654 non-null  float64
 14   date_upload_month   161654 non-null  int64  
 15   date_upload_year    1

In [45]:
sampled_df=filtered_df.sample(n=50000,random_state=42)
def lat_long_to_cartesian(lat, long):
    lat_rad = np.radians(lat)
    long_rad = np.radians(long)
    x = np.cos(lat_rad) * np.cos(long_rad)
    y = np.cos(lat_rad) * np.sin(long_rad)
    z = np.sin(lat_rad)
    return np.array([x, y, z])


coords = sampled_df[[' lat', ' long']].to_numpy()
cartesian_coords = np.array([lat_long_to_cartesian(lat, long) for lat, long in coords])
x_squared_norms = np.sum(cartesian_coords ** 2, axis=1)

In [46]:

# num_clusters = 3  # Adjust this value as needed
# init_centers, _ = kmeans_plusplus(cartesian_coords, n_clusters=num_clusters, random_state=42, x_squared_norms=x_squared_norms)

# # Perform KMeans clustering with initialized centers
# kmeans = KMeans(n_clusters=num_clusters, init=init_centers, n_init=1, random_state=42)
# filtered_df['cluster'] = kmeans.fit_predict(cartesian_coords)

In [47]:
# Define the range of cluster numbers to test
cluster_range = range(20, 40)
inertia_values = []
silhouette_scores = []

# Compute inertia and silhouette scores for each number of clusters
for k in cluster_range:
    # Use kmeans_plusplus to initialize centroids
    init_centers, _ = kmeans_plusplus(cartesian_coords, n_clusters=k, random_state=42, x_squared_norms=x_squared_norms)
    print("here")
    # Fit KMeans with initialized centroids
    kmeans = KMeans(n_clusters=k, init=init_centers, n_init=5, random_state=42,verbose=1)
    kmeans.fit(cartesian_coords)
    
    # Store inertia value
    inertia_values.append(kmeans.inertia_)
    
    # Compute silhouette score for k > 1
    if k > 1:
        silhouette_scores.append(silhouette_score(cartesian_coords, kmeans.labels_))
    else:
        silhouette_scores.append(None)  # Silhouette score is undefined for k=1
    
    print(f'Processed {k} clusters')

# Plot the elbow curve (inertia values)
plt.figure(figsize=(10, 6))
plt.plot(cluster_range, inertia_values, marker='o', label='Inertia (Elbow Method)')
plt.title('Elbow Curve for Optimal Clusters', fontsize=16)
plt.xlabel('Number of Clusters (k)', fontsize=14)
plt.ylabel('Inertia', fontsize=14)
plt.xticks(cluster_range)
plt.legend()
plt.grid(True)
plt.show()

# Plot the silhouette scores
plt.figure(figsize=(10, 6))
plt.plot(cluster_range[1:], silhouette_scores[1:], marker='o', label='Silhouette Score')
plt.title('Silhouette Scores for Optimal Clusters', fontsize=16)
plt.xlabel('Number of Clusters (k)', fontsize=14)
plt.ylabel('Silhouette Score', fontsize=14)
plt.xticks(cluster_range[1:])
plt.legend()
plt.grid(True)
plt.show()


here
Initialization complete
Iteration 0, inertia 0.0007480710093542384.
Iteration 1, inertia 0.0005961693729844035.
Iteration 2, inertia 0.0005779977907309793.
Iteration 3, inertia 0.0005733438905291333.
Iteration 4, inertia 0.0005711845832463303.
Iteration 5, inertia 0.0005701234700092443.
Iteration 6, inertia 0.000569931603757094.
Converged at iteration 6: center shift 5.3841971823386e-12 within tolerance 1.0708077377648978e-11.


  super()._check_params_vs_input(X, default_n_init=10)


Processed 20 clusters
here
Initialization complete
Iteration 0, inertia 0.000683634605284803.
Iteration 1, inertia 0.0005567150589537079.
Iteration 2, inertia 0.0005360086287817089.
Iteration 3, inertia 0.0005263267724195473.
Iteration 4, inertia 0.0005203776887720079.
Iteration 5, inertia 0.0005164573871566026.
Iteration 6, inertia 0.0005138705171595548.
Iteration 7, inertia 0.0005120100547241005.
Iteration 8, inertia 0.0005116148099953188.
Iteration 9, inertia 0.0005112358690265357.
Iteration 10, inertia 0.0005108752188700459.
Iteration 11, inertia 0.0005106389346137676.
Iteration 12, inertia 0.0005103660607237105.
Iteration 13, inertia 0.0005101910517998082.
Iteration 14, inertia 0.0005100535972343095.
Iteration 15, inertia 0.0005099371030569905.
Converged at iteration 15: center shift 9.196279030854037e-12 within tolerance 1.0708077377648978e-11.


  super()._check_params_vs_input(X, default_n_init=10)


Processed 21 clusters
here
Initialization complete
Iteration 0, inertia 0.000668179627951738.
Iteration 1, inertia 0.0005389023984702626.
Iteration 2, inertia 0.0005142096787994787.
Iteration 3, inertia 0.000502198795440498.
Iteration 4, inertia 0.0004956563282729417.
Iteration 5, inertia 0.0004918530269110384.
Iteration 6, inertia 0.0004894389384126512.
Iteration 7, inertia 0.0004873561332562048.
Iteration 8, inertia 0.0004869534241404133.
Iteration 9, inertia 0.0004868037772690742.
Iteration 10, inertia 0.0004866432665542023.
Iteration 11, inertia 0.0004864587980167842.
Iteration 12, inertia 0.0004862460396173887.
Iteration 13, inertia 0.0004860074648608536.
Iteration 14, inertia 0.00048591443382338596.
Iteration 15, inertia 0.0004858677363487462.
Converged at iteration 15: center shift 7.592198010145135e-12 within tolerance 1.0708077377648978e-11.


  super()._check_params_vs_input(X, default_n_init=10)


Processed 22 clusters
here
Initialization complete
Iteration 0, inertia 0.0006116453327467699.
Iteration 1, inertia 0.0005050932818853756.
Iteration 2, inertia 0.0004887686635209956.
Iteration 3, inertia 0.0004832327591767578.
Iteration 4, inertia 0.0004812965916255245.
Iteration 5, inertia 0.0004762107400717015.
Iteration 6, inertia 0.00047307372665037807.
Iteration 7, inertia 0.0004717885051268819.
Iteration 8, inertia 0.0004703568663711954.
Iteration 9, inertia 0.0004692485215357079.
Iteration 10, inertia 0.00046886029160512653.
Iteration 11, inertia 0.000468686708832187.
Iteration 12, inertia 0.00046856785504773187.
Iteration 13, inertia 0.0004684384524753364.
Iteration 14, inertia 0.0004682735775940698.
Iteration 15, inertia 0.0004681660556474885.
Iteration 16, inertia 0.0004680957170782784.
Converged at iteration 16: center shift 6.607951171055542e-12 within tolerance 1.0708077377648978e-11.


  super()._check_params_vs_input(X, default_n_init=10)


In [None]:
# # Define the range of cluster numbers to test
# cluster_range = range(10, 30)
# inertia_values = []
# silhouette_scores = []

# # Use latitude and longitude directly

# coords = filtered_df.head(161654)[[' lat', ' long']].to_numpy()

# # Compute inertia and silhouette scores for each number of clusters
# for k in cluster_range:
#     # Use kmeans_plusplus to initialize centroids

#     # init_centers, _ = kmeans_plusplus(coords, n_clusters=k, random_state=42)
#     # print("Initializing centroids for", k, "clusters...")
    
#     # Fit KMeans with initialized centroids
#     kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42, verbose=1)
#     kmeans.fit(coords)
    
#     # Store inertia value
#     inertia_values.append(kmeans.inertia_)
    
#     # Compute silhouette score for k > 1
#     if k > 1:
#         silhouette_scores.append(silhouette_score(coords, kmeans.labels_))
#     else:
#         silhouette_scores.append(None)  # Silhouette score is undefined for k=1
    
#     print(f'Processed {k} clusters')

# # Plot the elbow curve (inertia values)
# plt.figure(figsize=(10, 6))
# plt.plot(cluster_range, inertia_values, marker='o', label='Inertia (Elbow Method)')
# plt.title('Elbow Curve for Optimal Clusters', fontsize=16)
# plt.xlabel('Number of Clusters (k)', fontsize=14)
# plt.ylabel('Inertia', fontsize=14)
# plt.xticks(cluster_range)
# plt.legend()
# plt.grid(True)
# plt.show()

# # Plot the silhouette scores
# plt.figure(figsize=(10, 6))
# plt.plot(cluster_range[1:], silhouette_scores[1:], marker='o', label='Silhouette Score')
# plt.title('Silhouette Scores for Optimal Clusters', fontsize=16)
# plt.xlabel('Number of Clusters (k)', fontsize=14)
# plt.ylabel('Silhouette Score', fontsize=14)
# plt.xticks(cluster_range[1:])
# plt.legend()
# plt.grid(True)
# plt.show()


Initialization complete
Iteration 0, inertia 27.63059086726737.
Iteration 1, inertia 23.674859674762555.
Iteration 2, inertia 23.282456838718737.
Iteration 3, inertia 22.995890117928244.
Iteration 4, inertia 22.823838607816825.
Iteration 5, inertia 22.766841031485317.
Iteration 6, inertia 22.728917879791624.
Iteration 7, inertia 22.69943367758419.
Iteration 8, inertia 22.686519827559273.
Iteration 9, inertia 22.681036889090993.
Iteration 10, inertia 22.67473415455425.
Iteration 11, inertia 22.66265552409562.
Iteration 12, inertia 22.646241484075983.
Iteration 13, inertia 22.638021284738066.
Iteration 14, inertia 22.627356860363015.
Iteration 15, inertia 22.618478845397068.
Iteration 16, inertia 22.613843613780844.
Iteration 17, inertia 22.611423986479448.
Converged at iteration 17: center shift 2.992399006511701e-08 within tolerance 6.445941133387281e-08.
Processed 10 clusters
Initialization complete
Iteration 0, inertia 24.755826247921714.
Iteration 1, inertia 21.287991626884846.
Iter

KeyboardInterrupt: 