In [None]:
###Insect multi-phenology patterns classification###

#The code is written to categorise insect occurrence records based on their degree of voltinism. Insects with varying voltinism exhibit different phenological patterns
# - either unimodal (single peak of activity) or multimodal (multiple peaks). 
#These patterns are critical for understanding insect population dynamics and community interactions.

#Steps:
#(1) Create the probability density curve from each species' occurrence records.
#(2) Smooth the probability density curve using Gaussian and Savitzky-Golay (S-G) filters.
#(3) Determine whether the phenology is characterised by multiple peaks (using the find_peaks() function from the scipy package).
#(4) If the phenology is multi-peaked, K-means clustering is performed. The silhouette score is used to find the optimal number of clusters.

In [None]:
#Load package
import glob
import os
import pandas as pd
import numpy as np
from scipy.signal import find_peaks
import seaborn as sns
from scipy.ndimage import gaussian_filter1d
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from scipy.signal import savgol_filter

In [None]:
def list_csv_files(root_dir, csv_list=None):
    """Collect the paths of all .csv files under ``root_dir``, including subdirectories.

    Parameters
    ----------
    root_dir : str
        Folder that is searched recursively.
    csv_list : list, optional
        Existing list to append the paths to (it is modified in place).
        A new list is created when omitted.

    Returns
    -------
    list
        ``csv_list`` (or the newly created list) with the found paths appended
        in sorted order.
    """
    if csv_list is None:
        csv_list = []
    # glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
    # the processing order (and the row order of the result table) reproducible.
    csv_files = sorted(glob.glob(os.path.join(root_dir, '**/*.csv'), recursive=True))
    csv_list.extend(csv_files)
    return csv_list

# Accumulator for the paths of all occurrence-record CSV files
list_csv = []

# Collect every .csv path found under the data folder into list_csv (the list is filled in place).
# The path below is a placeholder: modify it to the folder where the phenology data are stored.
list_csv_files(r"...:\Insects_occurrence_file_path", list_csv)

In [None]:
# Initialise an empty DataFrame to store the result (one row is appended per CSV file)
df_mode = pd.DataFrame(columns=["Species", "Gaussian", "S_G", "K_means","keep_mode"])

# Peak-detection settings, named once here instead of being repeated inside the loop
BIN_DAYS = 14            # width of each occurrence bin, in the units of the IOD column
GAUSS_SIGMA = 0.5        # sigma of the Gaussian filter, in bins
SG_WINDOW = 5            # Savitzky-Golay window length, in bins
SG_POLYORDER = 3         # Savitzky-Golay polynomial order
PEAK_PROMINENCE = 0.005  # `prominence` passed to find_peaks
PEAK_HEIGHT = 0.02       # `height` passed to find_peaks

# Loop through the list of CSV files
for csv_path in list_csv:
    # The file name without its extension is used as the species label.
    # splitext (rather than cutting at the first ".") keeps names that contain dots intact.
    Species = os.path.splitext(os.path.basename(csv_path))[0]
    df_ins = pd.read_csv(csv_path, sep=',')

    # Create the probability density curve:
    # assign every record to a 14-day bin labelled by the first day of that bin.
    # NOTE(review): IOD is presumably the day of year of the record - confirm.
    total_count = len(df_ins)
    df_ins['IOD_grouped'] = (df_ins['IOD'] - 1) // BIN_DAYS * BIN_DAYS + 1
    grouped_counts = df_ins.groupby('IOD_grouped')['IOD'].agg('count').reset_index()
    grouped_counts.columns = ['IOD_grouped', 'Count']
    # NOTE(review): bins without any record are absent from grouped_counts, so the
    # filters below treat the remaining bins as adjacent - confirm this is intended.

    # Calculate the relative frequency for the 14-day groups
    grouped_counts['relative_frequency'] = grouped_counts['Count'] / total_count
    relative_frequency = grouped_counts['relative_frequency'].to_numpy()

    # Smooth the probability density curve using a Gaussian filter and count its peaks
    grouped_counts['Occurrences_Smoothed'] = gaussian_filter1d(relative_frequency, sigma=GAUSS_SIGMA)
    peaks, _ = find_peaks(grouped_counts['Occurrences_Smoothed'], prominence=PEAK_PROMINENCE, height=PEAK_HEIGHT)
    Gaussian = len(peaks)

    # Smooth the probability density curve using a Savitzky-Golay filter and count its peaks.
    # savgol_filter raises ValueError when the series is shorter than the window, which
    # happens for species whose records fall into fewer than SG_WINDOW bins; in that
    # case the unsmoothed curve is used instead of aborting the whole batch.
    if len(relative_frequency) >= SG_WINDOW:
        smoothed_sg = savgol_filter(relative_frequency, SG_WINDOW, SG_POLYORDER)
    else:
        smoothed_sg = relative_frequency.copy()
    grouped_counts['Occurrences_Smoothed_SG'] = smoothed_sg
    peaks, _ = find_peaks(grouped_counts['Occurrences_Smoothed_SG'], prominence=PEAK_PROMINENCE, height=PEAK_HEIGHT)
    S_G = len(peaks)

    # Override the detected single/multiple peak counts with literature records where available
    def adjust_pheno_pattern(species_list, current_S_G, current_Gaussian):
        """Return the (S_G, Gaussian) peak counts, replaced by the literature value if known.

        The literature groups are checked in the order 1, 2, 3 peaks. As soon as any
        name in ``species_list`` belongs to a group, that group's peak count is
        returned for both values. If no name is listed, the counts passed in are
        returned unchanged.
        """
        # (number of peaks, species recorded with that number of peaks)
        literature_modes = (
            (1, ["Anthonomus pedicularius","Batophila rubi","Brachyderes incanus","Cassida viridis","Ceratapion onopordi","Ceutorhynchus contractus","Ceutorhynchus obstrictus","Chaetocnema concinna","Chrysolina fastuosa","Chrysolina hyperici","Crepidodera aurata","Crepidodera aurea","Crepidodera fulvicornis","Dryocoetes autographus","Exapion ulicis","Exomias araneiformis","Galerucella tenella","Hylastes cunicularius","Hylesinus varius","Hypera nigrirostris","Hypera postica","Ischnopterapion virens","Mecinus pyraster","Otiorhynchus sulcatus","Oulema melanopus","Paracorymbia maculicornis","Perapion marchicum","Phaenops cyanea","Phyllotreta vittula","Psylliodes chrysocephalus","Psylliodes napi","Pyrrhalta viburni","Sitona hispidulus","Sitona sulcifrons","Tachyerges salicis","Trichius fasciatus","Andrena fuscipes","Anthidium punctatum","Athalia circularis","Bombus magnus","Bombus monticola","Bombus muscorum","Bombus ruderarius","Bombus sporadicus","Bombus sylvarum","Chelostoma campanularum","Hylaeus communis","Lasioglossum fratellum","Lasioglossum fulvicorne","Lasioglossum punctatissimum","Lasioglossum sexstrigatum","Lasioglossum zonulum","Megachile ligniseca","Osmia bicolor","Panurgus banksianus","Xylocopa violacea","Abraxas sylvata","Acrocercops brongniardella","Aethes smeathmanniana","Agonopterix propinquella","Anania verbascalis","Ancylis laetana","Callimorpha dominula","Dicallomera fascelina","Eriogaster lanestris","Euproctis chrysorrhoea","Gonepteryx cleopatra","Hadena albimacula","Heliothis viriplaca","Herminia tarsicrinalis","Hipparchia statilinus","Lasiocampa trifolii","Lithophane lamda","Lycia zonaria","Phigalia pilosaria","Pyla fusca","Pyronia cecilia","Saturnia pavonia","Spiris striata","Thaumetopoea processionea"]),
            (2, ["Harpalus tardus","Leptinotarsa decemlineata","Plagiodera versicolora","Lygus rugulipennis", "Bombus lapidarius","Bombus lucorum","Bombus pascuorum","Bombus pratorum","Halictus tumulorum","Lasioglossum leucopus","Lasioglossum morio", "Acleris comariana","Acronicta auricoma","Acronicta rumicis","Actinotia polyodon","Ancylis badiana","Apatura ilia","Asthena albulata","Cameraria ohridella","Clepsis spectrana","Coenonympha pamphilus","Colostygia pectinataria","Craniophora ligustri","Cyclophora linearia","Diachrysia chrysitis","Diacrisia sannio","Drepana curvatula","Dysstroma truncata","Earias clorana","Epirrhoe galiata","Epirrhoe rivata","Epirrhoe tristata","Erynnis tages","Eucarta virgo","Eupithecia assimilata","Evergestis forficalis","Falcaria lacertinaria","Furcula bifida","Hadena bicruris","Hylaea fasciaria","Hypena proboscidalis","Hypena rostralis","Lampropteryx otregiata","Lathronympha strigana","Leptidea juvernica","Lycaena dispar","Lygephila viciae","Lyonetia clerkella","Lythria cruentaria","Macaria alternata","Melitaea phoebe","Minoa murinata","Mythimna pallens","Notodonta dromedarius","Ochropleura plecta","Opisthograptis luteolata","Orthonama vittata","Pseudeustrotia candidula","Pterostoma palpina","Pyrausta purpuralis","Rivula sericealis","Scoliopteryx libatrix","Scopula immorata","Sideridis rivularis","Thera obeliscata"]),
            (3, ["Boloria dia", "Gymnoscelis rufifasciata"]),
        )
        for n_peaks, known_species in literature_modes:
            for name in species_list:
                if name in known_species:
                    # The literature value replaces both detected counts
                    return n_peaks, n_peaks
        # No literature record: keep the counts detected from the data
        return current_S_G, current_Gaussian
    
    # Let literature records, where available, override the detected number of peaks
    species_list = df_ins["species"].unique()
    S_G, Gaussian = adjust_pheno_pattern(species_list, S_G, Gaussian)

    # Largest peak count suggested by either smoother; it is also the upper bound on
    # the number of K-means clusters tried below.
    max_clusters = max(S_G, Gaussian)
    min_cluster_size = 400  # clusters with fewer records than this are discarded

    if max_clusters < 2:
        # Single-peak phenology: K-means clustering is not performed.
        # This branch also takes the cases where a smoother found no peak at all:
        # there is then no cluster count >= 2 to try, and the search below would fail
        # on an empty score list. The records are kept as one group; the raw peak
        # counts remain visible in the Gaussian and S_G columns of df_mode.
        K_means = "U"
        keep_mode = 1

        # Remove outliers: keep records whose IOD lies within 3 standard deviations of the mean
        iod_mean = df_ins["IOD"].mean()
        iod_std = df_ins["IOD"].std()
        within_3_std = (df_ins["IOD"] - iod_mean).abs() < 3 * iod_std
        # .copy() yields an independent frame, so adding a column neither touches
        # df_ins nor triggers SettingWithCopyWarning
        filtered_df = df_ins.loc[within_3_std].copy()
        filtered_df["Cluster"] = 1
        filtered_df = filtered_df.drop(columns=["IOD_grouped"])

        # Output the single-peak phenology occurrence records
        filtered_df.to_csv("...\\your_path\\"+Species+"_S.csv",encoding='utf-8',index=False) #Modify to the path where you want to store the result

    else:
        # Multi-peak phenology: K-means clustering is performed.
        df = df_ins.copy()  # real copy, so the original records are left unchanged

        # Standardise the data using StandardScaler
        scaler = StandardScaler()
        df_scaled = scaler.fit_transform(df[['IOD']])

        # Fit K-means for k = 2..max_clusters and score each fit with the silhouette score
        fitted_models = []
        silhouette_scores = []
        for k in range(2, max_clusters + 1):
            model = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42)
            model.fit(df_scaled)
            fitted_models.append(model)
            silhouette_scores.append(silhouette_score(df_scaled, model.labels_))

        # Keep the fitted model with the highest silhouette score (first one on ties).
        # Reusing it, instead of refitting with default settings, guarantees that the
        # labels used below are exactly the ones that were scored.
        kmeans = fitted_models[int(np.argmax(silhouette_scores))]
        K_means = kmeans.n_clusters
        df['Cluster'] = kmeans.labels_

        # Distance between each data point and its own cluster centre (in scaled units)
        errors = np.linalg.norm(df_scaled - kmeans.cluster_centers_[kmeans.labels_], axis=1)

        # Exclude outliers: points further than mean + 3 standard deviations from their centre.
        # A boolean mask is positional, so it does not depend on the index labels of df.
        outlier_threshold = np.mean(errors) + 3 * np.std(errors)
        df = df[errors <= outlier_threshold]

        # Count the size of each cluster and delete the clusters that are too small
        counts = df.groupby('Cluster').size()
        clusters_to_remove = counts[counts < min_cluster_size].index.tolist()
        # Number of clusters that are actually kept
        keep_mode = int((counts >= min_cluster_size).sum())
        df = df[~df['Cluster'].isin(clusters_to_remove)]

        # Sort the remaining clusters by mean IOD, so generation 1 is the earliest one
        cluster_order = df.groupby('Cluster')['IOD'].mean().sort_values(ascending=True).index

        # Output one file per generation, with the cluster relabelled 1, 2, ... in that order
        for generation, cluster_id in enumerate(cluster_order, start=1):
            df_mode_data = (
                df.loc[df["Cluster"] == cluster_id]
                .drop(columns=['IOD_grouped'])
                .assign(Cluster=generation)
                .reset_index(drop=True)
            )
            df_mode_data.to_csv("...\\your_path\\"+Species+"_"+str(generation)+".csv",encoding='utf-8',index=False) #Modify to the path where you want to store the result

    # Record the classification result of this file (same row layout for both branches)
    mode_row = pd.DataFrame({"Species": [Species], 'Gaussian': [Gaussian],"S_G":[S_G],"K_means":[K_means],"keep_mode":[keep_mode]})
    df_mode = pd.concat([df_mode, mode_row], axis=0, ignore_index=True)
        
# Save the summary table (one row per processed CSV file).
# The path is written like the other output paths: a plain string with escaped
# backslashes. Combining a raw-string prefix with "\\" would put doubled
# separators into the path.
df_mode.to_csv('...\\your_path\\Mode_result.csv', index=False) #Modify to the path where you want to store the result
