In [5]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('data-availability-20250710115327.csv', index_col='Year/Site ID')

# Define European countries (based on site prefixes)
european_prefixes = ['AT', 'BE', 'CH', 'CZ', 'DE', 'DK', 'FI', 'FR', 'IT', 'NL', 'SE']

# Filter European sites - CORRECTED version
europe_sites = [site for site in df.index if any(site.startswith(prefix) for prefix in european_prefixes)]
europe_df = df.loc[europe_sites]

# Function to calculate continuous years
def max_continuous_years(row):
    years = [int(col) for col in df.columns if str(row[col]) in ['+', 'Tier 2']]
    if not years:
        return 0
    years_sorted = sorted(years)
    max_streak = current_streak = 1
    for i in range(1, len(years_sorted)):
        if years_sorted[i] == years_sorted[i-1] + 1:
            current_streak += 1
            max_streak = max(max_streak, current_streak)
        else:
            current_streak = 1
    return max_streak

# Apply calculations
results = []
for site in europe_df.index:
    total_years = europe_df.loc[site].isin(['+', 'Tier 2']).sum()
    continuous_years = max_continuous_years(europe_df.loc[site])
    results.append({
        'Site': site,
        'Country': site.split('-')[0],
        'Total Years': total_years,
        'Max Continuous Years': continuous_years
    })

# Convert to DataFrame and save
results_df = pd.DataFrame(results)
results_df.to_csv('european_sites_analysis.csv', index=False)
print(results_df)

      Site Country  Total Years  Max Continuous Years
0   AT-Neu      AT           11                    11
1   BE-Bra      BE           19                    19
2   BE-Lon      BE           11                    11
3   BE-Vie      BE           19                    19
4   CH-Cha      CH           10                    10
..     ...     ...          ...                   ...
57  IT-SRo      IT           14                    14
58  IT-Tor      IT            7                     7
59  NL-Hor      NL            8                     8
60  NL-Loo      NL           19                    19
61  SE-St1      SE            3                     3

[62 rows x 4 columns]


In [6]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('data-availability-20250710115327.csv', index_col='Year/Site ID')

# Define European countries (based on site prefixes)
european_prefixes = ['AT', 'BE', 'CH', 'CZ', 'DE', 'DK', 'FI', 'FR', 'IT', 'NL', 'SE']

# Filter European sites
europe_sites = [site for site in df.index if any(site.startswith(prefix) for prefix in european_prefixes)]
europe_df = df.loc[europe_sites]

# Function to calculate year statistics
def get_year_stats(row):
    years = [int(col) for col in df.columns if str(row[col]) in ['+', 'Tier 2']]
    if not years:
        return 0, 0, 0, "No data"
    
    years_sorted = sorted(years)
    first_year = years_sorted[0]
    last_year = years_sorted[-1]
    year_range = f"{first_year}-{last_year}"
    
    # Calculate continuous years
    max_streak = current_streak = 1
    for i in range(1, len(years_sorted)):
        if years_sorted[i] == years_sorted[i-1] + 1:
            current_streak += 1
            max_streak = max(max_streak, current_streak)
        else:
            current_streak = 1
            
    return len(years), max_streak, first_year, last_year, year_range

# Apply calculations
results = []
for site in europe_df.index:
    total_years, continuous_years, first_year, last_year, year_range = get_year_stats(europe_df.loc[site])
    results.append({
        'Site': site,
        'Country': site.split('-')[0],
        'Total Years': total_years,
        'Max Continuous Years': continuous_years,
        'First Year': first_year,
        'Last Year': last_year,
        'Year Range': year_range
    })

# Convert to DataFrame and save
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='Total Years', ascending=False)  # Sort by total years
results_df.to_csv('european_sites_analysis_with_years.csv', index=False)
print(results_df)

      Site Country  Total Years  Max Continuous Years  First Year  Last Year  \
1   BE-Bra      BE           19                    19        1996       2014   
3   BE-Vie      BE           19                    19        1996       2014   
26  DE-Tha      DE           19                    19        1996       2014   
44  IT-Col      IT           19                    19        1996       2014   
31  FI-Hyy      FI           19                    19        1996       2014   
..     ...     ...          ...                   ...         ...        ...   
48  IT-La2      IT            3                     3        2000       2002   
47  IT-Isp      IT            2                     2        2013       2014   
27  DE-Zrk      DE            2                     2        2013       2014   
56  IT-SR2      IT            2                     2        2013       2014   
29  DK-Fou      DK            1                     1        2005       2005   

   Year Range  
1   1996-2014  
3   199

In [7]:
import os
import pandas as pd
from collections import defaultdict

def analyze_flux_files(directory):
    results = []
    country_stats = defaultdict(lambda: {'sites': set(), 'years': set()})
    processed_files = 0
    
    for filename in os.listdir(directory):
        if not (filename.startswith('FLX_') and filename.endswith('.csv')):
            continue
            
        try:
            # Split filename into components
            parts = filename.split('_')
            if len(parts) < 5:
                print(f"Skipping {filename}: not enough components")
                continue
                
            # Extract site information
            site_code = parts[1]
            country = site_code.split('-')[0]
            
            # Find the part that contains the year range
            year_part = None
            for part in parts:
                if '-' in part and part.replace('-', '').isdigit():
                    year_part = part
                    break
            
            if not year_part:
                print(f"Skipping {filename}: no year range found")
                continue
                
            # Extract years
            start_year, end_year = map(int, year_part.split('-'))
            duration = end_year - start_year + 1
            
            # Update statistics
            country_stats[country]['sites'].add(site_code)
            country_stats[country]['years'].update(range(start_year, end_year + 1))
            
            results.append({
                'Filename': filename,
                'Site Code': site_code,
                'Country': country,
                'Start Year': start_year,
                'End Year': end_year,
                'Duration': duration,
                'Year Range': f"{start_year}-{end_year}"
            })
            processed_files += 1
            
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")
    
    print(f"\nSuccessfully processed {processed_files} files")
    return results, country_stats

def generate_outputs(results, country_stats, directory):
    if not results:
        print("\nNo valid flux tower files were processed.")
        return
    
    # Create detailed dataframe
    df = pd.DataFrame(results)
    df['Max Continuous Years'] = df['Duration']  # Filenames contain continuous ranges
    detailed_cols = ['Site Code', 'Country', 'Duration', 'Max Continuous Years', 'Year Range']
    df = df[detailed_cols].sort_values(['Country', 'Site Code'])
    
    # Create summary dataframe
    summary_data = []
    for country, stats in country_stats.items():
        years = sorted(stats['years'])
        summary_data.append({
            'Country': country,
            'Number of Sites': len(stats['sites']),
            'Sites': ', '.join(sorted(stats['sites'])),
            'First Year': min(years),
            'Last Year': max(years),
            'Total Years': len(years),
            'Coverage': f"{min(years)}-{max(years)}"
        })
    
    summary_df = pd.DataFrame(summary_data).sort_values('Country')
    
    # Save outputs
    output_dir = os.path.join(directory, 'analysis_results')
    os.makedirs(output_dir, exist_ok=True)
    
    site_output = os.path.join(output_dir, 'site_statistics.csv')
    country_output = os.path.join(output_dir, 'country_statistics.csv')
    
    df.to_csv(site_output, index=False)
    summary_df.to_csv(country_output, index=False)
    
    print("\n=== Analysis Complete ===")
    print(f"Site-level statistics saved to: {site_output}")
    print(f"Country-level statistics saved to: {country_output}")
    
    print("\nSample of Site Statistics:")
    print(df.head())
    
    print("\nCountry Summary Statistics:")
    print(summary_df)

# Run the analysis
directory = r'C:\Deepak\icos\icos'
print(f"Analyzing files in: {directory}")
results, country_stats = analyze_flux_files(directory)
generate_outputs(results, country_stats, directory)

Analyzing files in: C:\Deepak\icos\icos

Successfully processed 31 files

=== Analysis Complete ===
Site-level statistics saved to: C:\Deepak\icos\icos\analysis_results\site_statistics.csv
Country-level statistics saved to: C:\Deepak\icos\icos\analysis_results\country_statistics.csv

Sample of Site Statistics:
  Site Code Country  Duration  Max Continuous Years Year Range
0    BE-Bra      BE        25                    25  1996-2020
1    BE-Dor      BE        10                    10  2011-2020
2    BE-Lon      BE        17                    17  2004-2020
3    BE-Maa      BE         5                     5  2016-2020
4    BE-Vie      BE        25                    25  1996-2020

Country Summary Statistics:
  Country  Number of Sites                                           Sites  \
0      BE                5          BE-Bra, BE-Dor, BE-Lon, BE-Maa, BE-Vie   
1      CH                1                                          CH-Dav   
2      CZ                2                     

In [10]:
import os
import pandas as pd
from collections import defaultdict
from datetime import datetime

def analyze_icos_files(csv_path):
    # Load the CSV file
    df = pd.read_csv(csv_path)
    
    results = []
    site_stats = defaultdict(lambda: {'products': set(), 'years': set(), 'size': 0})
    country_stats = defaultdict(lambda: {'sites': set(), 'years': set(), 'size': 0})
    
    for _, row in df.iterrows():
        try:
            # Extract site information from filename
            filename = row['fileName']
            if not filename.startswith('ICOSETC_'):
                continue
                
            parts = filename.split('_')
            site_code = parts[1]
            country = site_code.split('-')[0]
            
            # Extract product type from spec URL
            product_type = row['spec'].split('/')[-1]
            
            # Parse dates
            start_date = datetime.strptime(row['timeStart'], '%Y-%m-%dT%H:%M:%SZ')
            end_date = datetime.strptime(row['timeEnd'], '%Y-%m-%dT%H:%M:%SZ')
            start_year = start_date.year
            end_year = end_date.year
            duration = end_year - start_year + 1
            
            # Update statistics
            site_stats[site_code]['products'].add(product_type)
            site_stats[site_code]['years'].update(range(start_year, end_year + 1))
            site_stats[site_code]['size'] += row['size']
            
            country_stats[country]['sites'].add(site_code)
            country_stats[country]['years'].update(range(start_year, end_year + 1))
            country_stats[country]['size'] += row['size']
            
            results.append({
                'Filename': filename,
                'Site Code': site_code,
                'Country': country,
                'Product Type': product_type,
                'Start Year': start_year,
                'End Year': end_year,
                'Duration': duration,
                'Year Range': f"{start_year}-{end_year}",
                'Size (bytes)': row['size'],
                'Submission Date': row['submTime']
            })
            
        except Exception as e:
            print(f"Error processing row {_}: {str(e)}")
            continue
    
    print(f"\nSuccessfully processed {len(results)} entries")
    return results, site_stats, country_stats

def generate_outputs(results, site_stats, country_stats, output_dir):
    if not results:
        print("\nNo valid entries were processed.")
        return
    
    # Create detailed dataframe
    df = pd.DataFrame(results)
    detailed_cols = ['Site Code', 'Country', 'Product Type', 'Duration', 
                    'Year Range', 'Size (bytes)', 'Submission Date']
    df = df[detailed_cols].sort_values(['Country', 'Site Code', 'Product Type'])
    
    # Create site summary dataframe
    site_data = []
    for site, stats in site_stats.items():
        years = sorted(stats['years'])
        site_data.append({
            'Site Code': site,
            'Country': site.split('-')[0],
            'First Year': min(years),
            'Last Year': max(years),
            'Total Years': len(years),
            'Coverage': f"{min(years)}-{max(years)}",
            'Total Size (MB)': round(stats['size'] / (1024*1024), 2)
        })
    
    site_summary_df = pd.DataFrame(site_data).sort_values(['Country', 'Site Code'])
    
    # Create country summary dataframe
    country_data = []
    for country, stats in country_stats.items():
        years = sorted(stats['years'])
        country_data.append({
            'Country': country,
            'Number of Sites': len(stats['sites']),
            'Sites': ', '.join(sorted(stats['sites'])),
            'First Year': min(years),
            'Last Year': max(years),
            'Total Years': len(years),
            'Coverage': f"{min(years)}-{max(years)}",
            'Total Size (GB)': round(stats['size'] / (1024*1024*1024), 2)
        })
    
    country_summary_df = pd.DataFrame(country_data).sort_values('Country')
    
    # Save outputs
    os.makedirs(output_dir, exist_ok=True)
    
    detailed_output = os.path.join(output_dir, 'detailed_statistics.csv')
    site_output = os.path.join(output_dir, 'site_summary.csv')
    country_output = os.path.join(output_dir, 'country_summary.csv')
    
    df.to_csv(detailed_output, index=False)
    site_summary_df.to_csv(site_output, index=False)
    country_summary_df.to_csv(country_output, index=False)
    
    print("\n=== Analysis Complete ===")
    print(f"Detailed statistics saved to: {detailed_output}")
    print(f"Site summary saved to: {site_output}")
    print(f"Country summary saved to: {country_output}")
    
    print("\nSample of Detailed Statistics:")
    print(df.head())
    
    print("\nSite Summary Statistics:")
    print(site_summary_df.head())
    
    print("\nCountry Summary Statistics:")
    print(country_summary_df)

# Run the analysis
csv_path = 'Carbon_Portal_Search_Result.csv'  # Update with your actual path
output_dir = 'icos_analysis_results'

print(f"Analyzing ICOS Carbon Portal data from: {csv_path}")
results, site_stats, country_stats = analyze_icos_files(csv_path)
generate_outputs(results, site_stats, country_stats, output_dir)

Analyzing ICOS Carbon Portal data from: Carbon_Portal_Search_Result.csv

Successfully processed 421 entries

=== Analysis Complete ===
Detailed statistics saved to: icos_analysis_results\detailed_statistics.csv
Site summary saved to: icos_analysis_results\site_summary.csv
Country summary saved to: icos_analysis_results\country_summary.csv

Sample of Detailed Statistics:
    Site Code Country       Product Type  Duration Year Range  Size (bytes)  \
15     BE-Bra      BE  etcArchiveProduct         6  2020-2025      98188780   
420    BE-Bra      BE       etcL2AuxData         6  2020-2025       7040375   
419    BE-Bra      BE        etcL2Fluxes         6  2019-2024      16907118   
416    BE-Bra      BE       etcL2Fluxnet         6  2019-2024      41948459   
418    BE-Bra      BE         etcL2Meteo         6  2019-2024       7141933   

              Submission Date  
15   2025-04-28T14:27:00.546Z  
420  2025-04-01T08:45:16.036Z  
419  2025-04-01T08:49:13.247Z  
416  2025-04-01T08:56:25

In [12]:
import os
import pandas as pd
from collections import defaultdict

def analyze_fluxnet_files(directory):
    results = []
    site_stats = defaultdict(lambda: {'years': set(), 'size': 0, 'versions': set()})
    country_stats = defaultdict(lambda: {'sites': set(), 'years': set(), 'size': 0})
    
    for filename in os.listdir(directory):
        if not (filename.startswith('FLX_') and filename.endswith('.zip')):
            continue
            
        try:
            # Split filename into components
            parts = filename.split('_')
            if len(parts) < 5:
                print(f"Skipping {filename}: not enough components")
                continue
                
            # Extract site information
            site_code = parts[1]
            country = site_code.split('-')[0]
            
            # Extract dataset version
            version_part = parts[-1].replace('.zip', '')
            version = version_part.split('-')[-1]  # e.g., beta-3
            
            # Extract year range - it's in the 4th part (index 3) after splitting
            year_part = parts[4]  # Changed from parts[3] to parts[4]
            start_year, end_year = map(int, year_part.split('-'))
            duration = end_year - start_year + 1
            
            # Get file size
            file_size = os.path.getsize(os.path.join(directory, filename))
            
            # Update statistics
            site_stats[site_code]['years'].update(range(start_year, end_year + 1))
            site_stats[site_code]['size'] += file_size
            site_stats[site_code]['versions'].add(version)
            
            country_stats[country]['sites'].add(site_code)
            country_stats[country]['years'].update(range(start_year, end_year + 1))
            country_stats[country]['size'] += file_size
            
            results.append({
                'Filename': filename,
                'Site Code': site_code,
                'Country': country,
                'Dataset': 'FLUXNET2015_FULLSET',
                'Version': version,
                'Start Year': start_year,
                'End Year': end_year,
                'Duration (years)': duration,
                'Year Range': f"{start_year}-{end_year}",
                'Size (bytes)': file_size,
                'Size (MB)': round(file_size / (1024*1024), 2)
            })
            
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")
            continue
    
    print(f"\nSuccessfully processed {len(results)} files")
    return results, site_stats, country_stats

def generate_outputs(results, site_stats, country_stats, output_dir):
    if not results:
        print("\nNo valid FLUXNET files were processed.")
        return
    
    # Create detailed dataframe
    df = pd.DataFrame(results)
    detailed_cols = ['Filename', 'Site Code', 'Country', 'Dataset', 'Version', 
                    'Duration (years)', 'Year Range', 'Size (MB)']
    df = df[detailed_cols].sort_values(['Country', 'Site Code'])
    
    # Create site summary dataframe
    site_data = []
    for site, stats in site_stats.items():
        years = sorted(stats['years'])
        site_data.append({
            'Site Code': site,
            'Country': site.split('-')[0],
            'Dataset Version': ', '.join(sorted(stats['versions'])),
            'First Year': min(years),
            'Last Year': max(years),
            'Total Years': len(years),
            'Coverage': f"{min(years)}-{max(years)}",
            'Total Size (GB)': round(stats['size'] / (1024*1024*1024), 2)
        })
    
    site_summary_df = pd.DataFrame(site_data).sort_values(['Country', 'Site Code'])
    
    # Create country summary dataframe
    country_data = []
    for country, stats in country_stats.items():
        years = sorted(stats['years'])
        country_data.append({
            'Country': country,
            'Number of Sites': len(stats['sites']),
            'Sites': ', '.join(sorted(stats['sites'])),
            'First Year': min(years),
            'Last Year': max(years),
            'Total Years': len(years),
            'Coverage': f"{min(years)}-{max(years)}",
            'Total Size (GB)': round(stats['size'] / (1024*1024*1024), 2)
        })
    
    country_summary_df = pd.DataFrame(country_data).sort_values('Country')
    
    # Save outputs
    os.makedirs(output_dir, exist_ok=True)
    
    detailed_output = os.path.join(output_dir, 'fluxnet_detailed_stats.csv')
    site_output = os.path.join(output_dir, 'fluxnet_site_summary.csv')
    country_output = os.path.join(output_dir, 'fluxnet_country_summary.csv')
    
    df.to_csv(detailed_output, index=False)
    site_summary_df.to_csv(site_output, index=False)
    country_summary_df.to_csv(country_output, index=False)
    
    print("\n=== Analysis Complete ===")
    print(f"Detailed statistics saved to: {detailed_output}")
    print(f"Site summary saved to: {site_output}")
    print(f"Country summary saved to: {country_output}")
    
    print("\nSample of Detailed Statistics:")
    print(df.head())
    
    print("\nSite Summary Statistics:")
    print(site_summary_df.head())
    
    print("\nCountry Summary Statistics:")
    print(country_summary_df)

# Run the analysis
directory = r'C:\Deepak\Drought-2018 ecosystem eddy covariance flux product for 52 stations in FLUXNET-Archive format—release 2019-2'
output_dir = os.path.join(directory, 'analysis_results')

print(f"Analyzing FLUXNET files in: {directory}")
results, site_stats, country_stats = analyze_fluxnet_files(directory)
generate_outputs(results, site_stats, country_stats, output_dir)

Analyzing FLUXNET files in: C:\Deepak\Drought-2018 ecosystem eddy covariance flux product for 52 stations in FLUXNET-Archive format—release 2019-2

Successfully processed 52 files

=== Analysis Complete ===
Detailed statistics saved to: C:\Deepak\Drought-2018 ecosystem eddy covariance flux product for 52 stations in FLUXNET-Archive format—release 2019-2\analysis_results\fluxnet_detailed_stats.csv
Site summary saved to: C:\Deepak\Drought-2018 ecosystem eddy covariance flux product for 52 stations in FLUXNET-Archive format—release 2019-2\analysis_results\fluxnet_site_summary.csv
Country summary saved to: C:\Deepak\Drought-2018 ecosystem eddy covariance flux product for 52 stations in FLUXNET-Archive format—release 2019-2\analysis_results\fluxnet_country_summary.csv

Sample of Detailed Statistics:
                                            Filename Site Code Country  \
0  FLX_BE-Bra_FLUXNET2015_FULLSET_1996-2018_beta-...    BE-Bra      BE   
1  FLX_BE-Lon_FLUXNET2015_FULLSET_2004-2018_be

In [13]:
import os
import pandas as pd
from collections import defaultdict

def analyze_fluxnet_files(directory):
    results = []
    site_stats = defaultdict(lambda: {'years': set(), 'size': 0, 'versions': set()})
    country_stats = defaultdict(lambda: {'sites': set(), 'years': set(), 'size': 0})
    
    for filename in os.listdir(directory):
        if not (filename.startswith('FLX_') and filename.endswith('.zip')):
            continue
            
        try:
            # Split filename into components
            parts = filename.split('_')
            if len(parts) < 5:
                print(f"Skipping {filename}: not enough components")
                continue
                
            # Extract site information
            site_code = parts[1]
            country = site_code.split('-')[0]
            
            # Extract dataset version
            version_part = parts[-1].replace('.zip', '')
            version = version_part.split('-')[-1]  # e.g., beta-3
            
            # Extract year range - it's in the 4th part (index 3) after splitting
            year_part = parts[4]  # Changed from parts[3] to parts[4]
            start_year, end_year = map(int, year_part.split('-'))
            duration = end_year - start_year + 1
            
            # Get file size
            file_size = os.path.getsize(os.path.join(directory, filename))
            
            # Update statistics
            site_stats[site_code]['years'].update(range(start_year, end_year + 1))
            site_stats[site_code]['size'] += file_size
            site_stats[site_code]['versions'].add(version)
            
            country_stats[country]['sites'].add(site_code)
            country_stats[country]['years'].update(range(start_year, end_year + 1))
            country_stats[country]['size'] += file_size
            
            results.append({
                'Filename': filename,
                'Site Code': site_code,
                'Country': country,
                'Dataset': 'FLUXNET2015_FULLSET',
                'Version': version,
                'Start Year': start_year,
                'End Year': end_year,
                'Duration (years)': duration,
                'Year Range': f"{start_year}-{end_year}",
                'Size (bytes)': file_size,
                'Size (MB)': round(file_size / (1024*1024), 2)
            })
            
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")
            continue
    
    print(f"\nSuccessfully processed {len(results)} files")
    return results, site_stats, country_stats

def generate_outputs(results, site_stats, country_stats, output_dir):
    if not results:
        print("\nNo valid FLUXNET files were processed.")
        return
    
    # Create detailed dataframe
    df = pd.DataFrame(results)
    detailed_cols = ['Filename', 'Site Code', 'Country', 'Dataset', 'Version', 
                    'Duration (years)', 'Year Range', 'Size (MB)']
    df = df[detailed_cols].sort_values(['Country', 'Site Code'])
    
    # Create site summary dataframe
    site_data = []
    for site, stats in site_stats.items():
        years = sorted(stats['years'])
        site_data.append({
            'Site Code': site,
            'Country': site.split('-')[0],
            'Dataset Version': ', '.join(sorted(stats['versions'])),
            'First Year': min(years),
            'Last Year': max(years),
            'Total Years': len(years),
            'Coverage': f"{min(years)}-{max(years)}",
            'Total Size (GB)': round(stats['size'] / (1024*1024*1024), 2)
        })
    
    site_summary_df = pd.DataFrame(site_data).sort_values(['Country', 'Site Code'])
    
    # Create country summary dataframe
    country_data = []
    for country, stats in country_stats.items():
        years = sorted(stats['years'])
        country_data.append({
            'Country': country,
            'Number of Sites': len(stats['sites']),
            'Sites': ', '.join(sorted(stats['sites'])),
            'First Year': min(years),
            'Last Year': max(years),
            'Total Years': len(years),
            'Coverage': f"{min(years)}-{max(years)}",
            'Total Size (GB)': round(stats['size'] / (1024*1024*1024), 2)
        })
    
    country_summary_df = pd.DataFrame(country_data).sort_values('Country')
    
    # Save outputs
    os.makedirs(output_dir, exist_ok=True)
    
    detailed_output = os.path.join(output_dir, 'fluxnet_detailed_stats.csv')
    site_output = os.path.join(output_dir, 'fluxnet_site_summary.csv')
    country_output = os.path.join(output_dir, 'fluxnet_country_summary.csv')
    
    df.to_csv(detailed_output, index=False)
    site_summary_df.to_csv(site_output, index=False)
    country_summary_df.to_csv(country_output, index=False)
    
    print("\n=== Analysis Complete ===")
    print(f"Detailed statistics saved to: {detailed_output}")
    print(f"Site summary saved to: {site_output}")
    print(f"Country summary saved to: {country_output}")
    
    print("\nSample of Detailed Statistics:")
    print(df.head())
    
    print("\nSite Summary Statistics:")
    print(site_summary_df.head())
    
    print("\nCountry Summary Statistics:")
    print(country_summary_df)

# Run the analysis
directory = r'C:\Deepak\Warm Winter 2020 ecosystem eddy covariance flux product for 73 stations in FLUXNET-Archive format—release 2022-1'
output_dir = os.path.join(directory, 'analysis_results')

print(f"Analyzing FLUXNET files in: {directory}")
results, site_stats, country_stats = analyze_fluxnet_files(directory)
generate_outputs(results, site_stats, country_stats, output_dir)

Analyzing FLUXNET files in: C:\Deepak\Warm Winter 2020 ecosystem eddy covariance flux product for 73 stations in FLUXNET-Archive format—release 2022-1

Successfully processed 73 files

=== Analysis Complete ===
Detailed statistics saved to: C:\Deepak\Warm Winter 2020 ecosystem eddy covariance flux product for 73 stations in FLUXNET-Archive format—release 2022-1\analysis_results\fluxnet_detailed_stats.csv
Site summary saved to: C:\Deepak\Warm Winter 2020 ecosystem eddy covariance flux product for 73 stations in FLUXNET-Archive format—release 2022-1\analysis_results\fluxnet_site_summary.csv
Country summary saved to: C:\Deepak\Warm Winter 2020 ecosystem eddy covariance flux product for 73 stations in FLUXNET-Archive format—release 2022-1\analysis_results\fluxnet_country_summary.csv

Sample of Detailed Statistics:
                                            Filename Site Code Country  \
0  FLX_BE-Bra_FLUXNET2015_FULLSET_1996-2020_beta-...    BE-Bra      BE   
1  FLX_BE-Dor_FLUXNET2015_FULL

In [18]:
import pandas as pd
from collections import defaultdict

def load_and_preprocess(file_path, name):
    df = pd.read_csv(file_path)
    
    # Standardize column names
    if 'Site' in df.columns:
        df = df.rename(columns={'Site': 'Site Code'})
    
    # Convert size columns to GB for consistency
    if 'Total Size (MB)' in df.columns:
        df['Total Size (GB)'] = df['Total Size (MB)'] / 1024
        df = df.drop(columns=['Total Size (MB)'])
    
    # Create year range column for each source
    df[name + '_Years'] = df['First Year'].astype(str) + '-' + df['Last Year'].astype(str)
    
    return df[['Site Code', 'Country', name + '_Years']]

# Load all files
warm_df = load_and_preprocess('Warm_icos_.csv', 'Warm')
drought_df = load_and_preprocess('drought_icos.csv', 'Drought')
fluxcom_df = load_and_preprocess('fluxcom_sites.csv', 'Fluxcom')
icos2025_df = load_and_preprocess('2025_icos.csv', 'ICOS2025')

# Merge all data
combined = pd.merge(
    pd.merge(
        pd.merge(
            icos2025_df, 
            drought_df, 
            on=['Site Code', 'Country'], 
            how='outer'
        ),
        fluxcom_df,
        on=['Site Code', 'Country'], 
        how='outer'
    ),
    warm_df,
    on=['Site Code', 'Country'], 
    how='outer'
)

# Function to parse year ranges and combine them
def combine_years(row):
    sources = ['ICOS2025', 'Drought', 'Fluxcom', 'Warm']
    all_years = set()
    
    for source in sources:
        year_range = row.get(source + '_Years')
        if pd.notna(year_range):
            try:
                start, end = map(int, year_range.split('-'))
                all_years.update(range(start, end + 1))
            except:
                pass
    
    if not all_years:
        return None, None, None, None
    
    first_year = min(all_years)
    last_year = max(all_years)
    total_years = len(all_years)
    coverage = f"{first_year}-{last_year}"
    
    return first_year, last_year, total_years, coverage

# Apply the combining function
combined[['Combined_First_Year', 'Combined_Last_Year', 
          'Combined_Total_Years', 'Combined_Coverage']] = combined.apply(
    lambda row: pd.Series(combine_years(row)), axis=1)

# Reorder columns and sort
output_columns = [
    'Site Code', 'Country', 
    'ICOS2025_Years', 'Drought_Years', 'Fluxcom_Years', 'Warm_Years',
    'Combined_First_Year', 'Combined_Last_Year', 'Combined_Total_Years', 'Combined_Coverage'
]

result = combined[output_columns].sort_values(by=['Country', 'Site Code'])

# Save to CSV
result.to_csv('combined_sites_years.csv', index=False)
print("Combined file saved as 'combined_sites_years.csv'")

Combined file saved as 'combined_sites_years.csv'


In [16]:
import pandas as pd

# Load the final combined CSV (if already generated)
df = pd.read_csv('combined_sites_years.csv')

# OPTION 1: Sort by Combined First Year (oldest to newest)
df_sorted = df.sort_values(by='Combined_Total_Years' ,ascending=False)

# OPTION 2: Sort by Combined Total Years (longest to shortest coverage)
# df_sorted = df.sort_values(by='Combined_Total_Years', ascending=False)

# Save the sorted version
df_sorted.to_csv('combined_sites_years_sorted.csv', index=False)
print("Sorted file saved as 'combined_sites_years_sorted.csv'")

Sorted file saved as 'combined_sites_years_sorted.csv'


In [19]:
import pandas as pd
from collections import defaultdict

def load_and_preprocess(file_path, name):
    df = pd.read_csv(file_path)
    
    # Standardize column names
    if 'Site' in df.columns:
        df = df.rename(columns={'Site': 'Site Code'})
    
    # Convert size columns to GB for consistency
    if 'Total Size (MB)' in df.columns:
        df['Total Size (GB)'] = df['Total Size (MB)'] / 1024
        df = df.drop(columns=['Total Size (MB)'])
    
    # Create year range column for each source
    df[name + '_Years'] = df['First Year'].astype(str) + '-' + df['Last Year'].astype(str)
    
    return df[['Site Code', 'Country', name + '_Years']]

# Load all files EXCEPT drought
warm_df = load_and_preprocess('Warm_icos_.csv', 'Warm')
fluxcom_df = load_and_preprocess('fluxcom_sites.csv', 'Fluxcom')
icos2025_df = load_and_preprocess('2025_icos.csv', 'ICOS2025')

# Merge data WITHOUT drought
combined = pd.merge(
    pd.merge(
        icos2025_df,
        fluxcom_df,
        on=['Site Code', 'Country'], 
        how='outer'
    ),
    warm_df,
    on=['Site Code', 'Country'], 
    how='outer'
)

# Function to parse year ranges and combine them (excluding drought)
def combine_years(row):
    sources = ['ICOS2025', 'Fluxcom', 'Warm']  # Removed 'Drought'
    all_years = set()
    
    for source in sources:
        year_range = row.get(source + '_Years')
        if pd.notna(year_range):
            try:
                start, end = map(int, year_range.split('-'))
                all_years.update(range(start, end + 1))
            except:
                pass
    
    if not all_years:
        return None, None, None, None
    
    first_year = min(all_years)
    last_year = max(all_years)
    total_years = len(all_years)
    coverage = f"{first_year}-{last_year}"
    
    return first_year, last_year, total_years, coverage

# Apply the combining function
combined[['Combined_First_Year', 'Combined_Last_Year', 
          'Combined_Total_Years', 'Combined_Coverage']] = combined.apply(
    lambda row: pd.Series(combine_years(row)), axis=1)

# Reorder columns (removed Drought_Years)
output_columns = [
    'Site Code', 'Country', 
    'ICOS2025_Years', 'Fluxcom_Years', 'Warm_Years',
    'Combined_First_Year', 'Combined_Last_Year', 'Combined_Total_Years', 'Combined_Coverage'
]

result = combined[output_columns].sort_values(by='Combined_Total_Years' ,ascending=False)

# Save to CSV
result.to_csv('dataset_availability_final.csv', index=False)
print("Combined file (without drought) saved as 'combined_sites_years_no_drought.csv'")

Combined file (without drought) saved as 'combined_sites_years_no_drought.csv'


In [20]:
import pandas as pd
from collections import defaultdict

def load_and_preprocess(file_path, name):
    df = pd.read_csv(file_path)
    
    # Standardize column names
    if 'Site' in df.columns:
        df = df.rename(columns={'Site': 'Site Code'})
    
    # Convert size columns to GB for consistency
    if 'Total Size (MB)' in df.columns:
        df['Total Size (GB)'] = df['Total Size (MB)'] / 1024
        df = df.drop(columns=['Total Size (MB)'])
    
    # Create year range column for each source
    df[name + '_Years'] = df['First Year'].astype(str) + '-' + df['Last Year'].astype(str)
    
    return df[['Site Code', 'Country', name + '_Years']]

# Load only Warm and ICOS2025 data (no drought or fluxcom)
warm_df = load_and_preprocess('Warm_icos_.csv', 'Warm')
icos2025_df = load_and_preprocess('2025_icos.csv', 'ICOS2025')

# Merge only the two datasets
combined = pd.merge(
    icos2025_df,
    warm_df,
    on=['Site Code', 'Country'], 
    how='outer'
)

# Function to parse year ranges and combine them (only ICOS2025 and Warm)
def combine_years(row):
    sources = ['ICOS2025', 'Warm']  # Only these two sources
    all_years = set()
    
    for source in sources:
        year_range = row.get(source + '_Years')
        if pd.notna(year_range):
            try:
                start, end = map(int, year_range.split('-'))
                all_years.update(range(start, end + 1))
            except:
                pass
    
    if not all_years:
        return None, None, None, None
    
    first_year = min(all_years)
    last_year = max(all_years)
    total_years = len(all_years)
    coverage = f"{first_year}-{last_year}"
    
    return first_year, last_year, total_years, coverage

# Apply the combining function
combined[['Combined_First_Year', 'Combined_Last_Year', 
          'Combined_Total_Years', 'Combined_Coverage']] = combined.apply(
    lambda row: pd.Series(combine_years(row)), axis=1)

# Reorder columns (only ICOS2025 and Warm)
output_columns = [
    'Site Code', 'Country', 
    'ICOS2025_Years', 'Warm_Years',
    'Combined_First_Year', 'Combined_Last_Year', 'Combined_Total_Years', 'Combined_Coverage'
]

result = combined[output_columns].sort_values(by='Combined_Total_Years' ,ascending=False)

# Save to CSV
result.to_csv('combined_sites_years_warm_icos_only.csv', index=False)
print("Combined file (Warm + ICOS2025 only) saved as 'combined_sites_years_warm_icos_only.csv'")

Combined file (Warm + ICOS2025 only) saved as 'combined_sites_years_warm_icos_only.csv'


In [22]:
import pandas as pd

# Load the datasets
site_stats = pd.read_csv('site_statistics.csv')
data_avail = pd.read_csv('dataset_availability_2025.csv')

# Find sites in data_avail that aren't in site_stats
missing_sites = data_avail[~data_avail['Site Code'].isin(site_stats['Site Code'])]

# Display all missing sites (11 total)
print("Missing Sites (11):")
print(missing_sites[['Site Code', 'Country', 'Combined_Coverage']].to_string(index=False))

Missing Sites (11):
Site Code Country Combined_Coverage
   GF-Guy      GF         2004-2025
   IT-OXm      IT         2004-2025
   DE-Gri      DE         2004-2025
   DE-Kli      DE         2004-2025
   FR-Aur      FR         2005-2025
   CZ-wet      CZ         2006-2025
   IT-Tor      IT         2008-2025
   FI-Let      FI         2009-2025
   DE-Hzd      DE         2010-2025
   DE-RuR      DE         2010-2025
   IT-TrF      IT         2011-2025
   DE-RuW      DE         2011-2025
   IT-Lsn      IT         2015-2025
   CH-BaK      CH         2015-2025


FLX_BE-Bra_FLUXNET2015_FULLSET_DD_1996-2020_beta-3.csv
FLX_BE-Bra_FLUXNET2015_FULLSET_HH_1996-2020_beta-3.csv
FLX_BE-Bra_FLUXNET2015_FULLSET_MM_1996-2020_beta-3.csv
FLX_BE-Bra_FLUXNET2015_FULLSET_WW_1996-2020_beta-3.csv
FLX_BE-Bra_FLUXNET2015_FULLSET_YY_1996-2020_beta-3.csv
