In [42]:
import pandas as pd
import re


def get_cml_metadata_convention():
    df = pd.read_csv(
        'https://raw.githubusercontent.com/OpenSenseAction/OS_data_format_conventions/main/netCDF_CML.adoc',
        #'data_PL_202302__DIST_NAME-WALBRZYSKI__NCODE-79551.csv',
        sep='|',
        skiprows=9,
    )

    df = df.drop(columns='Unnamed: 0').drop(0).set_index('Unnamed: 1')
    df.index.name = 'Dimensions'

    return df


def check_required_cml_metadata(ds):
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'

    df = get_cml_metadata_convention()
    df_required = df[(df.Requisite == ' Required') | (df.Requisite == ' Required*')]
    print('Checking required variables...')
    error_count = 0
    reqFieldsNames=[]
    for row in df_required.iterrows():
        split = re.split(r'\(|\)', row[0])
        var_name, dims = split[0].strip(), tuple(split[1].split(','))
        print(f' {var_name}')
        reqFieldsNames.append(var_name)

        try:
            ds[var_name]
            if ds[var_name].dims != dims:
                print(f"  {FAIL}dims of variable '{var_name}' are {ds[var_name].dims} but have to be {dims}{ENDC}")
                error_count += 1
            else:
                print(f"  {OKGREEN}OK{ENDC}")
        except:
            print(f"  {FAIL}Required variable '{var_name}' is missing{ENDC}")
            error_count += 1

    print()
    print(f"{FAIL}{error_count} errors found{ENDC}")
    print("=======================")
    print(reqFieldsNames)
    print("=======================")







In [43]:
df2 = get_cml_metadata_convention()
df2.shape
df2

Unnamed: 0_level_0,Type,Attributes,Requisite,Comments
Dimensions,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
time,,,,"Unlimited size, enforce UTC seconds since 197..."
cml_id,,,,Minimum length is 1
sublink_id,,,,Minimum length is 1
*Coordinate variables (dimension)*,,,,
time (time),int/float/double,"units = ""seconds since 1970-01-01 00:00:00 UT...",Required,
cml_id (cml_id),string,long_name = “commercial_microwave_link_identi...,Required,cml_id has to be unique across the network
sublink_id (sublink_id),string,long_name = “sublink_identifier”,Required,sublink_id does not have to be unique across ...
*Auxiliary coordinate variables (dimension)*,,,,
site_0_lat (cml_id),float/double,"units = degrees_in_WGS84_projection, long_nam...",Required,
site_0_lon (cml_id),float/double,"units = degrees_in_WGS84_projection, long_nam...",Required,


In [None]:
check_required_cml_metadata(df2)

# Second Section

In [65]:


def getRequiredFieldNames(ds):
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'

    df = get_cml_metadata_convention()
    df_required = df[(df.Requisite == ' Required') | (df.Requisite == ' Required*')]
    #print('Checking required variables...')
    error_count = 0
    reqFieldsNames=[]
    for row in df_required.iterrows():
        split = re.split(r'\(|\)', row[0])
        var_name, dims = split[0].strip(), tuple(split[1].split(','))
        print(f' {var_name}')
        reqFieldsNames.append(var_name)
        try:
            ds[var_name]
            if ds[var_name].dims != dims:
                #print(f"  {FAIL}dims of variable '{var_name}' are {ds[var_name].dims} but have to be {dims}{ENDC}")
                error_count += 1
            else:
                print(f"  {OKGREEN}OK{ENDC}")
        except:
            #print(f"  {FAIL}Required variable '{var_name}' is missing{ENDC}")
            error_count += 1
    return reqFieldsNames

def checkColumnNames(liste, required_fields):
    # Tüm aranan elemanların listede olup olmadığını kontrol et
    result = all(eleman in liste for eleman in required_fields)
    return result

def convertLowerCase(liste):
    # List comprehension ile tüm elemanları küçük harfe çevir
    _lowercase = [eleman.lower() for eleman in liste if isinstance(eleman, str)]
    return _lowercase


def getMissingFields(liste1, liste2):
    # Liste1'in elemanlarını kontrol et ve liste2'de bulunmayanları yeni bir listeye ekle
    missingFields = [eleman for eleman in liste1 if eleman not in liste2]
    return missingFields





In [69]:
fileToCheck="metadata_whole_June2021_0and1.csv"
seperatorCharacter=";"

df2 = get_cml_metadata_convention()
requiredFields=getRequiredFieldNames(df2)
requiredFieldsLower= convertLowerCase(requiredFields)


 time
 cml_id
 sublink_id
 site_0_lat
 site_0_lon
 site_1_lat
 site_1_lon
 frequency
 tsl
 rsl
 tsl_max
 tsl_min
 rsl_max
 rsl_min


In [70]:
df = pd.read_csv(
        #'https://raw.githubusercontent.com/OpenSenseAction/OS_data_format_conventions/main/netCDF_CML.adoc',
        fileToCheck,
        sep=seperatorCharacter
    )

columnNames=df.columns.tolist()
print("Required Field Names:")
print(requiredFieldsLower)
print()
print("Original Field Names:")
print(columnNames)
print()
# convert lower case
columnNamesLower = convertLowerCase(columnNames)
print("Lower Cased Field Names:")
print(columnNamesLower)

Required Field Names:
['time', 'cml_id', 'sublink_id', 'site_0_lat', 'site_0_lon', 'site_1_lat', 'site_1_lon', 'frequency', 'tsl', 'rsl', 'tsl_max', 'tsl_min', 'rsl_max', 'rsl_min']

Original Field Names:
['cml_id', 'isSublink', 'nRecords', 'nNAs', 'timeOfFirstAvailValue', 'timeOfLastAvailValue', 'band', 'frequency', 'polarisation', 'site_0_lon', 'site_0_lat ', 'site_1_lon', 'site_1_lat', 'length', 'id1', 'id2', 'isPairingUnique', 'nRecords.1']

Lower Cased Field Names:
['cml_id', 'issublink', 'nrecords', 'nnas', 'timeoffirstavailvalue', 'timeoflastavailvalue', 'band', 'frequency', 'polarisation', 'site_0_lon', 'site_0_lat ', 'site_1_lon', 'site_1_lat', 'length', 'id1', 'id2', 'ispairingunique', 'nrecords.1']


In [71]:
check_result = checkColumnNames(columnNamesLower, requiredFieldsLower)

if check_result:
    print("All required fields are OK.")
else:
    print("Some required fields are missing!")
    missingOnes = getMissingFields(requiredFieldsLower, columnNamesLower)
    print()
    print("Missing fields:", missingOnes)

Some required fields are missing!

Missing fields: ['time', 'sublink_id', 'site_0_lat', 'tsl', 'rsl', 'tsl_max', 'tsl_min', 'rsl_max', 'rsl_min']


## Section Three

In [None]:
!pip install netCDF4

Collecting netCDF4
  Downloading netCDF4-1.6.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cftime (from netCDF4)
  Downloading cftime-1.6.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m69.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: cftime, netCDF4
Successfully installed cftime-1.6.3 netCDF4-1.6.5


In [None]:
from netCDF4 import Dataset
import numpy as np

In [10]:
# CSV dosyasını doğru ayırıcı ile oku

csv_dosya_yolu="/content/data_PL_202302__DIST_NAME-WALBRZYSKI__NCODE-79551.csv"
df = pd.read_csv(csv_dosya_yolu, sep=';')

# Verilerin yapılandırılması
# Örnek olarak, 'START_TIME' ve 'DEVICEID'ü boyut olarak,
# 'RSL_MAX', 'RSL_MIN', 'RSL_AVG', 'TSL_MAX', 'TSL_MIN', 'TSL_AVG' değerlerini değişken olarak kullanacağız.

# Zaman boyutunu oluşturmak için benzersiz 'START_TIME' değerlerini sıralı bir şekilde alalım
start_times_unique = pd.to_datetime(df['START_TIME']).dt.strftime('%Y-%m-%d %H:%M:%S').unique()
start_times_unique.sort()  # Zamanı sırala

#
device_ids_unique = df['DEVICEID'].unique()

# create netCDF
nc_dosyasi = '/content/converted_to_nc.nc'
with Dataset(nc_dosyasi, 'w', format='NETCDF4') as nc:
    # Boyutları tanımla
    nc.createDimension('time', len(start_times_unique))
    nc.createDimension('device', len(device_ids_unique))

    # Zaman ve cihaz ID değişkenlerini oluştur
    times = nc.createVariable('time', int, ('time',))
    devices = nc.createVariable('device', str, ('device',))

    # RSL ve TSL değişkenlerini oluştur
    rsl_max = nc.createVariable('RSL_MAX', np.float32, ('time', 'device'))
    rsl_min = nc.createVariable('RSL_MIN', np.float32, ('time', 'device'))
    rsl_avg = nc.createVariable('RSL_AVG', np.float32, ('time', 'device'))
    tsl_max = nc.createVariable('TSL_MAX', np.float32, ('time', 'device'))
    tsl_min = nc.createVariable('TSL_MIN', np.float32, ('time', 'device'))
    tsl_avg = nc.createVariable('TSL_AVG', np.float32, ('time', 'device'))

    # Verileri zaman ve cihaz ID'ye göre indeksleyip değişkenlere atama yapılır
    # Bu örnek, basitlik adına, değişkenler için doğrudan değer ataması yapmamaktadır.
    # Gerçek bir uygulamada, verilerin her bir değişken için uygun şekilde ataması gerekecektir.

    # Öznitelikler ekleyelim
    nc.description = 'Data converted from CSV file'
    times.units = 'YYYY-MM-DD HH:MM:SS'
    devices.description = 'Device ID'

nc_dosyasi

'/content/converted_to_nc.nc'