# Library and Setup

In [1]:
import datetime # Timestamps for generated rows
import os # Environment variables (database credentials)
import re # Regex and other lookup tools

import numpy as np # Mathematical operations
import pandas as pd # Data management tools
import psycopg2 # Access to SQL

In [2]:
def fetch_table_to_df(conn, query):
    """Run an SQL query and return its complete result set as a DataFrame.

    Parameters
    ----------
    conn
        An open database connection object exposing ``cursor()``; this
        notebook passes the object returned by ``psycopg2.connect``.
    query : str
        SQL statement to execute, e.g. ``"SELECT * FROM taxonomies"``.

    Returns
    -------
    pandas.DataFrame
        One row per fetched record. Column names are taken from the cursor
        description, and NaN values are replaced with ``None``.

    Notes
    -----
    The cursor is closed even if executing the query or fetching rows raises;
    the exception itself is propagated unchanged.
    """
    cur = conn.cursor()
    try:
        cur.execute(query)
        rows = cur.fetchall()
        column_names = [desc[0] for desc in cur.description]
    finally:
        # Always release the cursor, including on a failed query.
        cur.close()
    # Dict form of replace: map every NaN to None.
    return pd.DataFrame(rows, columns=column_names).replace({np.nan: None})

In [3]:
# Connection settings come from the standard libpq environment variables so
# that real credentials never have to be committed with the notebook. The
# fallbacks are the local development values this notebook used before.
conn = psycopg2.connect(
    database=os.environ.get("PGDATABASE", "testing"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", "postgres"),
    host=os.environ.get("PGHOST", "localhost"),
    port=os.environ.get("PGPORT", "5432"),
)

In [4]:
# Tables to pull from the database. Each one is loaded in full and bound to a
# notebook-level DataFrame variable with the same name as the table.
tables = (
    "tree_monitorings",
    "measurement_informations",
    "biomass_formulas",
    "tree_biomasses",
    "taxonomies",
)
for table in tables:
    table_name = table
    globals()[table_name] = fetch_table_to_df(conn, "SELECT * FROM " + table)

In [5]:
# Every table has been loaded into a DataFrame above; release the connection.
conn.close()

# Preparation

Python has a built-in `pow()`, so the stored formulas only need their `$` signs removed to become valid Python expressions.

In [6]:
# Python's built-in pow() accepts the same call syntax the stored formulas
# use, so the only change needed is to strip the '$' characters.
# regex=False is explicit because '$' is a regex end-of-string anchor and the
# default of `regex` differs between pandas versions.
biomass_formulas['formula_python'] = biomass_formulas['formula'].str.replace('$', '', regex=False)

Casting columns to explicit numeric types for the operations that follow

In [7]:
# Cast the measurement and key columns to numeric dtypes in one pass per table.
tree_monitorings = tree_monitorings.astype({
    'tree_height': 'float',
    'tree_dbh': 'float',
    'taxonomy_id': 'int',
})
taxonomies = taxonomies.astype({
    'taxonomy_id': 'int',
    'wood_density': 'float',
})

The current data contains the combinations hasHeight+noDBH, noHeight+noDBH and hasHeight+hasDBH, but no noHeight+hasDBH

In [8]:
# Count the rows in each (height present, DBH present) combination.
tree_monitorings[['tree_height','tree_dbh']].notna().value_counts()

tree_height  tree_dbh
True         False       6301
False        False       4193
True         True        3234
Name: count, dtype: int64

Standardizing tree_species names (unfinished, need to deal with NAs)

In [9]:
# Canonical species names, intended as the targets for standardising the
# free-text tree_species values.
tree_species = [
    'Suren',
    'Kopi Liberika',
    'Citrus',
    'Casuarina',
    'Other',
    'Meranti',
    'Soursop',
    'Gaharu',
    'Mango',
    'Rosewood',
    'Orange',
    'Tengkurung',
    'Durian',
    'Cajuput',
    'Jackfruit',
    'Rambutan',
    'Clove',
    'Coffee',
    'Lamtoro',
    'Meranti Bakau',
    'Meranti Bunga',
    'Asam Gelugur',
    'Avocado',
    'Cempedak',
    'Asam',
    'Tampui',
    'Sirsak',
    'Kuras',
    'Bitterbean',
    'Mentangor',
]

In [10]:

#x = [next(iter(x), np.nan) 
#          for x in map(lambda x: difflib.get_close_matches(x, tree_species, cutoff = 0.7), tree_monitorings['tree_species']) if x]

Adding a `biomass_formulas_id` column for simpler referencing in upcoming operations

In [11]:
# Boolean masks: which monitoring rows carry a DBH / height measurement.
dbh_exist = tree_monitorings['tree_dbh'].notna()
height_exist = tree_monitorings['tree_height'].notna()

# Build the taxonomy_id -> wood_density lookup once instead of rescanning
# `taxonomies` for every monitoring row. keep='first' makes duplicated
# taxonomy ids resolve to their first occurrence.
wood_density_lookup = (
    taxonomies.drop_duplicates(subset='taxonomy_id', keep='first')
    .set_index('taxonomy_id')['wood_density']
    .to_dict()
)

# One entry per monitoring row, in row order; None when the taxonomy id is
# not present in `taxonomies`.
trees_wood_density = [
    wood_density_lookup.get(tax_id)
    for tax_id in tree_monitorings['taxonomy_id']
]

In [12]:
# Choose a formula id per row from which inputs are available. Everything is
# done positionally (plain arrays), so the result does not depend on the
# DataFrame's index labels matching 0..n-1.
wood_missing = pd.isna(np.array(trees_wood_density, dtype=object))  # None or NaN
has_dbh = dbh_exist.to_numpy()
has_height = height_exist.to_numpy()
has_both = has_dbh & has_height

# Conditions are tested in order and the first match wins; rows with neither
# measurement get NaN. The ids are matched against biomass_formulas['id'] when
# the biomass is calculated.
tree_monitorings['biomass_formulas_id'] = np.select(
    [
        has_both & wood_missing,   # DBH + height, no wood density
        has_both,                  # DBH + height + wood density
        has_dbh & ~has_height,     # DBH only
        ~has_dbh & has_height,     # height only
    ],
    [7, 3, 5, 4],
    default=np.nan,
)

In [13]:
# Distribution of the assigned formula ids, including rows without one (NaN).
tree_monitorings.biomass_formulas_id.value_counts(dropna= False)

biomass_formulas_id
4.0    6301
NaN    4193
7.0    3178
3.0      56
Name: count, dtype: int64

Filtering the rows of `measurement_informations` whose `monitoring_id` also exists in `tree_monitorings`

In [None]:
# Measurements that belong to a known tree monitoring record AND have
# monitoring_type 1. The comparison needs its own parentheses: `&` binds
# tighter than `==`, so `a & b == 1` would be evaluated as `(a & b) == 1`.
type1_measurement = measurement_informations[
    measurement_informations['monitoring_id'].isin(tree_monitorings['id'])
    & (measurement_informations['monitoring_type'] == 1)
]

In [59]:
# Distribution of tree_cond values, counting missing entries as well.
tree_monitorings['tree_cond'].value_counts(dropna = False)

tree_cond
0       8011
1       3084
None    2156
2        477
Name: count, dtype: int64

In [34]:
# Keep only the measurement rows whose monitoring_id appears in tree_monitorings.
measurement = measurement_informations.loc[
    measurement_informations['monitoring_id'].isin(tree_monitorings['id'])
]

Calculation of biomass, still unoptimized

In [153]:
# Evaluate the biomass formula of every measurement row that has one.
# Outputs two parallel lists: `result` (evaluated values) and `biomass_index`
# (the monitoring id each value belongs to).
taxonomy_dict = taxonomies.set_index('taxonomy_id')['wood_density'].to_dict() # taxonomy_id -> wood_density lookup

biomass_index = [] # monitoring ids for which a biomass value was computed
result = [] # computed biomass values, parallel to biomass_index
for index_used in measurement['monitoring_id']:
    # Each lookup below scans all of tree_monitorings for the matching id and
    # takes the first hit, so the loop is quadratic overall.
    # NOTE(review): tree_dbh, tree_height and wood_density are not referenced
    # again in this cell; presumably the eval'd formula strings read them by
    # name -- confirm against biomass_formulas before renaming anything here.
    tree_dbh     = tree_monitorings.loc[tree_monitorings['id'] == index_used,'tree_dbh'].values[0]
    tree_height  = tree_monitorings.loc[tree_monitorings['id'] == index_used,'tree_height'].values[0]
    biomass_id   = tree_monitorings.loc[tree_monitorings['id'] == index_used,'biomass_formulas_id'].values[0]
    tax_id       = tree_monitorings.loc[tree_monitorings['id'] == index_used,'taxonomy_id'].values[0]
    wood_density = taxonomy_dict.get(tax_id) # None when the taxonomy id is unknown
    # `x != x` is only true for NaN: skip rows that were given no formula id.
    if (biomass_id != biomass_id or biomass_id == None):
        continue
    else:
        biomass_form = biomass_formulas.loc[biomass_formulas['id'] == biomass_id, 'formula_python']
        # One value is appended per matching formula row.
        for biomass in biomass_form:
            # SECURITY: eval() executes formula text loaded from the database;
            # this is only safe while that table is fully trusted.
            biomass_result = eval(biomass)
            result.append(biomass_result)
            biomass_index.append(index_used)
#            print("dbh:", tree_dbh,
#                  "height:", tree_height,
##                  "index:", index_used, 
 #                 "formula", biomass_form,
 #                 "result:", biomass_result)

# Creating Biomass Table/Dataframe

In [154]:
# Measurement rows for which a biomass value was computed. The explicit
# .copy() makes this an independent frame, so the column assignments in the
# following cells do not act on a slice of `measurement`
# (SettingWithCopyWarning).
tree_biomasses_python = measurement[measurement['monitoring_id'].isin(biomass_index)].copy()

In [155]:
# Discard the measurement columns that are not carried into the biomass table.
tree_biomasses_python = tree_biomasses_python.drop(
    columns=[
        'id',
        'timenow',
        'start',
        'end',
        'username',
        'notes',
        '_xform_id',
        '_xform_id_string',
        'monitoring_order',
        'month_monitoring',
        'phase',
    ],
)


In [156]:
# Attach the computed biomass values. `result` is a plain list, so it is
# assigned by position and must have exactly one entry per row, in row order.
tree_biomasses_python['result'] = result

In [157]:
# Look up each row's taxonomy_id through its monitoring_id. Indexing
# tree_monitorings by id once replaces a full scan per row; keep='first'
# resolves duplicated ids to their first occurrence.
tree_biomasses_python['taxonomy_id'] = tree_biomasses_python['monitoring_id'].map(
    tree_monitorings.drop_duplicates(subset='id', keep='first').set_index('id')['taxonomy_id']
)

In [159]:
# Look up each row's tree_species through its monitoring_id. Indexing
# tree_monitorings by id once replaces a full scan per row; keep='first'
# resolves duplicated ids to their first occurrence.
tree_biomasses_python['tree_species'] = tree_biomasses_python['monitoring_id'].map(
    tree_monitorings.drop_duplicates(subset='id', keep='first').set_index('id')['tree_species']
)

In [160]:
# Look up each row's tree_id through its monitoring_id. Indexing
# tree_monitorings by id once replaces a full scan per row; keep='first'
# resolves duplicated ids to their first occurrence.
tree_biomasses_python['tree_id'] = tree_biomasses_python['monitoring_id'].map(
    tree_monitorings.drop_duplicates(subset='id', keep='first').set_index('id')['tree_id']
)

In [161]:
# Look up each row's submission_time through its monitoring_id. Indexing
# tree_monitorings by id once replaces a full scan per row; keep='first'
# resolves duplicated ids to their first occurrence.
tree_biomasses_python['submission_time'] = tree_biomasses_python['monitoring_id'].map(
    tree_monitorings.drop_duplicates(subset='id', keep='first').set_index('id')['submission_time']
)

In [162]:
# Stamp every row with the time this notebook run produced it, formatted as
# 'YYYY-MM-DD HH:MM:SS'. `datetime` is imported in the notebook's import cell.
tree_biomasses_python['updated_at'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')


In [163]:
# Convert every biomass value to carbon: biomass * 0.5 * 3.67.
# NOTE(review): 0.5 is presumably the carbon fraction of biomass and 3.67 the
# CO2-to-carbon mass ratio (~44/12) -- confirm with the methodology.
# Missing results are kept as NaN rather than skipped, so `carbon` always has
# one entry per row and stays aligned with `result`.
carbon = [
    np.nan if pd.isna(res) else res * 0.5 * 3.67
    for res in result
]
tree_biomasses_python['carbon'] = carbon

In [164]:
# Give the rows a fresh sequential id starting at 1.
tree_biomasses_python['id'] = range(1,len(tree_biomasses_python)+1)

In [166]:
# Keep only the output columns, in their final order, then display the frame.
tree_biomasses_python = tree_biomasses_python[[
    'id',
    'monitoring_id',
    'tree_id',
    'tree_species',
    'taxonomy_id',
    'result',
    'carbon',
    'date_monitoring',
    'submission_time',
    'created_at',
    'updated_at',
]]
tree_biomasses_python

Unnamed: 0,id,monitoring_id,tree_id,tree_species,taxonomy_id,result,carbon,date_monitoring,submission_time,created_at,updated_at
582,1,506,AA04T0046,KOPI LIBERIKA,7,0.000000,0.000000,2019-06-07,2019-09-06 08:29:18,2021-08-01 19:19:22,2024-11-18 10:45:04
583,2,507,AA04T0047,KOPI LIBERIKA,7,0.000000,0.000000,2019-06-07,2019-09-06 08:29:31,2021-08-01 19:19:22,2024-11-18 10:45:04
584,3,509,AA04T0049,KOPI LIBERIKA,7,0.000000,0.000000,2019-06-07,2019-09-06 08:29:50,2021-08-01 19:19:22,2024-11-18 10:45:04
585,4,510,AA04T0050,KOPI LIBERIKA,7,0.000000,0.000000,2019-06-07,2019-09-06 08:30:00,2021-08-01 19:19:22,2024-11-18 10:45:04
586,5,511,AA04T0051,KOPI LIBERIKA,7,0.000000,0.000000,2019-06-07,2019-09-06 08:30:34,2021-08-01 19:19:22,2024-11-18 10:45:04
...,...,...,...,...,...,...,...,...,...,...,...
18791,11631,36980,CL-10-0-1843,suren,64,0.473252,0.868417,2023-08-21,2024-09-11 00:00:00,2024-09-11 21:22:56,2024-11-18 10:45:04
18792,11632,36981,CL-10-0-1844,suren,64,0.577320,1.059382,2023-08-21,2024-09-11 00:00:00,2024-09-11 21:22:56,2024-11-18 10:45:04
18793,11633,36982,CL-10-0-1845,suren,64,0.943740,1.731763,2023-08-21,2024-09-11 00:00:00,2024-09-11 21:22:56,2024-11-18 10:45:04
18794,11634,36983,CL-10-0-1846,suren,64,0.224837,0.412575,2023-08-21,2024-09-11 00:00:00,2024-09-11 21:22:56,2024-11-18 10:45:04


In [None]:
#tree_biomasses_python['tree_cond'] = [tree_monitorings.loc[tree_monitorings['id'] == mon_id, 'tree_cond'].values[0] 
#                if mon_id in tree_monitorings['id'].values else None
#                for mon_id in tree_biomasses_python['monitoring_id']]

In [None]:
#print(tree_biomasses_python['result'].value_counts(dropna= False))

result
0.000000    1504
0.005493     162
0.088504     155
0.009661     142
0.130719     137
            ... 
0.698863       1
0.827691       1
0.540066       1
0.197447       1
0.224837       1
Name: count, Length: 1926, dtype: int64


In [None]:
#original = []
#new = []
#condition = []
#for index, row in tree_biomasses_python.iterrows():
#    ids = row['monitoring_id']
#    original.append(tree_biomasses.loc[tree_biomasses['monitoring_id'] == ids, 'result'].values[0])
#    new.append(round(row['result'],5))
#    condition.append(tree_monitorings.loc[tree_monitorings['id'] == ids, 'tree_cond'])
#    print(tree_biomasses.loc[tree_biomasses['monitoring_id'] == ids, 'result'].values[0] == round(row['result'],5))

#comp = pd.DataFrame(np.column_stack([original, new, condition]))

In [177]:
# Display the biomass table computed in this notebook.
tree_biomasses_python

Unnamed: 0,id,monitoring_id,tree_id,tree_species,taxonomy_id,result,carbon,date_monitoring,submission_time,created_at,updated_at,tree_cond
582,1,506,AA04T0046,KOPI LIBERIKA,7,0.000000,0.000000,2019-06-07,2019-09-06 08:29:18,2021-08-01 19:19:22,2024-11-18 10:45:04,
583,2,507,AA04T0047,KOPI LIBERIKA,7,0.000000,0.000000,2019-06-07,2019-09-06 08:29:31,2021-08-01 19:19:22,2024-11-18 10:45:04,
584,3,509,AA04T0049,KOPI LIBERIKA,7,0.000000,0.000000,2019-06-07,2019-09-06 08:29:50,2021-08-01 19:19:22,2024-11-18 10:45:04,
585,4,510,AA04T0050,KOPI LIBERIKA,7,0.000000,0.000000,2019-06-07,2019-09-06 08:30:00,2021-08-01 19:19:22,2024-11-18 10:45:04,
586,5,511,AA04T0051,KOPI LIBERIKA,7,0.000000,0.000000,2019-06-07,2019-09-06 08:30:34,2021-08-01 19:19:22,2024-11-18 10:45:04,
...,...,...,...,...,...,...,...,...,...,...,...,...
18791,11631,36980,CL-10-0-1843,suren,64,0.473252,0.868417,2023-08-21,2024-09-11 00:00:00,2024-09-11 21:22:56,2024-11-18 10:45:04,0
18792,11632,36981,CL-10-0-1844,suren,64,0.577320,1.059382,2023-08-21,2024-09-11 00:00:00,2024-09-11 21:22:56,2024-11-18 10:45:04,0
18793,11633,36982,CL-10-0-1845,suren,64,0.943740,1.731763,2023-08-21,2024-09-11 00:00:00,2024-09-11 21:22:56,2024-11-18 10:45:04,0
18794,11634,36983,CL-10-0-1846,suren,64,0.224837,0.412575,2023-08-21,2024-09-11 00:00:00,2024-09-11 21:22:56,2024-11-18 10:45:04,0


In [178]:
# Display the tree_biomasses table as loaded from the database
# (presumably for comparison with the computed table).
tree_biomasses

Unnamed: 0,id,monitoring_id,tree_id,tree_species,taxonomy_id,result,carbon,date_monitoring,submission_time,created_at,updated_at
0,1,32192,BB-06-0-1981,Mango,63,0.13072,0.06536,2024-08-15,2024-09-11 00:00:00,2024-09-11 22:42:35,2024-09-11 22:42:35
1,2,1253,P-1-051880062,Jelutung,17,,,2019-08-25,2021-07-28 00:00:00,2024-09-11 22:42:35,2024-09-11 22:42:35
2,3,2064,P-1-051880063,Jelutung,17,,,2020-09-19,2021-07-28 00:00:00,2024-09-11 22:42:35,2024-09-11 22:42:35
3,4,1254,P-1-051880063,Jelutung,17,,,2019-08-25,2021-07-28 00:00:00,2024-09-11 22:42:35,2024-09-11 22:42:35
4,5,2065,P-1-051880064,Jelutung,17,,,2020-09-19,2021-07-28 00:00:00,2024-09-11 22:42:35,2024-09-11 22:42:35
...,...,...,...,...,...,...,...,...,...,...,...
10639,10640,37979,CL-10-0-1844,Suren,64,,,2024-08-19,2024-09-11 00:00:00,2024-09-11 22:42:55,2024-09-11 22:42:55
10640,10641,37980,CL-10-0-1845,Suren,64,,,2024-08-19,2024-09-11 00:00:00,2024-09-11 22:42:55,2024-09-11 22:42:55
10641,10642,37981,CL-10-0-1846,Suren,64,,,2024-08-19,2024-09-11 00:00:00,2024-09-11 22:42:55,2024-09-11 22:42:55
10642,10643,37982,CL-10-0-1847,Suren,64,,,2024-08-19,2024-09-11 00:00:00,2024-09-11 22:42:55,2024-09-11 22:42:55


In [None]:
#tree_biomasses_python.tree_cond.value_counts(dropna= False)

tree_cond
0       8008
None    3150
2        477
Name: count, dtype: int64

In [168]:
#comp[2].value_counts(dropna= False)

2
0       8008
None    3150
2        477
Name: count, dtype: int64

# Data Analytics

x