# Scoring System

Required files:
- `VL_farm_geo_w.csv`: A cleaned version of the raw Excel datasets

Steps:
- Initialize a new scoring function with `score = init_score(df)`, where df is the pandas dataframe loaded from `VL_farm_geo_w.csv`
    - You can also set df to be any dataframe with the same column names, but you may have to override the default values for crops, types, regions, communities, and conditions when you call `init_score`.
- To create a new score object, call `result = score(region, community)`, where region and community are optional parameters
- To get rankings of n crops, call `result.get_best_composite(n)` to get list of crops with best composite scores (includes percent diseased, condition, region, and community scores (if applicable)), `result.get_best_region(n)` to get list of crops with best region scores (if region was specified in the above step), and `result.get_best_community(n)` to get list of crops with best community scores (if community was specified in the above step).
    - By default, n is the number of unique crops in the dataset
    - Highest scoring crops are listed first in the returned list
    
Function Descriptions:
- `get_best_composite`: Ranks crops using composite scores based on condition, percent diseased, region (if applicable), and community (if applicable). The ranking of each crop corresponds to its order in the returned array (i.e. best to worst order). Uses the dictionary `comp_scores` to inform its rankings (the higher the score, the better the crop's rank).
- `get_best_region`: Ranks crops using composite scores based on region (if applicable). The ranking of each crop corresponds to its order in the returned array. Uses the dictionaries `reg_cond_scores` (for conditions) and `reg_dis_scores` (for percent diseased) to inform its rankings.
- `get_best_community`: Ranks crops using composite scores based on community (if applicable). The ranking of each crop corresponds to its order in the returned array. Uses the dictionaries `com_cond_scores` (for conditions) and `com_dis_scores` (for percent diseased) to inform its rankings.
- `get_best_type_composite`: Ranks crop types (e.g. "Veg", "Grains", etc.) using composite scores based on condition, percent diseased, region (if applicable), and community (if applicable). The ranking of each crop type corresponds to its order in the returned array. Uses the dictionary `type_comp_scores` to inform its rankings (the higher the score, the better the crop type's rank).
- `get_best_type_region`: Ranks crop types using composite scores based on region (if applicable). The ranking of each crop type corresponds to its order in the returned array. Uses the dictionaries `type_reg_cond_scores` (for conditions) and `type_reg_dis_scores` (for percent diseased) to inform its rankings.
- `get_best_type_community`: Ranks crop types using composite scores based on community (if applicable). The ranking of each crop type corresponds to its order in the returned array. Uses the dictionaries `type_com_cond_scores` (for conditions) and `type_com_dis_scores` (for percent diseased) to inform its rankings.

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot

from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE

In [None]:
# Load the farm dataset and show the distinct labels found in its
# "Condition" column (these are the keys the scoring weights must cover).
df = pd.read_csv('./data/VL_farm_geo_w.csv')
df["Condition"].unique()

array(['Bueno', '0', 'Promedio', 'Excelente', 'Pobre', 'Excel', 'crisopa'],
      dtype=object)

In [None]:
class ScoreResult:
    """Container for the score dictionaries produced by one scoring run.

    Every ``*_scores`` argument is a dictionary keyed by crop name (or by
    crop type for the ``type_*`` arguments) whose values are numeric scores;
    a higher score means a better rank. ``crops`` and ``types`` are the
    sequences of keys to rank.

    The ``get_best_*`` methods return a new list ordered from best to worst
    and never modify ``crops`` or ``types``. Items with equal scores keep
    their relative order from ``crops`` / ``types``.
    """

    def __init__(
        self,
        comp_scores,
        cond_scores,
        per_dis_scores,
        reg_cond_scores,
        reg_per_dis_scores,
        com_cond_scores,
        com_per_dis_scores,
        type_comp_scores,
        type_cond_scores,
        type_per_dis_scores,
        type_reg_cond_scores,
        type_reg_per_dis_scores,
        type_com_cond_scores,
        type_com_per_dis_scores,
        crops,
        types
    ):
        # Per-crop scores.
        self.comp_scores = comp_scores            # Overall crop composite scores
        self.cond_scores = cond_scores            # Overall crop condition scores
        self.per_dis_scores = per_dis_scores      # Overall crop percent diseased scores
        self.reg_cond_scores = reg_cond_scores    # Crop condition scores for the region passed to the score function
        self.reg_dis_scores = reg_per_dis_scores  # Crop percent diseased scores for the region passed to the score function
        self.com_cond_scores = com_cond_scores    # Crop condition scores for the community passed to the score function
        self.com_dis_scores = com_per_dis_scores  # Crop percent diseased scores for the community passed to the score function

        # Per-crop-type scores (same layout as above, keyed by crop type).
        self.type_comp_scores = type_comp_scores             # Overall crop type composite scores
        self.type_cond_scores = type_cond_scores             # Overall crop type condition scores
        self.type_per_dis_scores = type_per_dis_scores       # Overall crop type percent diseased scores
        self.type_reg_cond_scores = type_reg_cond_scores     # Crop type condition scores for the given region
        self.type_reg_dis_scores = type_reg_per_dis_scores   # Crop type percent diseased scores for the given region
        self.type_com_cond_scores = type_com_cond_scores     # Crop type condition scores for the given community
        self.type_com_dis_scores = type_com_per_dis_scores   # Crop type percent diseased scores for the given community

        self.crops = crops
        self.types = types

    @staticmethod
    def _top(items, score_of, n):
        """Return the ``n`` highest-scoring ``items`` (all of them if ``n`` is None).

        ``score_of`` maps an item to its numeric score. The key is negated
        rather than using ``reverse=True`` so that ties are resolved exactly
        as a stable ascending sort on the negated score would resolve them.
        """
        ranked = sorted(items, key=lambda item: -score_of(item))
        return ranked if n is None else ranked[:n]

    def get_best_composite(self, n=None):
        """Rank crops by composite score, best first; keep the first ``n``."""
        return self._top(self.crops, lambda c: self.comp_scores[c], n)

    def get_best_region(self, n=None):
        """Rank crops by region condition + region percent-diseased score."""
        return self._top(
            self.crops,
            lambda c: self.reg_cond_scores[c] + self.reg_dis_scores[c],
            n,
        )

    def get_best_community(self, n=None):
        """Rank crops by community condition + community percent-diseased score."""
        return self._top(
            self.crops,
            lambda c: self.com_cond_scores[c] + self.com_dis_scores[c],
            n,
        )

    def get_best_type_composite(self, n=None):
        """Rank crop types by composite score, best first; keep the first ``n``."""
        return self._top(self.types, lambda t: self.type_comp_scores[t], n)

    def get_best_type_region(self, n=None):
        """Rank crop types by region condition + region percent-diseased score."""
        return self._top(
            self.types,
            lambda t: self.type_reg_cond_scores[t] + self.type_reg_dis_scores[t],
            n,
        )

    def get_best_type_community(self, n=None):
        """Rank crop types by community condition + community percent-diseased score."""
        return self._top(
            self.types,
            lambda t: self.type_com_cond_scores[t] + self.type_com_dis_scores[t],
            n,
        )
        
def normalize(d, target=1.0):
    """Return a copy of ``d`` with its values rescaled to sum to ``target``.

    Args:
        d: Dictionary whose values are numbers.
        target: Desired sum of the returned values.

    Returns:
        A new dictionary with the same keys. If the values of ``d`` sum to
        zero (including the empty dictionary) there is no scale factor that
        can reach ``target``, so the values are returned unscaled instead of
        raising ``ZeroDivisionError``.

    Note:
        Every value is multiplied by ``target / sum``, so when the sum is
        negative the sign of every value is flipped.
    """
    raw = sum(d.values())
    if raw == 0:
        # Nothing to scale by; hand back an independent copy unchanged.
        return dict(d)
    factor = target / raw
    return {key: value * factor for key, value in d.items()}

# Weight applied to each recognised "Condition" label when averaging.
_DEFAULT_COND_WEIGHTS = {
    "good_cond": 1,
    "Bueno": 1,
    "excellent_cond": 2,
    "Excelente": 2,
    "Excel": 2,
    "average_cond": -1,
    "Promedio": -1,
    "poor_cond": -2,
    "Pobre": -2,
    "crisopa": -2,
    0: 0,
    "0": 0,
}


def _condition_score(subset, conds, cond_weights):
    """Return the weighted mean condition of ``subset`` (0 if no row matches ``conds``)."""
    counts = subset["Condition"].value_counts().to_dict()
    weighted_total = 0
    matched = 0
    for cond in conds:
        if cond in counts:
            weighted_total += counts[cond] * cond_weights[cond]
            matched += counts[cond]
    return weighted_total / matched if matched else 0


def _health_score(subset):
    """Return ``100 - mean(% Disease)`` for ``subset``, or 0 when no value is available."""
    mean_disease = subset["% Disease"].mean()
    # An empty or all-missing column yields NaN; test for that explicitly
    # instead of relying on the concrete float type pandas happens to return.
    if pd.isna(mean_disease):
        return 0
    return 100 - mean_disease


def _score_groups(frame, column, labels, conds, cond_weights):
    """Return normalized (condition, health) score dicts for each label of ``column``."""
    cond_scores = dict.fromkeys(labels, 0)
    health_scores = dict.fromkeys(labels, 0)
    for label in labels:
        subset = frame[frame[column] == label]
        cond_scores[label] = _condition_score(subset, conds, cond_weights)
        health_scores[label] = _health_score(subset)
    return normalize(cond_scores), normalize(health_scores)


def _combine(labels, score_pairs):
    """Sum every (condition, health) pair per label and normalize the totals."""
    totals = dict.fromkeys(labels, 0)
    for label in labels:
        for cond_part, health_part in score_pairs:
            totals[label] += cond_part[label] + health_part[label]
    return normalize(totals)


def init_score(df,
               crops=None,
               types=None,
               regions=None,
               communities=None,
               conds=None,
               cond_weights=None):
    """Build a scoring function bound to ``df``.

    Args:
        df: DataFrame with the columns "Crop", "Type", "Region",
            "Community", "Condition" and "% Disease".
        crops: Crop names to score; defaults to the unique values of "Crop".
        types: Crop types to score; defaults to the unique values of "Type".
        regions: Accepted region names; defaults to the unique values of "Region".
        communities: Accepted community names; defaults to the unique
            values of "Community".
        conds: Condition labels to take into account; defaults to the
            unique values of "Condition".
        cond_weights: Mapping from condition label to numeric weight;
            defaults to the built-in weight table. A label listed in
            ``conds`` that occurs in the data but has no weight raises
            ``KeyError`` during scoring.

    Returns:
        A function ``score(region=None, community=None)`` returning a
        ``ScoreResult``.
    """
    if crops is None:
        crops = list(df["Crop"].unique())
    if regions is None:
        regions = list(df["Region"].unique())
    if types is None:
        types = list(df["Type"].unique())
    if communities is None:
        communities = list(df["Community"].unique())
    if conds is None:
        conds = list(df["Condition"].unique())
    if cond_weights is None:
        cond_weights = dict(_DEFAULT_COND_WEIGHTS)

    def score(region=None, community=None):
        """Score every crop and crop type, optionally for one region and/or community.

        Raises:
            ValueError: If ``region`` or ``community`` is given but is not
                one of the accepted values.
        """
        # str() each entry so that non-string values (e.g. NaN from missing
        # cells) cannot turn the intended ValueError into a TypeError.
        if region is not None and region not in regions:
            raise ValueError(f"region is not valid, valid inputs include: {', '.join(map(str, regions))}")
        if community is not None and community not in communities:
            raise ValueError(f"community is not valid, valid inputs include: {', '.join(map(str, communities))}")

        region_df = None if region is None else df[df["Region"] == region]
        com_df = None if community is None else df[df["Community"] == community]

        def build(column, labels):
            """Return (composite, overall, regional, communal) scores for ``column``."""
            overall = _score_groups(df, column, labels, conds, cond_weights)
            pairs = [overall]

            # Without a filter the corresponding scores stay at 0 and do not
            # contribute to the composite.
            regional = (dict.fromkeys(labels, 0), dict.fromkeys(labels, 0))
            if region_df is not None:
                regional = _score_groups(region_df, column, labels, conds, cond_weights)
                pairs.append(regional)

            communal = (dict.fromkeys(labels, 0), dict.fromkeys(labels, 0))
            if com_df is not None:
                communal = _score_groups(com_df, column, labels, conds, cond_weights)
                pairs.append(communal)

            return _combine(labels, pairs), overall, regional, communal

        comp_scores, crop_overall, crop_regional, crop_communal = build("Crop", crops)
        type_comp_scores, type_overall, type_regional, type_communal = build("Type", types)

        return ScoreResult(
            comp_scores,
            crop_overall[0],
            crop_overall[1],
            crop_regional[0],
            crop_regional[1],
            crop_communal[0],
            crop_communal[1],
            type_comp_scores,
            type_overall[0],
            type_overall[1],
            type_regional[0],
            type_regional[1],
            type_communal[0],
            type_communal[1],
            crops,
            types
        )

    return score


In [None]:
# Overall ranking: no region or community is passed, so only the
# dataset-wide condition and percent-diseased scores are used.
score = init_score(df)
result = score()
print(result.get_best_composite(n = 5))  # five best crops by composite score
print(result.comp_scores)                # composite score of every crop

['Repollo', 'Remolacha', 'Caña', 'verengena', 'Calabasa']
{'Calala': 0.019424576547294303, 'Papaya': 0.020815315541324698, 'Tomate': 0.021157561316023258, 'Plátano': 0.023611730117614086, 'Jamaica': 0.020036839839589086, 'Sandia': 0.02269644440506117, 'Melon': 0.020858555143752337, 'Sandia y melon': 0.021116677705433226, 'Verenjena': 0.018272257936496845, 'Granadilla': 0.02295793095719522, 'Pina': 0.026083912387749555, 'Guayaba': 0.027209124708328904, 'Repollo': 0.031011156686250683, 'Remolacha': 0.031011156686250683, 'Caña': 0.031011156686250683, 'Chile': 0.02469618818013658, 'Chiltoma': 0.02237659867232447, 'Pepino': 0.023455533580541713, 'Yuca': 0.025748937150533134, 'Rábano': 0.023095573501596717, 'Camote': 0.02996410326911878, 'Cebolla': 0.026173293296812294, 'Ayote': 0.019734980784907066, 'Pipian': 0.02166766215731849, 'Verengena': 0.022373342732938915, 'verenjena': 0.02123865812635296, 'Verngena': 0.02087271686359375, 'verengena': 0.031011156686250683, 'Chile..': 0.0212386581263

In [None]:
# Ranking for one region and one community: the composite score now also
# includes the region- and community-specific scores.
score = init_score(df)
result2 = score(region="Troilo", community="Ceiba 1")
print(result2.get_best_composite(n = 5))  # overall + region + community
print(result2.get_best_region(n = 5))     # region scores only
print(result2.get_best_community(n = 5))  # community scores only

['Repollo', 'Remolacha', 'Caña', 'verengena', 'zanahoria']
['Repollo', 'Remolacha', 'Caña', 'verengena', 'zanahoria']
['Jamaica', 'Repollo', 'Remolacha', 'Caña', 'verengena']


In [None]:
# Crop-type rankings for one community.
score = init_score(df)
result = score(community="Ceiba 1")
print(result.get_best_type_composite())
# No region was passed, so every region score is 0 and this call simply
# returns the crop types in their original order.
print(result.get_best_type_region())
print(result.get_best_type_community())
print(result.type_comp_scores)
print(result.type_reg_cond_scores)
print(result.type_com_cond_scores)

['Veg', 'Grains', 'Legumes Seeds', 'Fruit']
['Fruit', 'Veg', 'Legumes Seeds', 'Grains']
['Veg', 'Grains', 'Fruit', 'Legumes Seeds']
{'Fruit': 0.24437862764034918, 'Veg': 0.25803020078487043, 'Legumes Seeds': 0.2451765404689487, 'Grains': 0.25241463110583173}
{'Fruit': 0, 'Veg': 0, 'Legumes Seeds': 0, 'Grains': 0}
{'Fruit': 0.2419420955157847, 'Veg': 0.2638473409353156, 'Legumes Seeds': 0.24137613879463654, 'Grains': 0.2528344247542631}
