### Imports

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from utils import add_Loss, clean_data
import umap
from sklearn.preprocessing import PowerTransformer
from tqdm.notebook import tqdm, trange
from sklearn.metrics import davies_bouldin_score
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import kmodes
from kmodes.kprototypes import KPrototypes
import plotly.graph_objects as go
import warnings
warnings.filterwarnings("ignore")

ModuleNotFoundError: No module named 'utils'

## Import and clean data 

In [2]:
#Select the dataset of one season of one year
YEAR = 2019
SEASON = "Rabi" # or "Kharif" 

#Path to the dataset
pathData_R = f"Data\RawDataUnified\RawData_{YEAR}_Rabi"
pathData_K= f"Data\RawDataUnified\RawData_{YEAR}_Kharif"


df_R = pd.read_csv(pathData_R)
df_K=pd.read_csv(pathData_K)

In [3]:
#Clean an add loss to data
df_R=add_Loss(clean_data(df_R))
df_K=add_Loss(clean_data(df_K))

df_K.head()

Unnamed: 0_level_0,Crop,Area Sown (Ha),Area Insured (Ha),SI Per Ha (Inr/Ha),Lp_2011,Lp_2012,Lp_2013,Lp_2014,Lp_2015,Lp_2016,Lp_2017,Loss
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
andhra pradesh_anantapur_agali__agali,Arhar,216.609811,0.168514,28750.0,0.843602,0.804674,0.874302,0.470271,0.840183,0.822226,0.0,14230.814316
andhra pradesh_anantapur_agali__akkagaladevarahalli,Arhar,216.609811,0.168514,28750.0,0.842151,0.802861,0.894883,0.579534,0.782517,0.820577,0.0,14464.016467
andhra pradesh_anantapur_agali__hulikeradevarahalli,Arhar,216.609811,0.168514,28750.0,0.843602,0.804674,0.879689,0.470271,0.840183,0.822226,0.0,14230.814316
andhra pradesh_anantapur_agali__inagalore,Arhar,216.609811,0.168514,28750.0,0.842151,0.802861,0.887634,0.579534,0.782517,0.820577,0.0,14464.016467
andhra pradesh_anantapur_agali__kodihalli,Arhar,216.609811,0.168514,28750.0,0.843602,0.804674,0.874302,0.470271,0.840183,0.822226,0.0,14230.814316


## Load optimal clusters

In [4]:
#Load Data preproccesed
data_R=pd.read_csv("Outputs/data_preprocessedKmeans_Rabi").set_index("key")
data_K=pd.read_csv("Outputs/data_preprocessedKmeans_Kharif").set_index("key")

# #One-Hot-Encoding
# data_R = pd.get_dummies(df_R)
# data_K = pd.get_dummies(df_K)

# #Pre-processing
# for c in tqdm(data_R.columns):
#     pt = PowerTransformer()
#     data_R.loc[:, c] = pt.fit_transform(np.array(data_R[c]).reshape(-1, 1))
# # data_R.to_csv("Outputs/data_preprocessedKmeans_Rabi")

# for c in tqdm(data_K.columns):
#     pt = PowerTransformer()
#     data_K.loc[:, c] = pt.fit_transform(np.array(data_K[c]).reshape(-1, 1))
# # data_K.to_csv("Outputs/data_preprocessedKmeans_Kharif")

In [5]:
#load df_pred for Rabi and Kharif
df_pred_R=pd.read_csv('Data/03_Prediction/GP_Pred_Rabi.csv').set_index("key")
df_pred_K=pd.read_csv('Data/03_Prediction/GP_Pred_Kharif.csv').set_index("key")


#load df_clusters for Rabi and Kharif
nb_clusters_R=7
df_clusters_R=pd.read_csv("Outputs/kmeans_labels_Rabi").set_index('key').rename(columns={'0':'Cluster'})
clust_R=np.array(df_clusters_R)
db_index = davies_bouldin_score(data_R, clust_R)
print(f"db index for Rabi with k = {nb_clusters_R} : ", db_index)

nb_clusters_K=8
df_clusters_K=pd.read_csv("Outputs/kmeans_labels_Kharif").set_index('key').rename(columns={'0':'Cluster'})
clust_K=np.array(df_clusters_K)
db_index = davies_bouldin_score(data_K, clust_K)
print(f"db index for Kharif with k = {nb_clusters_K} : ", db_index)

db index for Rabi with k = 7 :  1.7727066119292147
db index for Kharif with k = 8 :  2.150293215302705


In [218]:
random_state=43
taille_echantillon =500

def add_clusters_pred(df_pred,df_clusters, random_state=42,taille_echantillon =500):
    #df_clusters=df_clusters.reset_index().drop_duplicates(keep = 'first').set_index('key')
    df_pred=df_pred.sample(frac=1, random_state=random_state)[:taille_echantillon]

    #fill df_pred['Cluster'] with df_clusters
    for index in tqdm(df_pred.index) :
        if index in df_clusters.index and str(type(df_clusters.loc[index,'Cluster']))== "<class 'numpy.int64'>" :
            df_pred.loc[index,'Cluster']=df_clusters.loc[index,'Cluster']
        if index in df_clusters.index and str(type(df_clusters.loc[index,'Cluster']))!= "<class 'numpy.int64'>":  
            cluster=df_clusters.loc[index,'Cluster'].value_counts().argmax()
            df_pred.loc[index,'Cluster'] = cluster

    #fill NA with most seen values
    for admin in tqdm(['Block','Sub-District', 'District']) :
        for subd in pd.unique(df_pred[admin]):
            df=df_pred[df_pred[admin]==subd]['Cluster']
            if (len(df_pred[df_pred[admin]==subd]['Cluster'].value_counts())>0):
                valeur=df.value_counts().argmax()
                df_pred.loc[df_pred[admin]==subd,['Cluster']]=df.fillna(valeur)
    
    df_pred['Cluster']=df_pred['Cluster'].fillna(-1)

    return df_pred
    

#### TEST

In [221]:
test=add_clusters_pred(df_pred_R,df_clusters_R,random_state=100,taille_echantillon=2000)
test

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0_level_0,State,District,Sub-District,Block,GP,Cluster
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
uttar pradesh_azamgarh_sadar_sathiyav_mahuvamurarpur,uttar pradesh,azamgarh,sadar,sathiyav,mahuvamurarpur,0.0
maharashtra_akola_akola_borgaon manju_,maharashtra,akola,akola,borgaon manju,,-1.0
andhra pradesh_chittoor_irala__all villages,andhra pradesh,chittoor,irala,,all villages,0.0
uttar pradesh_auraiya_achhalda_ghasara_ghasara,uttar pradesh,auraiya,achhalda,ghasara,ghasara,-1.0
karnataka_bijapur_muddebihal_davalagi_madikeshwara,karnataka,bijapur,muddebihal,davalagi,madikeshwara,0.0
...,...,...,...,...,...,...
tamil nadu_thanjavur_thiruvonam_kavalipatti_91 krishnapuram,tamil nadu,thanjavur,thiruvonam,kavalipatti,91 krishnapuram,-1.0
uttar pradesh_sultanpur_motigarpur_aalapur_bairavpur,uttar pradesh,sultanpur,motigarpur,aalapur,bairavpur,0.0
uttar pradesh_bulandshahr_araniya_agaura amirpur_baragaon,uttar pradesh,bulandshahr,araniya,agaura amirpur,baragaon,0.0
uttar pradesh_muzaffarnagar_jansath_mandaur_behrassa,uttar pradesh,muzaffarnagar,jansath,mandaur,behrassa,0.0


In [222]:
print(test['Cluster'].value_counts())
print('nb of NA =', test['Cluster'].isna().sum())

 0.0    1138
-1.0     658
 1.0     187
 2.0       6
 4.0       5
 5.0       4
 3.0       2
Name: Cluster, dtype: int64
nb of NA = 0
