In [2]:
import os
from pathlib import Path

In [3]:
root = str(Path(os.getcwd()).absolute().parent)
os.chdir(root)

In [4]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm, trange

from clean import main_clean


In [3]:
root

'/Users/maximebonnin/Documents/Projects/SCOR/Datathon'

In [5]:
pathData = Path('Data/RawData')

In [172]:
def extract_infos(filename):
    state, season = filename.split("_")[1], filename.split("_")[-1].replace(".xlsx","")
    return state, season

def key_state_season(filename):
    filename = str(filename)
    return filename.split("_")[1]+"_"+filename.split("_")[-1].replace(".xlsx","")


allcolumns = ['State', 'Cluster', 'District', 'Sub-District', 'Block', 'GP', 'Season',
       'Crop', 'Area Sown (Ha)', 'Area Insured (Ha)', 'SI Per Ha (Inr/Ha)',
       'Sum Insured (Inr)', 'Indemnity Level', '2000 Yield', '2001 Yield', '2002 Yield', '2003 Yield',
       '2004 Yield', '2005 Yield', '2006 Yield', '2007 Yield', '2008 Yield',
       '2009 Yield', '2010 Yield', '2011 Yield', '2012 Yield', '2013 Yield',
       '2014 Yield', '2015 Yield', '2016 Yield',
       '2017 Yield', '2018 Yield']

columns_to_keep = ['Season',
       'Crop', 'Area Sown (Ha)', 'Area Insured (Ha)', 'SI Per Ha (Inr/Ha)',
       'Sum Insured (Inr)', 'Indemnity Level', 'key_str']

yields = ['State', 'Cluster', 'District', 'Sub-District', 'Block', 'GP', 'Season',
        '2000 Yield', '2001 Yield', '2002 Yield', '2003 Yield', '2004 Yield', 
        '2005 Yield', '2006 Yield', '2007 Yield', '2008 Yield', '2009 Yield', 
        '2010 Yield', '2011 Yield', '2012 Yield', '2013 Yield', '2014 Yield', 
        '2015 Yield', '2016 Yield', '2017 Yield', '2018 Yield', 'key_str'
]


def add_key_str(df):

    df["GP"] = df["GP"].fillna("")
    df["Block"] = df["Block"].fillna("").astype(str)
    

    df["key_str"] = df["State"].astype(str) + "_" + df["District"].astype(str)
    df["key_str"] += "_" + df["Sub-District"].astype(str) + "_"
    df["key_str"] += df["Block"].astype(str) + "_"
    df["key_str"] += df["GP"].astype(str)

    df["key_str"] = df["key_str"].str.lower()

    return df


def empty_df(keys, season):
    df = pd.DataFrame(columns=allcolumns)
    df["key_str"] = list(keys)
    df["State"] = df["key_str"].apply(lambda x: x.split("_")[0])
    df["District"] = df["key_str"].apply(lambda x: x.split("_")[1])
    df["Sub-District"] = df["key_str"].apply(lambda x: x.split("_")[2])
    df["Block"] = df["key_str"].apply(lambda x: x.split("_")[3])
    df["GP"] = df["key_str"].apply(lambda x: x.split("_")[4])

    df["Season"] = season

    return df

def store_years(pathData, state,season):
    paths = [pathData / f"{year}" / f"{year}_{state}_{season}.xlsx" for year in range(2017,2020)]

    indices = []
    df_years = []
    for i, path in enumerate(paths):
        if os.path.exists(path):
            df_years.append(pd.read_excel(path))
        else:
            df_years.append(None)
            indices.append(i)

    return df_years, indices
        
        
def adding_key_str_to_empty_df(df_years, index):
    keys = set()
    season = None
    for df in df_years:
        if not df is None:
            for key in df["key_str"].unique():
                keys.add(key)
            if season is None:
                season = df["Season"].unique()[0]

    for index in indices:
        df_years[index] = empty_df(keys, season)          

    return df_years

def cleaning_years(df_years, indices):
    for i in range(3):
        if not i in indices:
            df_years[i] = main_clean(df_years[i], normalization=False)
            df_years[i] = add_key_str(df_years[i])

    return df_years


def merge_years_no_yields(df_years):
    df_merged = df_years[0].copy()[columns_to_keep]
    df_inter = df_merged.merge(df_years[1][columns_to_keep], how='outer', on=["Season", "key_str"],suffixes=('_2017', '_2018'))
    print(df_merged.shape)
    df_merged = df_inter.merge(df_years[2][columns_to_keep],  how='outer', on=["Season", "key_str"],suffixes=("_2018", '_2019'))
    print(df_merged.shape)
    return df_inter, df_merged

def adding_yields(df_years, df_merged):
    # We assume that df_years[0] is 2017
    # We assume that df_years[1] is 2018
    # We assume that df_years[2] is 2019
    df_merged = df_merged.merge(df_years[-1][yields], how="inner", on=["Season", "key_str"])
    return df_merged

In [173]:
allStatesAndSeason = {key_state_season(x) for x in pathData.glob("*/*.xlsx")}
print(len(allStatesAndSeason))
allStatesAndSeason = {key_state_season(x) for x in pathData.glob("2017/*.xlsx")}
print(len(allStatesAndSeason))
allStatesAndSeason = {key_state_season(x) for x in pathData.glob("2018/*.xlsx")}
print(len(allStatesAndSeason))
allStatesAndSeason = {key_state_season(x) for x in pathData.glob("2019/*.xlsx")}
print(len(allStatesAndSeason))

30
26
27
27


In [163]:
for state_season in {key_state_season(x) for x in pathData.glob("*/*.xlsx")}:
    for year in range(2017,2020):
        if not state_season in {key_state_season(x) for x in pathData.glob(f"{year}/{year}*.xlsx")}:
            print(f"En {year} : {state_season}")

    

En 2017 : Tamil Nadu_Rabi
En 2018 : Tamil Nadu_Rabi
En 2017 : Odisha_Rabi
En 2018 : Odisha_Rabi
En 2019 : Bihar_Rabi
En 2017 : Uttarakhand_Kharif
En 2018 : Uttarakhand_Kharif
En 2019 : Jharkhand_Rabi
En 2019 : Bihar_Kharif
En 2017 : Chhattisgarh_Kharif


In [134]:
allStatesAndSeason = list({key_state_season(x) for x in pathData.glob("*/*.xlsx")})
print(len(allStatesAndSeason))
allStatesAndSeason[0:4]

30


['Jharkhand_Kharif', 'Rajasthan_Kharif', 'Chhattisgarh_Rabi', 'Gujarat_Kharif']

In [135]:
state, season =  "Chhattisgarh_Kharif".split("_") #allStatesAndSeason[0].split("_")
state, season

('Chhattisgarh', 'Kharif')

In [136]:
df_years, indices = store_years(pathData, state,season)
df_years = cleaning_years(df_years, indices)
df_years = adding_key_str_to_empty_df(df_years, indices)
# df_years

100%|██████████| 792/792 [00:01<00:00, 727.68it/s]
100%|██████████| 22776/22776 [00:33<00:00, 679.55it/s]


In [181]:
len(df_years[-1]["key_str"].unique())

14589

In [149]:
df_years[1].head()

Unnamed: 0,State,Cluster,District,Sub-District,Block,GP,Season,Crop,Area Sown (Ha),Area Insured (Ha),...,2012 Yield,2013 Yield,2014 Yield,2015 Yield,2016 Yield,2000 Yield,2001 Yield,2017 Yield,2018 Yield,key_str
0,Chhattisgarh,2,Balod,DONDILOHARA,Khertha,,Kharif,arhar,680.0,,...,600.0,309.0,578.0,,,,,,,chhattisgarh_balod_dondilohara_khertha_
1,Chhattisgarh,3,Balrampur,RAMCHANDRAPUR,Ramchandrapur,,Kharif,arhar,1223.714286,,...,859.0,131.0,712.0,,,,,,,chhattisgarh_balrampur_ramchandrapur_ramchandr...
2,Chhattisgarh,3,Balrampur,BALRAMPUR,Balrampur,,Kharif,arhar,1223.714286,,...,497.0,404.0,712.0,,,,,,,chhattisgarh_balrampur_balrampur_balrampur_
3,Chhattisgarh,3,Balrampur,WADRAFNAGAR,Badrafnagar,,Kharif,arhar,1223.714286,,...,173.0,380.0,712.0,,,,,,,chhattisgarh_balrampur_wadrafnagar_badrafnagar_
4,Chhattisgarh,3,Balrampur,WADRAFNAGAR,Raghunath Nagar,,Kharif,arhar,1223.714286,,...,322.0,402.0,712.0,,,,,,,chhattisgarh_balrampur_wadrafnagar_raghunath n...


In [178]:
df_inter, df_merged = merge_years_no_yields(df_years)
df_inter
df_merged

(14967, 8)
(23555, 22)


Unnamed: 0,Season_2017,Crop_2017,Area Sown (Ha)_2017,Area Insured (Ha)_2017,SI Per Ha (Inr/Ha)_2017,Sum Insured (Inr)_2017,Indemnity Level_2017,key_str,Season_2018,Crop_2018,...,SI Per Ha (Inr/Ha)_2018,Sum Insured (Inr)_2018,Indemnity Level_2018,Season,Crop,Area Sown (Ha),Area Insured (Ha),SI Per Ha (Inr/Ha),Sum Insured (Inr),Indemnity Level
0,Kharif,,,,,,,chhattisgarh_kanker_pakhanjur__rada,,,...,,,,Kharif,paddy un-irr,,201.888426,30000.0,6.056653e+06,0.8
1,Kharif,,,,,,,chhattisgarh_gariyaband_bindranavagarh(gariyab...,,,...,,,,Kharif,paddy un-irr,,165.554236,30000.0,4.966627e+06,0.8
2,Kharif,,,,,,,chhattisgarh_surajpur_premnagar__goragaanv,,,...,,,,Kharif,paddy un-irr,,148.682773,27000.0,4.014435e+06,0.8
3,Kharif,,,,,,,chhattisgarh_kabirdham_kawardha__kotachhaal,,,...,,,,Kharif,paddy un-irr,,132.997639,34500.0,4.588419e+06,0.8
4,Kharif,,,,,,,chhattisgarh_bilaspur_takhatpur__katakona,,,...,,,,Kharif,paddy un-irr,,75.244898,31250.0,2.351403e+06,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23550,Kharif,,,,,,,chhattisgarh_korba_kartala__sutar,,,...,,,,Kharif,paddy un-irr,,74.355275,31250.0,2.323602e+06,0.8
23551,Kharif,,,,,,,chhattisgarh_mungeli_pathariya__borada,,,...,,,,Kharif,paddy un-irr,,79.008414,31250.0,2.469013e+06,0.8
23552,Kharif,,,,,,,chhattisgarh_bilaspur_pendra__babee,,,...,,,,Kharif,paddy un-irr,,75.244898,31250.0,2.351403e+06,0.8
23553,Kharif,,,,,,,chhattisgarh_mungeli_lormi__airaadabaan,,,...,,,,Kharif,paddy un-irr,,79.008414,31250.0,2.469013e+06,0.8


In [146]:
len(df_years[2])

22776

In [179]:
df_final = adding_yields(df_years, df_merged)

In [171]:
len(df_final["key_str"].unique())

14589

In [180]:
df_final

Unnamed: 0,Season_2017,Crop_2017,Area Sown (Ha)_2017,Area Insured (Ha)_2017,SI Per Ha (Inr/Ha)_2017,Sum Insured (Inr)_2017,Indemnity Level_2017,key_str,Season_2018,Crop_2018,...,2009 Yield,2010 Yield,2011 Yield,2012 Yield,2013 Yield,2014 Yield,2015 Yield,2016 Yield,2017 Yield,2018 Yield
0,Kharif,,,,,,,chhattisgarh_kanker_pakhanjur__rada,,,...,1179.0,2430.0,2592.0,4800.0,4286.0,4127.0,1617.0,5511.0,4810.000000,
1,Kharif,,,,,,,chhattisgarh_gariyaband_bindranavagarh(gariyab...,,,...,602.0,2111.0,1344.0,2127.0,2245.0,2509.0,873.0,2778.0,1990.337893,
2,Kharif,,,,,,,chhattisgarh_surajpur_premnagar__goragaanv,,,...,1971.0,1533.0,1629.0,2767.0,3438.0,2982.0,1867.0,3673.0,4680.000000,
3,Kharif,,,,,,,chhattisgarh_kabirdham_kawardha__kotachhaal,,,...,491.0,2385.0,2473.0,2823.0,3917.0,1976.0,1109.0,2793.0,1926.000000,
4,Kharif,,,,,,,chhattisgarh_bilaspur_takhatpur__katakona,,,...,930.0,2811.0,3064.0,3492.0,3533.0,2785.0,1271.0,3479.0,2025.000000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
786154,Kharif,,,,,,,chhattisgarh_korba_kartala__sutar,,,...,2033.0,2720.0,3626.0,3100.0,2671.0,2398.0,1118.0,2468.0,2090.000000,
786155,Kharif,,,,,,,chhattisgarh_mungeli_pathariya__borada,,,...,729.0,2177.0,2500.0,2536.0,2826.0,2712.0,2185.0,4087.0,482.000000,
786156,Kharif,,,,,,,chhattisgarh_bilaspur_pendra__babee,,,...,815.0,2574.0,2171.0,2542.0,1555.0,2785.0,1271.0,3400.0,1400.000000,
786157,Kharif,,,,,,,chhattisgarh_mungeli_lormi__airaadabaan,,,...,658.0,1665.0,2529.0,2162.0,2645.0,2712.0,2185.0,2837.0,2401.000000,


In [177]:
786159/23555

33.37546168541711