In [3]:
from time import time
import collections
import os
from pathlib import Path
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm, trange

In [4]:
root = str(Path(os.getcwd()).absolute().parent)
os.chdir(root)

In [5]:
root

'/Users/maximebonnin/Documents/Projects/SCOR/Datathon'

## Try to merge data after concatenate data by year

for year in [2017,2018,2019]:
    pathData = Path(f"RawData/{year}/")
    paths = [pathData / name for name in os.listdir(pathData)]
    dfs = [pd.read_excel(path) for path in paths]
    df_summary = pd.concat(dfs, ignore_index=True)
    df_summary["Year"] = year
    df_summary.to_csv(pathData / f"{year}_summary.csv", index=False)

df_test = pd.read_csv(pathData / f"{year}_summary.csv")
df_test

In [19]:
def add_key_str(df):

    df["GP"] = df["GP"].fillna("")
    df["Block"] = df["Block"].fillna("").astype(str)
    

    df["key_str"] = df["State"].astype(str) + "_" + df["District"].astype(str)
    df["key_str"] += "_" + df["Sub-District"].astype(str) + "_"
    df["key_str"] += df["Block"].astype(str) + "_"
    df["key_str"] += df["GP"].astype(str)

    df["key_str"] = df["key_str"].str.lower()

    return df

columns_to_keep = ['Season',
       'Crop', 'Area Sown (Ha)', 'Area Insured (Ha)', 'SI Per Ha (Inr/Ha)',
       'Sum Insured (Inr)', 'Indemnity Level', 'key_str', 'Loss']

yields = ['State', 'Cluster', 'District', 'Sub-District', 'Block', 'GP', 'Season',
        '2000 Yield', '2001 Yield', '2002 Yield', '2003 Yield', '2004 Yield', 
        '2005 Yield', '2006 Yield', '2007 Yield', '2008 Yield', '2009 Yield', 
        '2010 Yield', '2011 Yield', '2012 Yield', '2013 Yield', '2014 Yield', 
        '2015 Yield', '2016 Yield', '2017 Yield', '2018 Yield', 'key_str'
]

def merge_year(dfs):
    df_merged = dfs[0].copy()[columns_to_keep]
    df_merged = df_merged.merge(dfs[1][columns_to_keep], on=["Season", "key_str"],suffixes=('_2017', '_2018'))
    df_merged = df_merged.merge(dfs[2][columns_to_keep], on=["Season", "key_str"],suffixes=("_2018", '_2019'))

    return df_merged

INDEX = [
    "Area Sown (Ha)",
    "Area Insured (Ha)",
    "SI Per Ha (Inr/Ha)",
    "Sum Insured (Inr)",
    "Indemnity Level",
]
INDEX.extend([f"{year} Yield" for year in range(2000,2019)])


def main_clean(df, transformation=None):
    """
    This function cleans the dataframe and returns a cleaned dataframe.
    """

    df_new = df.copy()
    start = time()

    stats = compute_mean_by_crop(df_new)
    print(time() - start)
    start = time()

    df_new = precleaning_area_sown(df_new)
    print(time() - start)
    start = time()

    print(len(df_new))

    df_new = precleaning_yield(df_new)
    print(time() - start)
    start = time()

    print(len(df_new))

    df_new = fill_NaN(df_new, stats)
    print(time() - start)
    start = time()

    print(len(df_new))

    if transformation=="normalization":
        # Can be modified to take into account
        # other normalization methods
        df_new = normalization(df_new)

    elif transformation=="standardization":
        df_new = standardization(df_new)

    elif isinstance(transformation, collections.abc.Callable):
        df_new = transformation(df_new)

    return df_new



def compute_mean_by_crop(df):
    """
    This function computes the mean of the different index by crop.
    """

    stats = {}

    df = df.copy()
    df["Crop"] = df["Crop"].str.lower()
    
    crops = df["Crop"].unique()

    for crop in crops:
        sub_df = df.query(f"Crop == '{crop}'")
        stats[crop] = {}
        for index in INDEX:
            stats[crop][index] = {}
            try:
                N = int(len(sub_df) - pd.isna(sub_df[index]).sum())
            
            except KeyError:
                N = 0 

            if N == 0:
                stats[crop][index]["average"] = np.nan
                stats[crop][index]["N"] = 0

            else:
                stats[crop][index]["average"] = sub_df[index].mean()
                stats[crop][index]["N"] = N

    


    # for i in trange(df.shape[0]):
    #     crop = df.loc[i, "Crop"]
    #     if crop not in stats:
    #         stats[crop] = {}
    #         for index in INDEX:
    #             try:
    #                 stats[crop][index] = {"N": 0, "sum": 0} 

    #             except KeyError:
    #                 continue

    #         # stats[crop]["Area Sown (Ha)"] = {"N": 0, "sum": 0} 
    #         # stats[crop]["Area Insured (Ha)"] = {"N": 0, "sum": 0} 
    #         # stats[crop]["SI Per Ha (Inr/Ha)"] = {"N": 0, "sum": 0} 
    #         # stats[crop]["Sum Insured (Inr)"] = {"N": 0, "sum": 0} 
    #         # stats[crop]["Indemnity Level"] = {"N": 0, "sum": 0} 
            

    #         # for year in range(2000,2019):
    #         #     try:
    #         #         stats[crop][f"{year} Yield"] = {"N": 0, "sum": 0} 
    #         #     except KeyError:
    #         #         continue

    #     for index in INDEX:
    #         try:
    #             value = df.loc[i,index]

    #         except KeyError:
    #             continue

    #         if not pd.isna(value):
    #             stats[crop][index]["N"] += 1
    #             stats[crop][index]["sum"] += value


    results = {}

    # Adding mean by crop for every index
    for crop in stats.keys():
        results[crop] = {}

        for index in stats[crop].keys():
            if index != "N":
                if stats[crop][index]["N"] == 0:
                    results[crop][index] = np.nan
                else:
                    results[crop][index] = stats[crop][index]["average"]

    #Adding overall mean for every index
    results["overall"] = {}
    for crop in stats.keys():
        for index in stats[crop].keys():
            if index not in results["overall"]:
                results["overall"][index] = {}
                results["overall"][index]["N"] = 0
                results["overall"][index]["sum"] = 0  

            results["overall"][index]["N"] += stats[crop][index]["N"]
            results["overall"][index]["sum"] += stats[crop][index]["average"]*stats[crop][index]["N"]

    for index in results["overall"].keys():
        if results["overall"][index]["N"] == 0:
            results["overall"][index] = np.nan
        else:
            results["overall"][index] = results["overall"][index]["sum"] / results["overall"][index]["N"]
            
    return results

def filler(df, stats, col):
    """
    This function adds the mean of the different index by crop.
    """
    df_copy = df.copy()
    df_copy["Bool"] = df_copy[col].isna().astype(int)

    for crop in df["Crop"].unique():
        
        index = df_copy.query(f"Crop == '{crop}' and Bool == 1").index
        
        df_copy.loc[index,col] = stats[crop][col]
        
    df[col] = df_copy[col]

    return df



def precleaning_area_sown(df):
    """
    This function cleans the column Area Sown (Ha) and put NaN if the value is not numeric.
    It may contains non numeric values.
    """
    newValues = []
    for value in df["Area Sown (Ha)"]:
        try:
            value = float(value)
            newValues.append(value)
        except ValueError:
            newValues.append(np.NaN)
    df["Area Sown (Ha)"] = newValues
    return df
    

def precleaning_yield(df):
    
    """
    This function cleans all columns '20XX Yield' and put NaN if the value is not numeric.
    It may contains non numeric values.
    """
    for year in range(2000,2019):
        
        try:
            df[f"{year} Yield"]
        except KeyError:
            continue
        
        newValues = []
        for value in df[f"{year} Yield"]:
            try:
                value = float(value)
                newValues.append(value)
            except ValueError:
                newValues.append(np.NaN)
        df[f"{year} Yield"] = newValues

        
        
    return df

def fill_NaN(df, stats):

    """
    This function fills the missing vlaues for every index 
    using statistics on crop and overall statistics if needed.
    """
    df["Crop"] = df["Crop"].str.lower()

    # filling index ...
    for index in INDEX:
        try:
            df = filler(df, stats, index)

        except KeyError:
            df[index] = [np.nan] * len(df)
            continue

    # df = filler(df, stats, "Area Sown (Ha)")
    # df = filler(df, stats, "Area Insured (Ha)")
    # df = filler(df, stats, "SI Per Ha (Inr/Ha)")
    # df = filler(df, stats, "Sum Insured (Inr)")
    # df = filler(df, stats, "Indemnity Level")

    # # filling yields 
    # for year in range(2000,2019):
    #     try:
    #         df = filler(df, stats, f"{year} Yield")
    #     except KeyError:
    #         df[f"{year} Yield"] = [np.nan]*len(df)
    #         continue

    for index in stats["overall"].keys():
        if index != "N":
            try:
                df[index].fillna(stats["overall"][index], inplace=True)

            except KeyError:
                continue

    return df


def normalization(df):
    """
    This function normalizes all index.
    """
    for index in INDEX:
        try:
            df[index] = (df[index] - df[index].min()) / (df[index].max() - df[index].min())
        except KeyError:
            continue

    return df

def standardization(df):
    """
    This function standardizes all index.
    """
    for index in INDEX:
        try:
            df[index] = (df[index] - df[index].mean()) / df[index].std()
        except KeyError:
            continue

    return df

In [10]:
df = pd.read_excel("Data/RawData/2018/2018_Bihar_Kharif.xlsx")
df

Unnamed: 0,State,Cluster,District,Sub-District,Block,GP,Season,Crop,Area Sown (Ha),Area Insured (Ha),...,2007 Yield,2008 Yield,2009 Yield,2010 Yield,2011 Yield,2012 Yield,2013 Yield,2014 Yield,2015 Yield,2016 Yield
0,Bihar,3,Sheohar,,PURNAHIYA,Adauree,Kharif,Paddy,,172.528302,...,436.0,1506.0,469.0,67.0,3584.0,3086.0,1290.0,2539.0,271.0,2816.000000
1,Bihar,3,Sheohar,,PURNAHIYA,Bakhaar Chandiha,Kharif,Paddy,,172.528302,...,436.0,1506.0,469.0,67.0,3584.0,3086.0,1290.0,2539.0,271.0,3246.778261
2,Bihar,3,Sheohar,,PURNAHIYA,Basant Jagajeevan,Kharif,Paddy,,172.528302,...,436.0,1506.0,469.0,67.0,3584.0,3086.0,1290.0,2539.0,271.0,3695.000000
3,Bihar,3,Sheohar,,PURNAHIYA,Basant Pattee,Kharif,Paddy,,172.528302,...,436.0,1506.0,469.0,67.0,3584.0,3086.0,1290.0,2539.0,271.0,4078.000000
4,Bihar,3,Sheohar,,PURNAHIYA,Baraahee Jagadeesh,Kharif,Paddy,,172.528302,...,436.0,1506.0,469.0,67.0,3584.0,3086.0,1290.0,2539.0,271.0,2266.758226
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7304,Bihar,1,Siwan,,,,Kharif,Maize,,6176.000000,...,,,3416.0,1805.0,2304.0,2513.0,2602.0,2528.0,1941.0,2749.000000
7305,Bihar,1,Vaishali,,,,Kharif,Maize,,7249.000000,...,,,1738.0,2041.0,1627.0,3156.0,3637.0,3857.0,2784.0,4261.000000
7306,Bihar,1,Saharsa,,,,Kharif,Maize,,2277.000000,...,,,2822.0,3984.0,4960.0,5213.0,4840.0,4448.0,2622.0,3767.000000
7307,Bihar,1,Katihar,,,,Kharif,Maize,,6013.000000,...,,,1610.0,3218.0,3338.0,7071.0,7426.0,4556.0,7455.0,9848.000000


In [11]:
print(df.query("Crop == 'Paddy'")['2008 Yield'].mean())
sub_df = df.query("Crop == 'Paddy'")
len(sub_df) - pd.isna(sub_df["2008 Yield"]).sum()

2279.8393127147765


7275

In [20]:
df_new = main_clean(df, transformation=None)

0.00920724868774414
0.0012197494506835938
7309
0.01699519157409668
7309
0.1014249324798584
7309


In [22]:
df_new[["Crop", "Area Insured (Ha)", "SI Per Ha (Inr/Ha)", "Sum Insured (Inr)", "Indemnity Level",
 "2008 Yield", "2009 Yield", "2010 Yield", "2011 Yield", "2012 Yield", "2013 Yield", "2014 Yield", "2015 Yield", "2016 Yield"]].isna().sum()

Crop                   0
Area Insured (Ha)      0
SI Per Ha (Inr/Ha)     0
Sum Insured (Inr)      0
Indemnity Level        0
2008 Yield            28
2009 Yield             0
2010 Yield             0
2011 Yield             0
2012 Yield             0
2013 Yield             0
2014 Yield             0
2015 Yield             0
2016 Yield             0
dtype: int64

In [18]:
df_new.query("Crop != 'paddy'").index

Int64Index([7281, 7282, 7283, 7284, 7285, 7286, 7287, 7288, 7289, 7290, 7291,
            7292, 7293, 7294, 7295, 7296, 7297, 7298, 7299, 7300, 7301, 7302,
            7303, 7304, 7305, 7306, 7307, 7308],
           dtype='int64')

In [127]:
np.any(pd.isna(df_new.query("Crop == 'paddy'")["2008 Yield"].unique()))

True

In [97]:
stats = compute_mean_by_crop(df)
stats

{'paddy': {'Area Sown (Ha)': nan,
  'Area Insured (Ha)': 185.9207033511454,
  'SI Per Ha (Inr/Ha)': 49006.62106853454,
  'Sum Insured (Inr)': 9145615.626896944,
  'Indemnity Level': 0.7225518472737261,
  '2000 Yield': nan,
  '2001 Yield': nan,
  '2002 Yield': nan,
  '2003 Yield': nan,
  '2004 Yield': nan,
  '2005 Yield': nan,
  '2006 Yield': 1845.3569759450172,
  '2007 Yield': 1569.2909965635738,
  '2008 Yield': 2279.8393127147765,
  '2009 Yield': 1523.2164948453608,
  '2010 Yield': 1409.012233676976,
  '2011 Yield': 3665.4360137457043,
  '2012 Yield': 3652.83793814433,
  '2013 Yield': 2864.60206185567,
  '2014 Yield': 3692.9619243986253,
  '2015 Yield': 2862.945154639175,
  '2016 Yield': 3787.5669454532517,
  '2017 Yield': nan,
  '2018 Yield': nan},
 'maize': {'Area Sown (Ha)': nan,
  'Area Insured (Ha)': 4843.607142857143,
  'SI Per Ha (Inr/Ha)': 50121.46428571428,
  'Sum Insured (Inr)': 249914736.42857143,
  'Indemnity Level': 0.7142857142857142,
  '2000 Yield': nan,
  '2001 Yield':

In [11]:
df = precleaning_area_sown(df)
df = precleaning_yield(df)
df

Unnamed: 0,State,Cluster,District,Sub-District,Block,GP,Season,Crop,Area Sown (Ha),Area Insured (Ha),...,2007 Yield,2008 Yield,2009 Yield,2010 Yield,2011 Yield,2012 Yield,2013 Yield,2014 Yield,2015 Yield,2016 Yield
0,Chhattisgarh,1,Balod,Balod,,,Rabi,Castor,,625.142857,...,,,,342.0,276.0,220.0,130.0,349.0,,
1,Chhattisgarh,1,Balod,Dondi,,,Rabi,Castor,,625.142857,...,,,,210.0,180.0,190.0,267.0,160.0,,
2,Chhattisgarh,1,Balod,Dondilohara,,,Rabi,Castor,,625.142857,...,,,,318.0,310.0,250.0,244.0,340.0,,
3,Chhattisgarh,1,Balod,Dondilohara,,,Rabi,Castor,,625.142857,...,,,,318.0,310.0,401.0,209.0,321.0,,
4,Chhattisgarh,1,Balod,Gurur,,,Rabi,Castor,,625.142857,...,,,,103.0,274.0,114.0,191.0,169.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
617,Chhattisgarh,3,Surguja,Lakhanpur,,,Rabi,Castor,,377.500000,...,244.8,272.0,259.0,183.0,417.0,1105.0,581.0,790.0,,
618,Chhattisgarh,3,Surguja,Udaipur,,,Rabi,Castor,,377.500000,...,244.8,272.0,259.0,326.0,400.0,460.0,614.0,555.0,,
619,Chhattisgarh,3,Surguja,Lundra,,,Rabi,Castor,,377.500000,...,288.0,410.0,259.0,668.0,897.0,450.0,375.0,477.0,,
620,Chhattisgarh,3,Surguja,Lundra,,,Rabi,Castor,,377.500000,...,288.0,410.0,259.0,668.0,897.0,450.0,545.0,428.0,,


In [32]:
df_new = df.copy()
df_new = filler(df_new, stats)
df_new

Unnamed: 0,State,Cluster,District,Sub-District,Block,GP,Season,Crop,Area Sown (Ha),Area Insured (Ha),...,2011 Yield,2012 Yield,2013 Yield,2014 Yield,2015 Yield,2016 Yield,2000 Yield,2001 Yield,2017 Yield,2018 Yield
0,Chhattisgarh,1,Balod,Balod,,,Rabi,castor,,625.142857,...,276.0,220.0,130.0,349.0,,,,,,
1,Chhattisgarh,1,Balod,Dondi,,,Rabi,castor,,625.142857,...,180.0,190.0,267.0,160.0,,,,,,
2,Chhattisgarh,1,Balod,Dondilohara,,,Rabi,castor,,625.142857,...,310.0,250.0,244.0,340.0,,,,,,
3,Chhattisgarh,1,Balod,Dondilohara,,,Rabi,castor,,625.142857,...,310.0,401.0,209.0,321.0,,,,,,
4,Chhattisgarh,1,Balod,Gurur,,,Rabi,castor,,625.142857,...,274.0,114.0,191.0,169.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
617,Chhattisgarh,3,Surguja,Lakhanpur,,,Rabi,castor,,377.500000,...,417.0,1105.0,581.0,790.0,,,,,,
618,Chhattisgarh,3,Surguja,Udaipur,,,Rabi,castor,,377.500000,...,400.0,460.0,614.0,555.0,,,,,,
619,Chhattisgarh,3,Surguja,Lundra,,,Rabi,castor,,377.500000,...,897.0,450.0,375.0,477.0,,,,,,
620,Chhattisgarh,3,Surguja,Lundra,,,Rabi,castor,,377.500000,...,897.0,450.0,545.0,428.0,,,,,,


In [None]:
dfs = []

for year in range(2017,2020):
    pathData = Path(f"RawData/{year}/")
    df = pd.read_csv(pathData / f"{year}_summary.csv")
    dfs.append(df)


In [62]:
new_dfs = []
for i, df in enumerate(dfs):
    print(f"Year {2017+i} ...")
    print('Adding key...')
    add_key_str(df)
    # print(df.columns)
    print("Adding area sown...")
    df = precleaning_area_sown(df)
    print("Adding yield...")
    df = precleaning_yield(df)
    print("Adding Loss...")
    df = add_Loss(df, 2015+i)
    print("Cleaning ...")
    df = clean(df)
    print("Normalizing...")
    df = normalization_other(df)
    new_dfs.append(df)

df_merged = 
merge_year(new_dfs)
df_merged = pd.get_dummies(df_merged, columns=["Crop", "Crop_2017", "Crop_2018"])


Year 2017 ...
Adding key...
Adding area sown...
Adding yield...


  0%|          | 0/20 [00:00<?, ?it/s]

Adding Loss...
(5, 632543)
Cleaning ...
Normalizing...
Year 2018 ...
Adding key...
Adding area sown...
Adding yield...


  0%|          | 0/20 [00:00<?, ?it/s]

Adding Loss...
(5, 633324)
Cleaning ...
Normalizing...
Year 2019 ...
Adding key...
Adding area sown...
Adding yield...


  0%|          | 0/20 [00:00<?, ?it/s]

Adding Loss...
(5, 690412)
Cleaning ...
Normalizing...


In [24]:
df_2019 = new_dfs[-1].copy()
del dfs, df, new_dfs
df_summary_merged = df_merged.merge(df_2019[yields], on=["Season", "key_str"])
df_summary_merged = normalization_yield(df_summary_merged)
df_summary_merged

In [2]:
df_summary_merged.head()

NameError: name 'df_summary_merged' is not defined

## Merge years for every state and season

In [74]:
pathData = Path('Data/RawData')

In [75]:
def extract_infos(filename):
    state, season = filename.split("_")[1], filename.split("_")[-1].replace(".xlsx","")
    return state, season

def key_state_season(filename):
    filename = str(filename)
    return filename.split("_")[1]+"_"+filename.split("_")[-1].replace(".xlsx","")


def merge_DF(paths):

    dfs = [pd.read_excel(path) for path in paths]
    
    new_dfs = []
    for i, df in enumerate(dfs):
        add_key_str(df)
        # print(df.columns)
        precleaning_area_sown(df)
        precleaning_yield(df)
        # df = add_Loss(df, 2015+i)
        df = clean(df)
        df = normalization_other(df)
        new_dfs.append(df)

    df_merged = merge_year(new_dfs)
    return df_merged    

In [76]:
allStatesAndSeason = {key_state_season(x) for x in pathData.glob("*/*.xlsx")}
len(allStatesAndSeason)

30

df["Area Sown (Ha)"] = df["Area Sown (Ha)"].astype(float).fillna(-1)
    df["Area Insured (Ha)"] = df["Area Insured (Ha)"].astype(float).fillna(-1)
    df["SI Per Ha (Inr/Ha)"] = df["SI Per Ha (Inr/Ha)"].astype(float).fillna(-1)
    df["Sum Insured (Inr)"] = df["Sum Insured (Inr)"].astype(float).fillna(-1)
    df["Indemnity Level"] = df["Indemnity Level"].astype(float).fillna(-1)

In [77]:
df = pd.read_excel("/Users/maximebonnin/Documents/Projects/SCOR/Datathon/Data/RawData/2018/2018_Andhra Pradesh_Kharif.xlsx")


In [82]:
stats = compute_mean_by_crop(df)



  0%|          | 0/18735 [00:00<?, ?it/s]

AttributeError: 'str' object has no attribute 'keys'

In [69]:
add_key_str(df)
# print(df.columns)
print("Adding area sown...")
clean_area_sown(df)
print("Adding yield...")
clean_yield(df)
print("Adding Loss...")
df = add_Loss(df, 2015+i)
print("Cleaning ...")
df = clean(df)
print("Normalizing...")
df = normalization_other(df)

array(['Arhar', 'Paddy', 'Bajra', 'Castor', 'Chilli IRR', 'Chilli',
       'Cotton Un-IRR', 'Groundnut Un-IRR', 'Groundnut', 'Jowar', 'Maize',
       'Moong', 'Navane', 'Sugarcane Plant', 'Sugarcane Ratoon',
       'Sunflower', 'Urad', 'Chilli Un-IRR', 'Cotton IRR',
       'Groundnut IRR'], dtype=object)

In [68]:
df.query("Crop == 'Paddy'")["SI Per Ha (Inr/Ha)"].fillna(-1).unique()

array([70000, 80000, 71250, 75000, 62500, 72500, 67500, 65000, 40000])

In [44]:
notFull = []
for key in tqdm(allStatesAndSeason):
    paths = [x for x in pathData.glob(f"*/*_{key}.xlsx")]
    paths.sort()
    if len(paths)==3:
        print(paths)
        df = merge_DF(paths)
        print(df.head())
        df.to_csv(pathData / "Unified" / f"{key}.csv")
    else:
        notFull.append(key)
        continue

  0%|          | 0/30 [00:00<?, ?it/s]

[PosixPath('Data/RawData/2017/2017_Rajasthan_Kharif.xlsx'), PosixPath('Data/RawData/2018/2018_Rajasthan_Kharif.xlsx'), PosixPath('Data/RawData/2019/2019_Rajasthan_Kharif.xlsx')]


  0%|          | 0/20 [00:00<?, ?it/s]

(5, 28670)


  0%|          | 0/20 [00:00<?, ?it/s]

(5, 28659)


  0%|          | 0/20 [00:00<?, ?it/s]

(5, 31029)
   Season Crop_2017  Area Sown (Ha)_2017  Area Insured (Ha)_2017  \
0  kharif     arhar             1.540322               -0.946948   
1  kharif     arhar             1.540322               -0.946948   
2  kharif     arhar             1.540322               -0.946948   
3  kharif     arhar             1.540322               -0.946948   
4  kharif     arhar             1.540322               -0.946948   

   SI Per Ha (Inr/Ha)_2017  Sum Insured (Inr)_2017  Indemnity Level_2017  \
0                 1.843366               -0.874697             -0.999983   
1                 1.843366               -0.874697             -0.999983   
2                 1.843366               -0.874697             -0.999983   
3                 1.843366               -0.874697             -0.999983   
4                 1.843366               -0.874697             -0.999983   

                 key_str  Loss_2017 Crop_2018  ...  Sum Insured (Inr)_2018  \
0  rajasthan_alwar_nan__        NaN     bajra

  0%|          | 0/20 [00:00<?, ?it/s]

(5, 485)


  0%|          | 0/20 [00:00<?, ?it/s]

(5, 485)


  0%|          | 0/20 [00:00<?, ?it/s]

(5, 451)
Empty DataFrame
Columns: [Crop_2017, Area Sown (Ha)_2017, Area Insured (Ha)_2017, SI Per Ha (Inr/Ha)_2017, Sum Insured (Inr)_2017, Indemnity Level_2017, Loss_2017, Crop_2018, Area Sown (Ha)_2018, Area Insured (Ha)_2018, SI Per Ha (Inr/Ha)_2018, Sum Insured (Inr)_2018, Indemnity Level_2018, Loss_2018, Season, Crop, Area Sown (Ha), Area Insured (Ha), SI Per Ha (Inr/Ha), Sum Insured (Inr), Indemnity Level, key_str, Loss]
Index: []

[0 rows x 23 columns]
[PosixPath('Data/RawData/2017/2017_Telangana_Kharif.xlsx'), PosixPath('Data/RawData/2018/2018_Telangana_Kharif.xlsx'), PosixPath('Data/RawData/2019/2019_Telangana_Kharif.xlsx')]


  0%|          | 0/20 [00:00<?, ?it/s]

(5, 13894)


  0%|          | 0/20 [00:00<?, ?it/s]

(5, 13894)


  0%|          | 0/20 [00:00<?, ?it/s]

(5, 17928)
Empty DataFrame
Columns: [Crop_2017, Area Sown (Ha)_2017, Area Insured (Ha)_2017, SI Per Ha (Inr/Ha)_2017, Sum Insured (Inr)_2017, Indemnity Level_2017, Loss_2017, Crop_2018, Area Sown (Ha)_2018, Area Insured (Ha)_2018, SI Per Ha (Inr/Ha)_2018, Sum Insured (Inr)_2018, Indemnity Level_2018, Loss_2018, Season, Crop, Area Sown (Ha), Area Insured (Ha), SI Per Ha (Inr/Ha), Sum Insured (Inr), Indemnity Level, key_str, Loss]
Index: []

[0 rows x 23 columns]
[PosixPath('Data/RawData/2017/2017_Madhya Pradesh_Rabi.xlsx'), PosixPath('Data/RawData/2018/2018_Madhya Pradesh_Rabi.xlsx'), PosixPath('Data/RawData/2019/2019_Madhya Pradesh_Rabi.xlsx')]


  0%|          | 0/20 [00:00<?, ?it/s]

KeyError: '2015 Yield'

In [34]:
"2018" < "2097"

True

In [43]:
notFull

['Bihar_Rabi']