In [43]:
import pandas as pd

# Crop and Soil Properties Data Processing

In [44]:
# Reading the data 

crop_data_path = '../data/data_raw/cpdata.csv'
soil_data_path = '../data/data_raw/soil_properties.csv'

crop_data = pd.read_csv(crop_data_path)
soil_data = pd.read_csv(soil_data_path)

In [45]:
# Shuffle the crop dataframe
crop_data_shuffled = crop_data.sample(frac=1)

# Displaying the crop data
crop_data_shuffled.head()

Unnamed: 0,temperature,humidity,ph,rainfall,label
618,29.030176,64.491666,7.475927,54.939377,Lentil
2636,27.427117,90.026962,6.379691,21.750877,muskmelon
2405,22.860066,93.128599,5.824152,117.729673,apple
2376,17.986678,81.177121,5.777271,72.371277,grapes
827,27.560886,68.492999,6.516312,167.435808,Coffee


In [46]:
# Shuffle the soil dataframe
soil_data_shuffled = soil_data.sample(frac=1)

#Displaying the soil data
soil_data_shuffled.head()

Unnamed: 0.1,Unnamed: 0,Crop,N,P,K,pH
234,234,Onion,120,60,65,6.04
1493,1493,Lady Finger,80,10,40,6.38
482,482,Jute,80,40,40,5.58
653,653,Custard apple,40,20,40,6.62
249,249,Garlic,50,10,60,5.54


In [47]:
# Function for lowering the cases

def change_case(crop):
    crop = crop.replace(" ", "")
    crop = crop.lower()
    return crop

crop_data_shuffled['label'] = crop_data_shuffled['label'].apply(change_case)
soil_data_shuffled['Crop'] = soil_data_shuffled['Crop'].apply(change_case)

In [48]:
# Cleaning up the soil data

soil_data_shuffled['Crop'] = soil_data_shuffled['Crop'].replace('mungbeans','mungbean')
soil_data_shuffled['Crop'] = soil_data_shuffled['Crop'].replace('lentils(masoordal)','lentil')
soil_data_shuffled['Crop'] = soil_data_shuffled['Crop'].replace('pigeonpeas(toordal)','pigeonpeas')
soil_data_shuffled['Crop'] = soil_data_shuffled['Crop'].replace('mothbean(matki)','mothbeans')
soil_data_shuffled['Crop'] = soil_data_shuffled['Crop'].replace('chickpeas(channa)','chickpea')
del soil_data_shuffled['Unnamed: 0']

In [49]:
# Displaying the shuffled crop data
crop_data_shuffled.head()

Unnamed: 0,temperature,humidity,ph,rainfall,label
618,29.030176,64.491666,7.475927,54.939377,lentil
2636,27.427117,90.026962,6.379691,21.750877,muskmelon
2405,22.860066,93.128599,5.824152,117.729673,apple
2376,17.986678,81.177121,5.777271,72.371277,grapes
827,27.560886,68.492999,6.516312,167.435808,coffee


In [50]:
crop_names = crop_data_shuffled['label'].unique()
crop_names

array(['lentil', 'muskmelon', 'apple', 'grapes', 'coffee', 'sugarcane',
       'millet', 'adzukibeans', 'rice', 'jute', 'banana', 'mango',
       'wheat', 'kidneybeans', 'coconut', 'chickpea', 'peas',
       'watermelon', 'pomegranate', 'pigeonpeas', 'tobacco', 'mothbeans',
       'cotton', 'blackgram', 'mungbean', 'groundnut', 'maize', 'tea',
       'orange', 'papaya', 'rubber'], dtype=object)

In [51]:
# Displaying the shuffled soil data
soil_data_shuffled.head()

Unnamed: 0,Crop,N,P,K,pH
234,onion,120,60,65,6.04
1493,ladyfinger,80,10,40,6.38
482,jute,80,40,40,5.58
653,custardapple,40,20,40,6.62
249,garlic,50,10,60,5.54


In [52]:
crop_names_from_soil_data = soil_data_shuffled['Crop'].unique()
crop_names_from_soil_data

array(['onion', 'ladyfinger', 'jute', 'custardapple', 'garlic',
       'ashgourd', 'bayleaf', 'cloves', 'ziziphusmauritiana(bor)',
       'arecanut', 'muskmelon', 'turmeric', 'pistachionut', 'lemon',
       'potato', 'ragi(naachnnii)', 'olive', 'mango', 'cashewnuts',
       'capsicum', 'tomato', 'jambun(syzygiumcumini)', 'kidneybeans',
       'pineapple', 'carrot', 'cinnamon', 'almondnut',
       'fenugreekleaf(methi)', 'garciniaindica(kokam)', 'sesameseed',
       'watermelon', 'coconut', 'curryleaves', 'jowar(sorghum)', 'ginger',
       'raisins', 'drumstick–moringa', 'dates', 'soyabean', 'bittergourd',
       'jackfruit', 'brinjal', 'mothbeans', 'maize', 'mungbean',
       'favabeans(papdi-val)', 'mushroom', 'frenchbeans(farasbi)',
       'barley(jav)', 'cucumber', 'guava', 'pomegranate', 'ridgegourd',
       'cabbage', 'rice', 'cotton', 'apple', 'sunflower', 'cuminseeds',
       'blackgram', 'radish', 'asafoetida', 'aniseed', 'lentil', 'papaya',
       'grapes', 'corianderseeds', '

In [53]:
# Extracting relevant labels from crop_data based on crop_names in soil_data

extract_labels = []
for crop_name in crop_names_from_soil_data:
    if crop_name in crop_names:
        extract_labels.append(crop_name)

In [54]:
# using extract labesl on crop to get all the data related to those labels
new_crop_data = pd.DataFrame(columns = crop_data_shuffled.columns)
new_soil_data = pd.DataFrame(columns = soil_data_shuffled.columns)

In [55]:
for label in extract_labels:
    new_crop_data = new_crop_data.append(crop_data_shuffled[crop_data_shuffled['label'] == label])

  new_crop_data = new_crop_data.append(crop_data_shuffled[crop_data_shuffled['label'] == label])
  new_crop_data = new_crop_data.append(crop_data_shuffled[crop_data_shuffled['label'] == label])
  new_crop_data = new_crop_data.append(crop_data_shuffled[crop_data_shuffled['label'] == label])
  new_crop_data = new_crop_data.append(crop_data_shuffled[crop_data_shuffled['label'] == label])
  new_crop_data = new_crop_data.append(crop_data_shuffled[crop_data_shuffled['label'] == label])
  new_crop_data = new_crop_data.append(crop_data_shuffled[crop_data_shuffled['label'] == label])
  new_crop_data = new_crop_data.append(crop_data_shuffled[crop_data_shuffled['label'] == label])
  new_crop_data = new_crop_data.append(crop_data_shuffled[crop_data_shuffled['label'] == label])
  new_crop_data = new_crop_data.append(crop_data_shuffled[crop_data_shuffled['label'] == label])
  new_crop_data = new_crop_data.append(crop_data_shuffled[crop_data_shuffled['label'] == label])
  new_crop_data = new_crop_dat

In [56]:
for label in extract_labels:
    new_soil_data = new_soil_data.append(soil_data_shuffled[soil_data_shuffled['Crop'] == label].iloc[0])

  new_soil_data = new_soil_data.append(soil_data_shuffled[soil_data_shuffled['Crop'] == label].iloc[0])
  new_soil_data = new_soil_data.append(soil_data_shuffled[soil_data_shuffled['Crop'] == label].iloc[0])
  new_soil_data = new_soil_data.append(soil_data_shuffled[soil_data_shuffled['Crop'] == label].iloc[0])
  new_soil_data = new_soil_data.append(soil_data_shuffled[soil_data_shuffled['Crop'] == label].iloc[0])
  new_soil_data = new_soil_data.append(soil_data_shuffled[soil_data_shuffled['Crop'] == label].iloc[0])
  new_soil_data = new_soil_data.append(soil_data_shuffled[soil_data_shuffled['Crop'] == label].iloc[0])
  new_soil_data = new_soil_data.append(soil_data_shuffled[soil_data_shuffled['Crop'] == label].iloc[0])
  new_soil_data = new_soil_data.append(soil_data_shuffled[soil_data_shuffled['Crop'] == label].iloc[0])
  new_soil_data = new_soil_data.append(soil_data_shuffled[soil_data_shuffled['Crop'] == label].iloc[0])
  new_soil_data = new_soil_data.append(soil_data_shuffled[soil_d

In [57]:
new_crop_data

Unnamed: 0,temperature,humidity,ph,rainfall,label
710,24.886928,71.917115,7.319735,150.249867,jute
771,24.806250,82.092817,6.356296,156.361617,jute
704,23.584193,72.004608,6.090060,190.424216,jute
729,26.213128,81.704764,6.667633,180.123776,jute
730,24.307486,78.543410,6.186814,186.233757,jute
...,...,...,...,...,...
862,26.350342,58.506502,7.460175,121.558630,coffee
815,25.177877,62.262446,6.647766,135.011965,coffee
847,27.745770,54.369761,7.205079,139.861943,coffee
829,23.177144,52.138640,6.959404,117.311356,coffee


In [58]:
new_soil_data

Unnamed: 0,Crop,N,P,K,pH
482,jute,80,40,40,5.58
940,muskmelon,100,10,50,5.68
1711,mango,20,20,30,4.84
1079,kidneybeans,20,60,20,5.46
1521,watermelon,100,10,50,5.38
573,coconut,20,10,30,5.1
208,mothbeans,20,40,20,5.54
1264,maize,80,40,20,5.42
1276,mungbean,20,40,20,5.42
642,pomegranate,20,10,40,5.62


In [59]:
new_crop_data.to_csv('../data/data_raw/crop_data_merged.csv')
new_soil_data.to_csv('../data/data_raw/soil_data_merged.csv')