In [82]:
import os
import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics.pairwise import euclidean_distances
import random

# Visualization
import matplotlib
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from plotly import graph_objects as go
from sklearn.neighbors import NearestNeighbors

#use streamlit for deployment

In [83]:
fertilizer_df= pd.read_csv("/content/Fertilizer.csv")
crop_df=pd.read_csv("/content/Extended_Crop_Recommendation.csv")
season_df=pd.read_csv("/content/data_season.csv")
yield_df=pd.read_csv("/content/crop_yield.csv")
tunisia_df=pd.read_csv("/content/Tunisie_Sol.csv")

In [84]:
fertilizer_df.head(5)

Unnamed: 0,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,37,0,0,Urea
1,12,0,36,DAP
2,7,9,30,Fourteen-Thirty Five-Fourteen
3,22,0,20,Twenty Eight-Twenty Eight
4,35,0,0,Urea


In [85]:
crop_df.head()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice


In [86]:
yield_df.head()

Unnamed: 0,Crop,Crop_Year,Season,State,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Yield
0,Arecanut,1997,Whole Year,Assam,73814.0,56708,2051.4,7024878.38,22882.34,0.796087
1,Arhar/Tur,1997,Kharif,Assam,6637.0,4685,2051.4,631643.29,2057.47,0.710435
2,Castor seed,1997,Kharif,Assam,796.0,22,2051.4,75755.32,246.76,0.238333
3,Coconut,1997,Whole Year,Assam,19656.0,126905000,2051.4,1870661.52,6093.36,5238.051739
4,Cotton(lint),1997,Kharif,Assam,1739.0,794,2051.4,165500.63,539.09,0.420909


In [87]:
crop_df_crops = crop_df['label'].unique()
yield_df_crops = yield_df['Crop'].unique()

crop_df_crops, season_df_crops, yield_df_crops

(array(['rice', 'maize', 'chickpea', 'kidneybeans', 'pigeonpeas',
        'mothbeans', 'mungbean', 'blackgram', 'lentil', 'pomegranate',
        'banana', 'mango', 'grapes', 'watermelon', 'muskmelon', 'apple',
        'orange', 'papaya', 'coconut', 'cotton', 'jute', 'coffee', 'Wheat',
        'Barley', 'green gram', 'peas', 'Horse-gram', 'Groundnut',
        'Sesamum', 'Sunflower', 'Potato', 'Sweet potato', 'Garlic',
        'Onion', 'Coriander', 'Tobacco', 'Urad'], dtype=object),
 array(['Coconut', 'Cocoa', 'Coffee', 'Cardamum', 'Pepper', 'Arecanut',
        'Ginger', 'Tea', 'Paddy', 'Groundnut', 'Blackgram', 'Cashew',
        'Cotton'], dtype=object),
 array(['Arecanut', 'Arhar/Tur', 'Castor seed', 'Coconut ', 'Cotton(lint)',
        'Dry chillies', 'Gram', 'Jute', 'Linseed', 'Maize', 'Mesta',
        'Niger seed', 'Onion', 'Other  Rabi pulses', 'Potato',
        'Rapeseed &Mustard', 'Rice', 'Sesamum', 'Small millets',
        'Sugarcane', 'Sweet potato', 'Tapioca', 'Tobacco', 'Turme

In [88]:
tunisia_df.head()

Unnamed: 0,system:index,ADM0_CODE,ADM0_NAME,ADM1_CODE,ADM1_NAME,DISP_AREA,EXP1_YEAR,Nitrogen,Phosphorus,Potassium,Profondeur_cm,STATUS,STR1_YEAR,Shape_Area,Shape_Leng,pH,.geo
0,1_1_1_1_1_0000000000000000086a,248,Tunisia,2993,Ariana,NO,3000,73.406243,22.222009,45.421008,0,Member State,1000,0.045516,1.494669,65.196538,"{""type"":""MultiPolygon"",""coordinates"":[[[[10.19..."
1,1_1_1_1_1_0000000000000000086b,248,Tunisia,2994,Ben Arous,NO,3000,77.959755,23.461844,45.62387,0,Member State,1000,0.066773,1.817406,65.518249,"{""type"":""MultiPolygon"",""coordinates"":[[[[10.28..."
2,1_1_1_1_1_0000000000000000086c,248,Tunisia,2995,Bizerte,NO,3000,87.015902,23.211956,45.582688,0,Member State,1000,0.359975,5.60906,63.283688,"{""type"":""MultiPolygon"",""coordinates"":[[[[9.840..."
3,1_1_1_1_1_0000000000000000086d,248,Tunisia,2996,Beja,NO,3000,81.28308,21.718121,45.04004,0,Member State,1000,0.361929,4.07154,64.468094,"{""type"":""MultiPolygon"",""coordinates"":[[[[8.986..."
4,1_1_1_1_1_0000000000000000086e,248,Tunisia,2997,Gabes,NO,3000,25.404086,10.264953,21.478236,0,Member State,1000,0.727515,4.66162,68.484813,"{""type"":""MultiPolygon"",""coordinates"":[[[[10.29..."


Traitement des données

Traitement de la table tunise_sol

In [89]:
cols_to_drop = ['ADM0_NAME','system:index', 'ADM0_CODE', 'ADM1_CODE', 'DISP_AREA',
                'EXP1_YEAR', 'STATUS', 'STR1_YEAR', '.geo']
tunisia_df_clean = tunisia_df.drop(columns=cols_to_drop)


In [90]:
tunisia_df_clean = tunisia_df_clean[tunisia_df_clean['Profondeur_cm'].isin([0, 10])]

tunisia_df_clean.head()

Unnamed: 0,ADM1_NAME,Nitrogen,Phosphorus,Potassium,Profondeur_cm,Shape_Area,Shape_Leng,pH
0,Ariana,73.406243,22.222009,45.421008,0,0.045516,1.494669,65.196538
1,Ben Arous,77.959755,23.461844,45.62387,0,0.066773,1.817406,65.518249
2,Bizerte,87.015902,23.211956,45.582688,0,0.359975,5.60906,63.283688
3,Beja,81.28308,21.718121,45.04004,0,0.361929,4.07154,64.468094
4,Gabes,25.404086,10.264953,21.478236,0,0.727515,4.66162,68.484813


In [91]:
tunisia_df_clean = tunisia_df_clean.rename(columns={'ADM1_NAME': 'gouvernerat'})
tunisia_df_clean.head()

Unnamed: 0,gouvernerat,Nitrogen,Phosphorus,Potassium,Profondeur_cm,Shape_Area,Shape_Leng,pH
0,Ariana,73.406243,22.222009,45.421008,0,0.045516,1.494669,65.196538
1,Ben Arous,77.959755,23.461844,45.62387,0,0.066773,1.817406,65.518249
2,Bizerte,87.015902,23.211956,45.582688,0,0.359975,5.60906,63.283688
3,Beja,81.28308,21.718121,45.04004,0,0.361929,4.07154,64.468094
4,Gabes,25.404086,10.264953,21.478236,0,0.727515,4.66162,68.484813


Fusion des fichiers

In [92]:
# Appliquer la même logique avec le bon nom de colonne

yield_df['Crop_clean'] = yield_df['Crop'].str.lower().str.strip()

# Créer un dictionnaire Crop_clean -> Season
season_dict = yield_df.drop_duplicates(subset='Crop_clean').set_index('Crop_clean')['Season'].to_dict()

# Mapper les saisons sur crop_df
crop_df['label_clean'] = crop_df['label'].str.lower().str.strip()
crop_df['Season'] = crop_df['label_clean'].map(season_dict)

# Supprimer la colonne temporaire
crop_df.drop(columns=['label_clean'], inplace=True)

# Afficher le résultat
crop_df

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label,Season
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice,Autumn
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice,Autumn
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice,Autumn
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice,Autumn
4,78,42,42,20.130175,81.604873,7.628473,262.717340,rice,Autumn
...,...,...,...,...,...,...,...,...,...
2328,39,27,28,27.940000,46.830000,7.140000,563.960000,Urad,Kharif
2329,25,25,28,25.540000,41.600000,6.050000,576.440000,Urad,Kharif
2330,37,22,28,25.890000,42.530000,6.230000,494.140000,Urad,Kharif
2331,41,26,24,29.530000,59.060000,7.200000,460.000000,Urad,Kharif


dictionnaire irrigation

In [93]:
irrigation_methods = {
    'rice': 'Flood',
    'maize': 'Sprinkler',
    'chickpea': 'Drip',
    'kidneybeans': 'Drip',
    'pigeonpeas': 'Drip',
    'mothbeans': 'Drip',
    'mungbean': 'Drip',
    'blackgram': 'Drip',
    'lentil': 'Drip',
    'pomegranate': 'Drip',
    'banana': 'Sprinkler',
    'mango': 'Drip',
    'grapes': 'Drip',
    'watermelon': 'Sprinkler',
    'muskmelon': 'Sprinkler',
    'apple': 'Sprinkler',
    'orange': 'Drip',
    'papaya': 'Sprinkler',
    'coconut': 'Flood',
    'cotton': 'Drip',
    'jute': 'Flood',
    'coffee': 'Sprinkler',
    'wheat': 'Sprinkler',
    'barley': 'Sprinkler',
    'green gram': 'Drip',
    'peas': 'Drip',
    'horse-gram': 'Drip',
    'groundnut': 'Drip',
    'sesamum': 'Drip',
    'sunflower': 'Drip',
    'potato': 'Sprinkler',
    'sweet potato': 'Sprinkler',
    'garlic': 'Drip',
    'onion': 'Drip',
    'coriander': 'Drip',
    'tobacco': 'Sprinkler',
    'urad': 'Drip'
}

In [94]:
crop_df['Irrigation'] = crop_df['label'].str.lower().map(irrigation_methods)

In [95]:
crop_df

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label,Season,Irrigation
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice,Autumn,Flood
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice,Autumn,Flood
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice,Autumn,Flood
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice,Autumn,Flood
4,78,42,42,20.130175,81.604873,7.628473,262.717340,rice,Autumn,Flood
...,...,...,...,...,...,...,...,...,...,...
2328,39,27,28,27.940000,46.830000,7.140000,563.960000,Urad,Kharif,Drip
2329,25,25,28,25.540000,41.600000,6.050000,576.440000,Urad,Kharif,Drip
2330,37,22,28,25.890000,42.530000,6.230000,494.140000,Urad,Kharif,Drip
2331,41,26,24,29.530000,59.060000,7.200000,460.000000,Urad,Kharif,Drip


In [96]:
soil_features = ['Nitrogen', 'Phosphorus', 'Potassium', 'pH']
crop_features = ['N', 'P', 'K', 'ph']

tunisia_values = tunisia_df_clean[soil_features].values
crop_values = crop_df[crop_features].values

# Calcul de la distance euclidienne
distances = euclidean_distances(crop_values, tunisia_values)
closest_indices = np.argmin(distances, axis=1)

# Ajout du gouvernerat à crop_df
crop_df['gouvernerat'] = tunisia_df_clean.iloc[closest_indices]['gouvernerat'].values

In [97]:
crop_df

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label,Season,Irrigation,gouvernerat
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice,Autumn,Flood,Bizerte
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice,Autumn,Flood,Bizerte
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice,Autumn,Flood,Tunis
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice,Autumn,Flood,Bizerte
4,78,42,42,20.130175,81.604873,7.628473,262.717340,rice,Autumn,Flood,Bizerte
...,...,...,...,...,...,...,...,...,...,...,...
2328,39,27,28,27.940000,46.830000,7.140000,563.960000,Urad,Kharif,Drip,Mahdia
2329,25,25,28,25.540000,41.600000,6.050000,576.440000,Urad,Kharif,Drip,Sfax
2330,37,22,28,25.890000,42.530000,6.230000,494.140000,Urad,Kharif,Drip,Sidi Bouz
2331,41,26,24,29.530000,59.060000,7.200000,460.000000,Urad,Kharif,Drip,Mahdia


Nettoyage des bases de données

In [98]:
crop_df_crops = crop_df['label'].unique()
crop_df_crops

array(['rice', 'maize', 'chickpea', 'kidneybeans', 'pigeonpeas',
       'mothbeans', 'mungbean', 'blackgram', 'lentil', 'pomegranate',
       'banana', 'mango', 'grapes', 'watermelon', 'muskmelon', 'apple',
       'orange', 'papaya', 'coconut', 'cotton', 'jute', 'coffee', 'Wheat',
       'Barley', 'green gram', 'peas', 'Horse-gram', 'Groundnut',
       'Sesamum', 'Sunflower', 'Potato', 'Sweet potato', 'Garlic',
       'Onion', 'Coriander', 'Tobacco', 'Urad'], dtype=object)

In [99]:
tunisia_crops = [
    'wheat', 'barley', 'maize', 'chickpea', 'lentil', 'peas', 'green gram',
    'potato', 'sweet potato', 'garlic', 'onion', 'coriander',
    'orange', 'apple', 'grapes', 'pomegranate', 'watermelon',
    'muskmelon', 'olive', 'citrus', 'mango', 'sunflower', 'groundnut',
    'tobacco',
]

In [100]:
# Normaliser les cultures pour la comparaison (tout en minuscule)
tunisia_crops_set = set([c.lower() for c in tunisia_crops])
crop_df = crop_df[crop_df['label'].str.lower().isin(tunisia_crops_set)]

In [101]:
crop_df

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label,Season,Irrigation,gouvernerat
100,71,54,16,22.613600,63.690706,5.749914,87.759539,maize,Kharif,Sprinkler,Manouba
101,61,44,17,26.100184,71.574769,6.931757,102.266244,maize,Kharif,Sprinkler,Le Kef
102,80,43,16,23.558821,71.593514,6.657965,66.719955,maize,Kharif,Sprinkler,Bizerte
103,73,58,21,19.972160,57.682729,6.596061,60.651715,maize,Kharif,Sprinkler,Bizerte
104,61,38,20,18.478913,62.695039,5.970458,65.438354,maize,Kharif,Sprinkler,Le Kef
...,...,...,...,...,...,...,...,...,...,...,...
2321,49,36,62,22.970000,60.110000,5.790000,529.810000,Tobacco,Whole Year,Sprinkler,Tunis
2322,44,30,68,23.840000,67.780000,5.820000,657.570000,Tobacco,Whole Year,Sprinkler,Ariana
2323,50,44,66,29.580000,63.220000,6.350000,590.190000,Tobacco,Whole Year,Sprinkler,Tunis
2324,43,38,63,26.120000,67.570000,6.420000,524.140000,Tobacco,Whole Year,Sprinkler,Tunis


In [102]:
# Define the mapping dictionary
season_mapping = {
    "Kharif": "Summer",
    "Rabi": "Winter"
}

# Strip any spaces or hidden characters from 'Season' before replacing
crop_df['Season'] = crop_df['Season'].str.strip().replace(season_mapping)

# Display the first few rows to confirm the changes
crop_df.sample(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  crop_df['Season'] = crop_df['Season'].str.strip().replace(season_mapping)


Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label,Season,Irrigation,gouvernerat
1109,30,28,30,31.866414,52.193316,5.064613,98.467686,mango,,Drip,Sfax
2214,28,49,21,18.43,42.07,6.77,119.21,chickpea,,Drip,Sidi Bouz
1178,28,27,34,32.454653,50.696938,6.526654,95.048716,mango,,Drip,Sfax
2222,27,47,35,18.39,44.92,7.26,63.54,lentil,,Drip,Sidi Bouz
968,14,25,40,20.073865,90.978197,6.407872,103.708405,pomegranate,,Drip,Sfax
188,78,58,15,25.009334,67.816568,6.528631,62.913595,maize,Summer,Sprinkler,Bizerte
1180,1,29,29,27.329614,49.303472,6.052026,93.531974,mango,,Drip,Sfax
157,77,58,19,22.805603,56.507689,5.79165,101.595279,maize,Summer,Sprinkler,Bizerte
1331,92,21,48,25.816922,82.043255,6.377427,54.829634,watermelon,,Sprinkler,Jendouba
908,25,27,41,19.200904,94.276596,6.923509,108.042355,pomegranate,,Drip,Sfax


In [103]:
null_season_df = crop_df[crop_df['Season'].isna()]
null_season_df_crops = null_season_df['label'].unique()
null_season_df_crops

array(['chickpea', 'lentil', 'pomegranate', 'mango', 'grapes',
       'watermelon', 'muskmelon', 'apple', 'orange', 'green gram', 'peas'],
      dtype=object)

In [104]:
season_dict = {
    'chickpea': 'Winter',         # grows mainly through the winter
    'lentil': 'Winter',           # develops during winter
    'pomegranate': 'Summer',      # matures in late summer
    'mango': 'Summer',            # ripens in summer
    'grapes': 'Summer',           # grows and ripens in summer
    'watermelon': 'Summer',       # rapid growth in early summer
    'muskmelon': 'Spring',        # grows mainly in spring
    'apple': 'Autumn',            # matures mostly in autumn
    'orange': 'Winter',           # develops and harvested in winter
    'green gram': 'Summer',       # summer short-cycle pulse
    'peas': 'Winter'              # slow development through winter
}

In [109]:
crop_df.loc[crop_df['Season'].isna(), 'Season'] = crop_df.loc[crop_df['Season'].isna(), 'label'].str.lower().map(season_dict)

In [110]:
null_season_df = crop_df[crop_df['Season'].isna()]
null_season_df_crops = null_season_df['label'].unique()
null_season_df_crops

array([], dtype=object)

In [111]:
crop_df

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label,Season,Irrigation,gouvernerat
100,71,54,16,22.613600,63.690706,5.749914,87.759539,maize,Summer,Sprinkler,Manouba
101,61,44,17,26.100184,71.574769,6.931757,102.266244,maize,Summer,Sprinkler,Le Kef
102,80,43,16,23.558821,71.593514,6.657965,66.719955,maize,Summer,Sprinkler,Bizerte
103,73,58,21,19.972160,57.682729,6.596061,60.651715,maize,Summer,Sprinkler,Bizerte
104,61,38,20,18.478913,62.695039,5.970458,65.438354,maize,Summer,Sprinkler,Le Kef
...,...,...,...,...,...,...,...,...,...,...,...
2321,49,36,62,22.970000,60.110000,5.790000,529.810000,Tobacco,Whole Year,Sprinkler,Tunis
2322,44,30,68,23.840000,67.780000,5.820000,657.570000,Tobacco,Whole Year,Sprinkler,Ariana
2323,50,44,66,29.580000,63.220000,6.350000,590.190000,Tobacco,Whole Year,Sprinkler,Tunis
2324,43,38,63,26.120000,67.570000,6.420000,524.140000,Tobacco,Whole Year,Sprinkler,Tunis


In [113]:
print("Duplicate Values =",crop_df.duplicated().sum()) #duplicate

Duplicate Values = 0


Modélisation

Déploiement