In [1]:
# import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings

warnings.filterwarnings('ignore')


In [2]:
# read the data
df = pd.read_csv('crop_production.csv')

In [3]:
df.head()

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641.0
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246091 entries, 0 to 246090
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   State_Name     246091 non-null  object 
 1   District_Name  246091 non-null  object 
 2   Crop_Year      246091 non-null  int64  
 3   Season         246091 non-null  object 
 4   Crop           246091 non-null  object 
 5   Area           246091 non-null  float64
 6   Production     242361 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 9.4+ MB


In [5]:
df.dtypes

State_Name        object
District_Name     object
Crop_Year          int64
Season            object
Crop              object
Area             float64
Production       float64
dtype: object

In [6]:
df.shape

(246091, 7)

In [7]:
df['Production'].isna().sum()

3730

In [8]:
Frames=[]
for i in list (set(df['District_Name'])):
  df_district=df[df['District_Name']==i]
  df_district['Production'].fillna(df_district['Production'].mean(),inplace=True)
  Frames.append(df_district)
  final_df= pd.concat(Frames)

In [9]:
final_df.isna().any()

State_Name       False
District_Name    False
Crop_Year        False
Season           False
Crop             False
Area             False
Production       False
dtype: bool

In [10]:
final_df.duplicated(subset=None, keep='first').any()

False

In [11]:
import plotly.io as pio
pio.renderers.default = 'iframe'

In [12]:
# Compute the correlation matrix
correlation_matrix = final_df.corr()

# Create a correlation heatmap
fig = go.Figure(data=go.Heatmap(
    z=correlation_matrix.values,
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    colorscale='Viridis',  # Choose the desired colorscale
    colorbar=dict(title='Correlation')  # Customize the colorbar title
))

# Add text annotations
fig.update_layout(
    title="Correlation Heatmap",
    xaxis_title="Variables",
    yaxis_title="Variables"
)

# Display the correlation heatmap
fig.show()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [13]:
# Univariate analysis

In [14]:
# state name variable

In [15]:
print(final_df['State_Name'].nunique())
print(final_df['State_Name'].unique())

33
['Haryana' 'Tamil Nadu' 'Rajasthan' 'Jharkhand' 'Punjab' 'Uttar Pradesh'
 'West Bengal' 'Madhya Pradesh' 'Himachal Pradesh' 'Bihar' 'Maharashtra'
 'Gujarat' 'Odisha' 'Assam' 'Karnataka' 'Andhra Pradesh' 'Nagaland'
 'Uttarakhand' 'Chhattisgarh' 'Telangana ' 'Jammu and Kashmir ' 'Tripura'
 'Arunachal Pradesh' 'Meghalaya' 'Manipur' 'Mizoram' 'Kerala' 'Sikkim'
 'Puducherry' 'Goa' 'Dadra and Nagar Haveli' 'Andaman and Nicobar Islands'
 'Chandigarh']


In [16]:
final_df['State_Name'].value_counts()


Uttar Pradesh                  33306
Madhya Pradesh                 22943
Karnataka                      21122
Bihar                          18885
Assam                          14628
Odisha                         13575
Tamil Nadu                     13547
Maharashtra                    12628
Rajasthan                      12514
Chhattisgarh                   10709
Andhra Pradesh                  9628
West Bengal                     9613
Gujarat                         8436
Haryana                         5875
Telangana                       5649
Uttarakhand                     4896
Kerala                          4261
Nagaland                        3906
Punjab                          3173
Meghalaya                       2867
Arunachal Pradesh               2546
Himachal Pradesh                2494
Jammu and Kashmir               1634
Tripura                         1412
Manipur                         1267
Jharkhand                       1266
Mizoram                          957
P

In [17]:
# district name variable

In [18]:
print(final_df['District_Name'].nunique())
print(final_df['District_Name'].unique())

646
['FATEHABAD' 'VELLORE' 'HISAR' 'BUNDI' 'RAMGARH' 'FAZILKA' 'MAHOBA'
 '24 PARAGANAS NORTH' 'BHOPAL' 'SIRMAUR' 'GONDA' 'AURANGABAD' 'JUNAGADH'
 'HOSHANGABAD' 'WEST SINGHBHUM' 'CHANDAULI' 'PANNA' 'SAGAR' 'SUNDARGARH'
 'SINDHUDURG' 'DIMA HASAO' 'BHIWANI' 'ALIRAJPUR' 'KOLAR' 'RANCHI'
 'SPSR NELLORE' 'MANDLA' 'NANDED' 'DAMOH' 'MOKOKCHUNG' 'NAINITAL'
 'NUAPADA' 'BEMETARA' 'HYDERABAD' 'SONBHADRA' 'LAKHISARAI' 'MUKTSAR'
 'WASHIM' 'KHEDA' 'PULWAMA' 'YAMUNANAGAR' 'TUENSANG' 'MUNGELI'
 'NORTH TRIPURA' 'MAU' 'SINGRAULI' 'DIBANG VALLEY' 'MORENA'
 'KANNIYAKUMARI' 'GHAZIPUR' 'MIRZAPUR' 'JAISALMER' 'BELGAUM' 'SEPAHIJALA'
 'JASHPUR' 'GANJAM' 'MEERUT' 'SANGLI' 'TAPI' 'DIBRUGARH' 'VARANASI'
 'NALANDA' 'WEST GODAVARI' 'JAMUI' 'CACHAR' 'BANSWARA' 'GURGAON'
 'NIZAMABAD' 'MORADABAD' 'BARAN' 'JALAUN' 'BUXAR' 'BULDHANA' 'KADAPA'
 'SOUTH WEST GARO HILLS' 'KISHANGANJ' 'ASHOKNAGAR' 'PASHCHIM CHAMPARAN'
 'THIRUVARUR' 'JIND' 'HOWRAH' 'RAIGAD' 'BENGALURU URBAN' 'AHMEDNAGAR'
 'VIRUDHUNAGAR' 'JAMTARA' 'SOLAN' 'NEEM

In [19]:
final_df['District_Name'].value_counts()


BIJAPUR      945
TUMKUR       936
BELGAUM      925
HASSAN       895
BELLARY      887
            ... 
HYDERABAD      8
KHUNTI         6
RAMGARH        6
NAMSAI         1
MUMBAI         1
Name: District_Name, Length: 646, dtype: int64

In [20]:
# crop year varibale

In [21]:
print(final_df['Crop_Year'].nunique())
print(final_df['Crop_Year'].max())
print(final_df['Crop_Year'].min())

19
2015
1997


In [22]:
final_df.Crop_Year.value_counts()


2003    17287
2002    16671
2008    14550
2007    14526
2006    14328
2004    14117
2009    14116
2011    14071
2010    14065
2005    13799
2000    13658
2013    13650
2012    13410
2001    13361
1999    12515
1998    11533
2014    10973
1997     8899
2015      562
Name: Crop_Year, dtype: int64

In [23]:
# season variable

In [24]:
final_df['Season'].nunique()


6

In [25]:
final_df['Season'].max()

'Winter     '

In [26]:
final_df['Season'].value_counts()

Kharif         95951
Rabi           66987
Whole Year     57305
Summer         14841
Winter          6058
Autumn          4949
Name: Season, dtype: int64

In [27]:
# crop variable

In [28]:
print(final_df['Crop'].nunique())
print(final_df.Crop.value_counts().head(10))

124
Rice                 15104
Maize                13947
Moong(Green Gram)    10318
Urad                  9850
Sesamum               9046
Groundnut             8834
Sugarcane             7921
Wheat                 7899
Rapeseed &Mustard     7592
Arhar/Tur             7578
Name: Crop, dtype: int64


In [29]:
final_df['Crop'].max()


'other oilseeds'

In [30]:
final_df['Crop'].unique()


array(['Bajra', 'Other Kharif pulses', 'Rice', 'Barley', 'Gram', 'Wheat',
       'Cotton(lint)', 'Sugarcane', 'Arhar/Tur', 'Groundnut', 'Maize',
       'Moong(Green Gram)', 'Moth', 'Sesamum', 'Masoor',
       'Rapeseed &Mustard', 'Sunflower', 'Dry chillies', 'Potato', 'Urad',
       'Peas & beans (Pulses)', 'Onion', 'Sannhamp', 'Grapes',
       'Other Fresh Fruits', 'Other Vegetables', 'Castor seed', 'Garlic',
       'Guar seed', 'Banana', 'Horse-gram', 'Small millets', 'Cashewnut',
       'Coriander', 'Jowar', 'Pulses total', 'Ragi', 'Sweet potato',
       'Tapioca', 'Total foodgrain', 'Turmeric', 'Korra', 'Varagu',
       'Samai', 'Other Cereals & Millets', 'Ash Gourd',
       'Beans & Mutter(Vegetable)', 'Bhindi', 'Bitter Gourd', 'Brinjal',
       'Cabbage', 'Citrus Fruit', 'Coconut ', 'Drum Stick', 'Jack Fruit',
       'Lab-Lab', 'Litchi', 'Mango', 'Orange', 'Other Citrus Fruit',
       'Papaya', 'Pineapple', 'Pome Fruit', 'Pome Granet', 'Pump Kin',
       'Redish', 'Ribed Guard', 

In [31]:
final_df.Production.describe()


count    2.460910e+05
mean     6.069226e+05
std      1.696894e+07
min      0.000000e+00
25%      9.100000e+01
50%      7.880000e+02
75%      7.870000e+03
max      1.250800e+09
Name: Production, dtype: float64

In [32]:

# Create a boxplot using Plotly
fig = go.Figure()
fig.add_trace(go.Box(y=final_df['Area'], name='Area'))

# Customize the boxplot
fig.update_layout(
    title="Boxplot of Area",
    yaxis_title="Area"
)

# Display the boxplot
fig.show()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [33]:

# Create a boxplot using Plotly
fig = go.Figure()
fig.add_trace(go.Box(y=final_df['Production'], name='Production'))

# Customize the boxplot
fig.update_layout(
    title="Boxplot of Production",
    yaxis_title="Production"
)

# Display the boxplot
fig.show()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [34]:

# Create a horizontal bar plot using Plotly
fig = go.Figure(data=go.Bar(
    x=df['Production'],
    y=df['State_Name'],
    orientation='h'
))

# Customize the bar plot
fig.update_layout(
    title="Bar Plot of Production by State",
    xaxis_title="Production",
    yaxis_title="State Name",
    yaxis=dict(autorange="reversed")  # Reverse the order of y-axis labels
)

# Display the bar plot
fig.show()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Note: Kerala has yielded the highest production

In [35]:
# new variables

In [38]:
import plotly.graph_objects as go

# Define the zone categories and their respective states
zone_states = {
    'North India': ['Jammu and Kashmir', 'Punjab', 'Himachal Pradesh', 'Haryana', 'Uttarakhand', 'Uttar Pradesh', 'Chandigarh'],
    'East India': ['Bihar', 'Odisha', 'Jharkhand', 'West Bengal'],
    'South India': ['Andhra Pradesh', 'Karnataka', 'Kerala', 'Tamil Nadu', 'Telangana'],
    'West India': ['Rajasthan', 'Gujarat', 'Goa', 'Maharashtra'],
    'Central India': ['Madhya Pradesh', 'Chhattisgarh'],
    'North East India': ['Assam', 'Sikkim', 'Nagaland', 'Meghalaya', 'Manipur', 'Mizoram', 'Tripura', 'Arunachal Pradesh'],
    'Union Territories': ['Andaman and Nicobar Islands', 'Dadra and Nagar Haveli', 'Puducherry']
}

# Create a new DataFrame for zone-wise production
zone_production = pd.DataFrame(columns=['Zone', 'Production'])

# Calculate production for each zone
for zone, states in zone_states.items():
    zone_production = zone_production.append({
        'Zone': zone,
        'Production': df[df['State_Name'].isin(states)]['Production'].sum()
    }, ignore_index=True)

# Create a bar chart for zone-wise production
fig = go.Figure(data=go.Bar(
    x=zone_production['Zone'],
    y=zone_production['Production'],
    marker_color='blue'
))

# Customize the bar chart
fig.update_layout(
    title="Zone-Wise Production - 1997-2014",
    xaxis_title="Zone",
    yaxis_title="Production"
)

# Display the bar chart
fig.show()


In [40]:
#Zone-Wise Production - 1997-2014
north_india = ['Jammu and Kashmir', 'Punjab', 'Himachal Pradesh', 'Haryana', 'Uttarakhand', 'Uttar Pradesh', 'Chandigarh']
east_india = ['Bihar', 'Odisha', 'Jharkhand', 'West Bengal']
south_india = ['Andhra Pradesh', 'Karnataka', 'Kerala' ,'Tamil Nadu', 'Telangana']
west_india = ['Rajasthan' , 'Gujarat', 'Goa','Maharashtra']
central_india = ['Madhya Pradesh', 'Chhattisgarh']
north_east_india = ['Assam', 'Sikkim', 'Nagaland', 'Meghalaya', 'Manipur', 'Mizoram', 'Tripura', 'Arunachal Pradesh']
ut_india = ['Andaman and Nicobar Islands', 'Dadra and Nagar Haveli', 'Puducherry']

In [41]:
def get_zonal_names(row):
    if row['State_Name'].strip() in north_india:
        val = 'North Zone'
    elif row['State_Name'].strip()  in south_india:
        val = 'South Zone'
    elif row['State_Name'].strip()  in east_india:
        val = 'East Zone'
    elif row['State_Name'].strip()  in west_india:
        val = 'West Zone'
    elif row['State_Name'].strip()  in central_india:
        val = 'Central Zone'
    elif row['State_Name'].strip()  in north_east_india:
        val = 'NE Zone'
    elif row['State_Name'].strip()  in ut_india:
        val = 'Union Terr'
    else:
        val = 'No Value'
    return val

final_df['Zones'] = final_df.apply(get_zonal_names, axis=1)
final_df['Zones'].unique()

array(['North Zone', 'South Zone', 'West Zone', 'East Zone',
       'Central Zone', 'NE Zone', 'Union Terr'], dtype=object)

In [42]:
final_df['Zones'].value_counts()


South Zone      54207
North Zone      51468
East Zone       43339
West Zone       33786
Central Zone    33652
NE Zone         28297
Union Terr       1342
Name: Zones, dtype: int64

In [45]:
def cat_crop(crop):
    cereal_crops = ['Rice', 'Maize', 'Wheat', 'Barley', 'Varagu', 'Other Cereals & Millets', 'Ragi', 'Small millets', 'Bajra', 'Jowar', 'Paddy', 'Total foodgrain', 'Jobster']
    pulses_crops = ['Moong', 'Urad', 'Arhar/Tur', 'Peas & beans', 'Masoor', 'Other Kharif pulses', 'other misc. pulses', 'Ricebean (nagadal)', 'Rajmash Kholar', 'Lentil', 'Samai', 'Blackgram', 'Korra', 'Cowpea(Lobia)', 'Other Rabi pulses', 'Other Kharif pulses', 'Peas & beans (Pulses)', 'Pulses total', 'Gram']
    fruits_crops = ['Peach', 'Apple', 'Litchi', 'Pear', 'Plums', 'Ber', 'Sapota', 'Lemon', 'Pome Granet', 'Other Citrus Fruit', 'Water Melon', 'Jack Fruit', 'Grapes', 'Pineapple', 'Orange', 'Pome Fruit', 'Citrus Fruit', 'Other Fresh Fruits', 'Mango', 'Papaya', 'Coconut', 'Banana']
    beans_crops = ['Bean', 'Lab-Lab', 'Moth', 'Guar seed', 'Soyabean', 'Horse-gram']
    vegetables_crops = ['Turnip', 'Peas', 'Beet Root', 'Carrot', 'Yam', 'Ribed Guard', 'Ash Gourd', 'Pump Kin', 'Redish', 'Snak Guard', 'Bottle Gourd', 'Bitter Gourd', 'Cucumber', 'Drum Stick', 'Cauliflower', 'Beans & Mutter(Vegetable)', 'Cabbage', 'Bhindi', 'Tomato', 'Brinjal', 'Khesari', 'Sweet potato', 'Potato', 'Onion', 'Tapioca', 'Colocosia']
    spices_crops = ['Perilla', 'Ginger', 'Cardamom', 'Black pepper', 'Dry ginger', 'Garlic', 'Coriander', 'Turmeric', 'Dry chillies', 'Cond-spcs other']
    fibres_crops = ['other fibres', 'Kapas', 'Jute & mesta', 'Jute', 'Mesta', 'Cotton(lint)', 'Sannhamp']
    nuts_crops = ['Arcanut (Processed)', 'Atcanut (Raw)', 'Cashewnut Processed', 'Cashewnut Raw', 'Cashewnut', 'Arecanut', 'Groundnut']
    oilseeds_crops = ['other oilseeds', 'Safflower', 'Niger seed', 'Castor seed', 'Linseed', 'Sunflower', 'Rapeseed &Mustard', 'Sesamum', 'Oilseeds total']
    commercial_crops = ['Tobacco', 'Coffee', 'Tea', 'Sugarcane', 'Rubber']

    if crop in cereal_crops:
        return 'Cereal'
    elif crop in pulses_crops:
        return 'Pulses'
    elif crop in fruits_crops:
        return 'Fruits'
    elif crop in beans_crops:
        return 'Beans'
    elif crop in vegetables_crops:
        return 'Vegetables'
    elif crop in spices_crops:
        return 'Spices'
    elif crop in fibres_crops:
        return 'Fibres'
    elif crop in nuts_crops:
        return 'Nuts'
    elif crop in oilseeds_crops:
        return 'Oilseeds'
    elif crop in commercial_crops:
        return 'Commercial'
    else:
        return 'Other'

final_df['cat_crop'] = final_df['Crop'].apply(cat_crop)


In [46]:
final_df['cat_crop'].value_counts()


Cereal        63800
Pulses        38509
Oilseeds      34454
Vegetables    23369
Spices        21986
Other         15876
Nuts          11588
Commercial    10716
Fibres        10195
Beans          9355
Fruits         6243
Name: cat_crop, dtype: int64

In [47]:
# findings and visualizations

In [49]:
# zone-wise production

In [50]:

# Sort the DataFrame by 'Zones' in descending order
final_df_sorted = final_df.sort_values('Zones', ascending=False)

# Create a bar plot using Plotly
fig = go.Figure(data=go.Bar(
    x=final_df_sorted['Zones'],
    y=final_df_sorted['Production'],
    marker_color='blue'
))

# Set the y-axis scale to logarithmic
fig.update_layout(yaxis_type="log")

# Customize the bar plot
fig.update_layout(
    title="Zone Wise Total Production",
    xaxis_title="Zones",
    yaxis_title="Production"
)

# Display the bar plot
fig.show()


In [51]:
# analysing the state in south zone which produces the highest production

In [53]:

# Filter the DataFrame for the South Zone
south_zone = final_df[final_df['Zones'] == 'South Zone']

# Create a bar plot using Plotly
fig = go.Figure(data=go.Bar(
    x=south_zone['State_Name'],
    y=south_zone['Production'],
    marker_color='orange'
))

# Set the y-axis scale to logarithmic
fig.update_layout(yaxis_type='log')

# Customize the bar plot
fig.update_layout(
    title='Southern-Zone wise Production',
    xaxis_title='State Name',
    yaxis_title='Production'
)

# Display the bar plot
fig.show()


In [54]:
# analysing the district zone in kerala which gives the highest production

In [63]:
import plotly.graph_objects as go
import pandas as pd

# Assuming you have a DataFrame called 'final_df' with the 'State_Name', 'District_Name', and 'Production' variables

# Filter the DataFrame for Kerala
Kerala = final_df[final_df['State_Name'] == 'Kerala']

# Create a bar plot using Plotly
fig = go.Figure(data=go.Bar(
    x=Kerala['District_Name'],
    y=Kerala['Production'],
    marker_color='blue'
))

# Set the y-axis scale to logarithmic
fig.update_layout(yaxis_type='log')

# Customize the bar plot
fig.update_layout(
    title='Kerala District-wise Production',
    xaxis_title='District Name',
    yaxis_title='Production'
)

# Rotate the x-axis labels
fig.update_layout(xaxis_tickangle=-90)

# Display the bar plot
fig.show()


In [57]:
# crop wise production

In [58]:
import plotly.graph_objects as go
import pandas as pd

# Assuming you have a DataFrame called 'final_df' with the 'Crop' and 'Production' variables

# Calculate the sum of production for each crop and select the top 10 crops
crop = final_df.groupby('Crop')['Production'].sum().reset_index().sort_values('Production', ascending=False).head(10)

# Create a bar plot using Plotly
fig = go.Figure(data=go.Bar(
    x=crop['Crop'],
    y=crop['Production'],
    marker_color=px.colors.qualitative.Plotly
))

# Set the y-axis scale to logarithmic
fig.update_layout(yaxis_type='log')

# Customize the bar plot
fig.update_layout(
    title='Overall Crops vs Production',
    xaxis_title='Crop',
    yaxis_title='Production'
)

# Rotate the x-axis labels
fig.update_layout(xaxis_tickangle=-90)

# Display the bar plot
fig.show()


<IPython.core.display.Javascript object>

In [64]:
# season wise productions

In [65]:

# Calculate the sum of production for each season and select the top 10 seasons
season = final_df.groupby('Season')['Production'].sum().reset_index().sort_values('Production', ascending=False).head(10)

# Create a bar plot using Plotly
fig = go.Figure(data=go.Bar(
    x=season['Season'],
    y=season['Production'],
    marker_color=px.colors.qualitative.Plotly
))

# Set the y-axis scale to logarithmic
fig.update_layout(yaxis_type='log')

# Customize the bar plot
fig.update_layout(
    title='Seasonal Crops vs Production',
    xaxis_title='Season',
    yaxis_title='Production'
)

# Rotate the x-axis labels
fig.update_layout(xaxis_tickangle=-90)

# Display the bar plot
fig.show()


In [66]:
# year vs production analysis

In [69]:

# Group the data by crop year and calculate the sum of production
production_by_year = final_df.groupby('Crop_Year')['Production'].sum()

# Create a bar plot using Plotly
fig = go.Figure(data=go.Bar(
    x=production_by_year.index,
    y=production_by_year.values,
    marker_color='skyblue'
))

# Customize the bar plot
fig.update_layout(
    title='Total Production by Crop Year',
    xaxis_title='Crop Year',
    yaxis_title='Production'
)

# Set the tick label font size
fig.update_layout(
    xaxis=dict(tickfont=dict(size=10)),
    yaxis=dict(tickfont=dict(size=10))
)

# Display the bar plot
fig.show()


In [70]:
# crop categories vs production

In [74]:

# Group the data by 'cat_crop' and calculate the count of productions
count_by_cat_crop = final_df.groupby('cat_crop')['Production'].count()

# Create a bar plot using Plotly
fig = go.Figure(data=go.Bar(
    x=count_by_cat_crop.index,
    y=count_by_cat_crop.values,
    marker_color='mediumslateblue'
))

# Customize the bar plot
fig.update_layout(
    title='Count of Productions by Category of Crop',
    xaxis_title='Crop Category',
    yaxis_title='Count of Productions'
)

# Set the tick label font size
fig.update_layout(
    xaxis=dict(tickfont=dict(size=10)),
    yaxis=dict(tickfont=dict(size=10))
)

# Display the bar plot
fig.show()


In [75]:
# season vs crop categories vs state

In [82]:
# Percentage of crop categories

In [83]:

# Calculate the count of crop categories
cat_crop_count_df = final_df["cat_crop"].value_counts()

# Create a pie chart using Plotly
fig = go.Figure(data=go.Pie(
    labels=cat_crop_count_df.index,
    values=cat_crop_count_df.values,
    hole=0.3,
    hoverinfo='label+percent',
    textinfo='value+percent'
))

# Customize the pie chart
fig.update_layout(
    title='Crop Category Distribution',
    font=dict(
        size=10
    )
)

# Display the pie chart
fig.show()


Result: Cereal production was greater followed by Pulses and oilseeds

In [86]:
# key Indicators and obtaining useful inaights

In [87]:
# State that dominates in crop production by producing a variety of crop categories

In [89]:

# Create a cross-tabulation of State_Name and cat_crop
state_cat_crop = pd.crosstab(final_df['State_Name'], final_df['cat_crop'])

# Get the unique crop categories
crop_categories = final_df['cat_crop'].unique()

# Create a list to store the data for each crop category
data = []

# Iterate over the crop categories and create a bar trace for each
for category in crop_categories:
    data.append(go.Bar(name=category, x=state_cat_crop.index, y=state_cat_crop[category]))

# Create the layout for the chart
layout = go.Layout(
    title='Crop Category Distribution across States',
    xaxis=dict(title='State'),
    yaxis=dict(title='Count'),
    barmode='stack'
)

# Create the figure object
fig = go.Figure(data=data, layout=layout)

# Display the chart
fig.show()


In [90]:
# Which crop is grown in high frequency in India? Provide useful insights for the same

In [91]:
final_df['Crop'].value_counts()[:5]

Rice                 15104
Maize                13947
Moong(Green Gram)    10318
Urad                  9850
Sesamum               9046
Name: Crop, dtype: int64

In [92]:
rice_df=final_df[final_df['Crop']=="Rice"]
print(rice_df.shape)
rice_df.head(4)

(15104, 9)


Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production,Zones,cat_crop
66501,Haryana,FATEHABAD,1997,Kharif,Rice,52000.0,184000.0,North Zone,Cereal
66514,Haryana,FATEHABAD,1998,Kharif,Rice,70105.0,208000.0,North Zone,Cereal
66531,Haryana,FATEHABAD,1999,Kharif,Rice,69238.0,164000.0,North Zone,Cereal
66552,Haryana,FATEHABAD,2000,Kharif,Rice,61308.0,199000.0,North Zone,Cereal


In [93]:

fig = go.Figure(data=go.Bar(
    x=rice_df['Season'],
    y=rice_df['Production']
))

fig.update_layout(
    title='Season vs Production',
    xaxis_title='Season',
    yaxis_title='Production'
)

fig.show()


In [96]:

fig = go.Figure(data=go.Bar(
    x=rice_df['State_Name'],
    y=rice_df['Production']
   
))

fig.update_layout(
    title='Production of Rice by State',
    xaxis_title='State Name',
    yaxis_title='Production'
)

fig.update_layout(xaxis_tickangle=-90)

fig.show()


In [97]:
top_rice_dist = rice_df.groupby("District_Name")["Production"].sum().reset_index().sort_values(by="Production", ascending=False)
top_rice_dist.head(5)

Unnamed: 0,District_Name,Production
58,BARDHAMAN,34239980.0
375,MEDINIPUR WEST,29192720.0
613,WEST GODAVARI,27845310.0
316,KOLLAM,26951880.0
169,EAST GODAVARI,24690930.0


In [98]:

# Create a list of colors for each bar
colors = ['rgb(31, 119, 180)', 'rgb(255, 127, 14)', 'rgb(44, 160, 44)', 'rgb(214, 39, 40)', 'rgb(148, 103, 189)']

# Create a bar trace for each district
bar_traces = []
for i, row in top_rice_dist[:10].iterrows():
    bar_trace = go.Bar(
        x=[row['District_Name']],
        y=[row['Production']],
        marker=dict(color=colors[i % len(colors)]),
        name=row['District_Name']
    )
    bar_traces.append(bar_trace)

# Create the layout for the bar plot
layout = go.Layout(
    title='Top 10 Rice Producing Districts',
    xaxis=dict(title='District Name'),
    yaxis=dict(title='Production')
)

# Create the figure with the bar traces and layout
fig = go.Figure(data=bar_traces, layout=layout)

# Show the figure
fig.show()


In [99]:

# Create a bar trace
bar_trace = go.Bar(
    x=rice_df['Crop_Year'],
    y=rice_df['Production'],
    marker=dict(
        color=rice_df['Crop_Year'],
        colorscale='Viridis',  # Choose the colorscale you prefer
        colorbar=dict(
            title='Crop Year'
        )
    )
)

# Create the layout for the bar plot
layout = go.Layout(
    title='Rice Production by Crop Year',
    xaxis=dict(title='Crop Year'),
    yaxis=dict(title='Production')
)

# Create the figure with the bar trace and layout
fig = go.Figure(data=bar_trace, layout=layout)

# Show the figure
fig.show()


In [100]:

# Create a scatter trace
scatter_trace = go.Scatter(
    x=rice_df['Area'],
    y=rice_df['Production'],
    mode='markers',
    marker=dict(
        size=6,
        color='blue'
    ),
    name='Data Points'
)

# Create a line trace for the regression line
regression_trace = go.Scatter(
    x=rice_df['Area'],
    y=rice_df['Production'],
    mode='lines',
    line=dict(
        color='red',
        width=2
    ),
    name='Regression Line'
)

# Create the layout for the scatter plot
layout = go.Layout(
    title='Scatter Plot with Regression Line',
    xaxis=dict(title='Area'),
    yaxis=dict(title='Production')
)

# Create the figure with the scatter and regression traces and layout
fig = go.Figure(data=[scatter_trace, regression_trace], layout=layout)

# Show the figure
fig.show()


#### Result:

> Rice is the most frequent crop in India. 

> Rice is produced in high is Winter season followed by Whole year and Kharif

> Punjab is richest state for high rice production.

> Bardaman district in Punjab is rich for rice production

In [101]:
# State that ranks high in area wise crop production in India

In [102]:
df_area=final_df.groupby("State_Name")["Area"].sum().reset_index().sort_values(by="Area",ascending=False)
df_area.head()

Unnamed: 0,State_Name,Area
30,Uttar Pradesh,433631600.0
16,Madhya Pradesh,329813100.0
17,Maharashtra,322206200.0
25,Rajasthan,272024900.0
32,West Bengal,215405200.0


In [104]:

# Create a list of colors for each bar
colors = ['blue', 'green', 'red', 'orange', 'purple', 'yellowgreen', 'cyan', 'magenta', 'brown', 'gray']

# Create a Bar trace
bar_trace = go.Bar(
    x=df_area.State_Name.head(10),
    y=df_area.Area.head(10),
    marker=dict(color=colors),
    error_y=dict(visible=False)  # Set error bars to be invisible
)

# Create the layout for the bar plot
layout = go.Layout(
    title='Indian Agricultural Area Distribution',
    xaxis=dict(title='State Name'),
    yaxis=dict(title='Area')
)

# Create the figure with the bar trace and layout
fig = go.Figure(data=[bar_trace], layout=layout)

# Show the figure
fig.show()


In [105]:
# Top crops produced in Northeen parts of State

In [106]:
North_Zone=final_df[final_df["Zones"]=="North Zone"]
print(North_Zone.shape)

(51468, 9)


In [108]:
North_Zone.head()

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production,Zones,cat_crop
66499,Haryana,FATEHABAD,1997,Kharif,Bajra,11000.0,17000.0,North Zone,Cereal
66500,Haryana,FATEHABAD,1997,Kharif,Other Kharif pulses,1700.0,1300.0,North Zone,Pulses
66501,Haryana,FATEHABAD,1997,Kharif,Rice,52000.0,184000.0,North Zone,Cereal
66502,Haryana,FATEHABAD,1997,Rabi,Barley,7000.0,15000.0,North Zone,Cereal
66503,Haryana,FATEHABAD,1997,Rabi,Gram,13000.0,11000.0,North Zone,Pulses


In [109]:

# Create the bar plot
fig = go.Figure(data=go.Bar(
    x=North_Zone['State_Name'],
    y=North_Zone['Production'],
    marker=dict(
        color=North_Zone['Production'],
        colorscale='Viridis',
        showscale=True
    )
))

# Set the y-axis to logarithmic scale
fig.update_layout(yaxis_type='log')

# Customize the x-axis tick labels rotation
fig.update_layout(xaxis_tickangle=-90)

# Set the plot title
fig.update_layout(title='Northern-Zone Production')

# Group by State_Name and calculate the total Production
grouped_df = North_Zone.groupby('State_Name')['Production'].sum().reset_index().sort_values(by='Production', ascending=False)

# Print the grouped data
print(grouped_df)

# Display the plot
fig.show()


           State_Name    Production
5       Uttar Pradesh  3.248159e+09
4              Punjab  5.905425e+08
1             Haryana  4.930824e+08
6         Uttarakhand  1.344728e+08
2    Himachal Pradesh  1.823680e+07
3  Jammu and Kashmir   1.331539e+07
0          Chandigarh  6.467511e+04


In [111]:
df_NZ = North_Zone.groupby("Crop")["Production"].sum().reset_index().sort_values(by="Production",ascending=False).head(10)
# Create the bar plot
fig = go.Figure(data=go.Bar(
    x=df_NZ['Crop'],
    y=df_NZ['Production'],
    marker=dict(
        color=df_NZ['Production'],
        colorscale='Viridis',
        showscale=True
    )
))

# Set the y-axis to logarithmic scale
fig.update_layout(yaxis_type='log')

# Customize the x-axis tick labels rotation
fig.update_layout(xaxis_tickangle=-90)

# Set the plot title
fig.update_layout(title='Crop vs Production in Northern-Zone')

# Display the plot
fig.show()


In [112]:
# Top crop produced in Southern Zone

In [114]:
South_Zone=final_df[final_df["Zones"]=="South Zone"]
print(South_Zone.shape)

(54207, 9)


In [115]:
South_Zone.head(10)

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production,Zones,cat_crop
189707,Tamil Nadu,VELLORE,1997,Kharif,Banana,4121.0,137719.0,South Zone,Fruits
189708,Tamil Nadu,VELLORE,1997,Kharif,Horse-gram,3234.0,850.0,South Zone,Beans
189709,Tamil Nadu,VELLORE,1997,Kharif,Onion,17.0,137.0,South Zone,Vegetables
189710,Tamil Nadu,VELLORE,1997,Kharif,Sesamum,666.0,450.0,South Zone,Oilseeds
189711,Tamil Nadu,VELLORE,1997,Kharif,Small millets,4649.0,4080.0,South Zone,Cereal
189712,Tamil Nadu,VELLORE,1997,Whole Year,Arhar/Tur,9822.0,3930.0,South Zone,Pulses
189713,Tamil Nadu,VELLORE,1997,Whole Year,Bajra,1511.0,2080.0,South Zone,Cereal
189714,Tamil Nadu,VELLORE,1997,Whole Year,Banana,8661.0,309930.0,South Zone,Fruits
189715,Tamil Nadu,VELLORE,1997,Whole Year,Cashewnut,22.0,20.0,South Zone,Nuts
189716,Tamil Nadu,VELLORE,1997,Whole Year,Castor seed,198.0,70.0,South Zone,Oilseeds


In [117]:
df_SZ=South_Zone.groupby("Crop")["Production"].sum().reset_index().sort_values(by="Production",ascending=False).head(10)

# Create the bar plot
fig = go.Figure(data=go.Bar(
    x=df_SZ['Crop'],
    y=df_SZ['Production'],
    marker=dict(
        color=df_SZ['Production'],
        colorscale='Viridis',
        showscale=True
    )
))

# Set the y-axis to logarithmic scale
fig.update_layout(yaxis_type='log')

# Customize the x-axis tick labels rotation
fig.update_layout(xaxis_tickangle=-90)

# Set the plot title
fig.update_layout(title='Crop vs Production in Southern-Zone')

# Display the plot
fig.show()


In [119]:
df_coco = final_df[final_df["Crop"]=="Coconut "]
print(df_coco.shape)
df_coco[:5]

(1985, 9)


Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production,Zones,cat_crop
189850,Tamil Nadu,VELLORE,2002,Whole Year,Coconut,18690.0,171300.0,South Zone,Other
189909,Tamil Nadu,VELLORE,2003,Whole Year,Coconut,21805.0,106800.0,South Zone,Other
189959,Tamil Nadu,VELLORE,2004,Whole Year,Coconut,20928.0,192400.0,South Zone,Other
189992,Tamil Nadu,VELLORE,2005,Whole Year,Coconut,23098.0,2765.0,South Zone,Other
190012,Tamil Nadu,VELLORE,2006,Whole Year,Coconut,22569.0,2654.0,South Zone,Other


In [120]:

# Create the bar plot
fig = go.Figure(data=go.Bar(
    x=df_coco['Season'],
    y=df_coco['Production'],
))

# Set the plot title
fig.update_layout(title='Season vs Production')

# Display the plot
fig.show()


In [121]:

# Create the bar plot
fig = go.Figure(data=go.Bar(
    x=df_coco['State_Name'],
    y=df_coco['Production'],
))

# Set the plot title and axis labels
fig.update_layout(
    title='State_Name vs Production',
    xaxis=dict(title='State_Name'),
    yaxis=dict(title='Production')
)

# Rotate x-axis labels
fig.update_layout(xaxis_tickangle=-90)

# Display the plot
fig.show()


In [122]:
top_coco_dist = df_coco.groupby("District_Name")["Production"].sum().reset_index().sort_values(by="Production", ascending=False)
top_coco_dist.head(5)

Unnamed: 0,District_Name,Production
80,KOZHIKODE,15278710000.0
89,MALAPPURAM,14512520000.0
134,THIRUVANANTHAPURAM,10013370000.0
136,THRISSUR,9920739000.0
66,KANNUR,9780310000.0


In [128]:

# Create the bar plot
fig = go.Figure(data=go.Bar(
    x=top_coco_dist['District_Name'][:8],
    y=top_coco_dist['Production'][:8],
    marker_color="mediumaquamarine"
))

# Set the plot title and axis labels
fig.update_layout(
    title='Top 8 Districts: District_Name vs Production',
    xaxis=dict(title='District_Name'),
    yaxis=dict(title='Production')
)

# Rotate x-axis labels
fig.update_layout(xaxis_tickangle=-90)

# Define the color scale
colorscale = [[0, '#FF0000'], [0.5, '#00FF00'], [1, '#0000FF']]
fig.update_traces(marker=dict(colorscale=colorscale))

# Display the plot
fig.show()


In [129]:

# Create the bar plot
fig = go.Figure(data=go.Bar(
    x=df_coco['Crop_Year'],
    y=df_coco['Production'],
    marker=dict(color=df_coco['Crop_Year'], coloraxis="coloraxis")
))

# Set the plot title and axis labels
fig.update_layout(
    title='Crop_Year vs Production',
    xaxis=dict(title='Crop_Year'),
    yaxis=dict(title='Production')
)

# Rotate x-axis labels
fig.update_layout(xaxis_tickangle=-45)

# Define the color scale
colorscale = [[0, '#FF0000'], [0.5, '#00FF00'], [1, '#0000FF']]
fig.update_traces(marker=dict(colorscale=colorscale))

# Display the plot
fig.show()


In [131]:

# Create the scatter plot
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df_coco['Area'],
    y=df_coco['Production'],
    mode='markers',
    marker=dict(color='blue'),
    name='Data'
))

# Add the regression line
fig.add_trace(go.Scatter(
    x=df_coco['Area'],
    y=df_coco['Production'],
    mode='lines',
    line=dict(color='red'),
    name='Regression Line',
    showlegend=False,
    hoverinfo='none',
    line_shape='spline'
))

# Set the plot title and axis labels
fig.update_layout(
    title='Area vs Production',
    xaxis=dict(title='Area'),
    yaxis=dict(title='Production')
)

# Rotate x-axis labels
fig.update_layout(xaxis_tickangle=-90)

# Display the plot
fig.show()


#### Result:

> Top crop grown in Southern Zone is coconut.

> The coconut gets matured in Whole year season

> Kerala state in southern zone yeilds a high amount of Coconut

> Kozhikode and Malapuram districts of Kerala are rich in Coconut production

> Coconut production was yeilded more during 2011 and 2014.

> High coconut cultivation is directly proportional to area under cultivation.

In [132]:
# Top Crop produced in Central state

In [133]:
Central_Zone=final_df[final_df["Zones"]=="Central Zone"]
print(Central_Zone.shape)

(33652, 9)


In [134]:
Central_Zone.head(9)

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production,Zones,cat_crop
105068,Madhya Pradesh,BHOPAL,1997,Kharif,Arhar/Tur,800.0,500.0,Central Zone,Pulses
105069,Madhya Pradesh,BHOPAL,1997,Kharif,Groundnut,200.0,200.0,Central Zone,Nuts
105070,Madhya Pradesh,BHOPAL,1997,Kharif,Jowar,3000.0,2600.0,Central Zone,Cereal
105071,Madhya Pradesh,BHOPAL,1997,Kharif,Maize,2100.0,1200.0,Central Zone,Cereal
105072,Madhya Pradesh,BHOPAL,1997,Kharif,Paddy,600.0,400.0,Central Zone,Cereal
105073,Madhya Pradesh,BHOPAL,1997,Kharif,Soyabean,74700.0,68200.0,Central Zone,Beans
105074,Madhya Pradesh,BHOPAL,1997,Rabi,Linseed,800.0,500.0,Central Zone,Oilseeds
105075,Madhya Pradesh,BHOPAL,1997,Rabi,Rapeseed &Mustard,200.0,100.0,Central Zone,Oilseeds
105076,Madhya Pradesh,BHOPAL,1997,Rabi,Wheat,78800.0,129500.0,Central Zone,Cereal


In [139]:
df_central_crop = Central_Zone.groupby("Crop")["Production"].sum().reset_index().sort_values(by="Production",ascending=False).head(10)
# Create the bar plot
fig = go.Figure()

fig.add_trace(go.Bar(
    x=df_central_crop['Crop'],
    y=df_central_crop['Production'],
    marker=dict(color='mediumturquoise')
))

# Set the y-axis scale to logarithmic
fig.update_yaxes(type='log')

# Set the plot title and axis labels
fig.update_layout(
    title='Top crop produced in Central zone',
    xaxis=dict(title='Crop'),
    yaxis=dict(title='Production')
)

# Rotate x-axis labels
fig.update_layout(xaxis_tickangle=-90)

# Display the plot
fig.show()

In [141]:


# Create the bar plot
fig = go.Figure()

fig.add_trace(go.Bar(
    x=Central_Zone['State_Name'],
    y=Central_Zone['Production'],
    marker=dict(color='mediumorchid')
))

# Set the y-axis scale to logarithmic
fig.update_yaxes(type='log')

# Set the plot title and axis labels
fig.update_layout(
    title='Central-Zone Production',
    xaxis=dict(title='State Name'),
    yaxis=dict(title='Production')
)

# Rotate x-axis labels
fig.update_layout(xaxis_tickangle=-90)

# Display the plot
fig.show()

Central_Zone.groupby(by='State_Name')['Production'].sum().reset_index().sort_values(by='Production', ascending=False)


Unnamed: 0,State_Name,Production
1,Madhya Pradesh,455542700.0
0,Chhattisgarh,105526600.0


In [142]:
# crop categories and their production rate

In [143]:

# Create the line plot
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=final_df['cat_crop'],
    y=final_df['Production'],
    mode='lines',
    marker=dict(color='blue')
))

# Set the y-axis scale to logarithmic
fig.update_yaxes(type='log')

# Set the plot title and axis labels
fig.update_layout(
    title='Crop categories vs Production rate',
    xaxis=dict(title='Crop categories'),
    yaxis=dict(title='Production')
)

# Rotate x-axis labels
fig.update_layout(xaxis_tickangle=-90)

# Display the plot
fig.show()

final_df.groupby(by='cat_crop')['Production'].sum().reset_index().sort_values(by='Production',ascending=False)

Unnamed: 0,cat_crop,Production
7,Other,130808300000.0
2,Commercial,6612507000.0
1,Cereal,3944948000.0
9,Spices,1827835000.0
8,Pulses,1792185000.0
10,Vegetables,1775118000.0
6,Oilseeds,752739200.0
3,Fibres,558182200.0
4,Fruits,536009900.0
0,Beans,459014900.0
