In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the rice pest/disease CSV into the frame that the cells below analyse.
df = pd.read_csv(filepath_or_buffer="/kaggle/input/rice-pest-and-diseases/RICE.csv")

In [None]:
# Preview the first rows of the loaded frame as a quick sanity check.
df.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Names of the numeric columns used by the histogram, correlation and
# per-pest plotting cells below.
numericals = [
    'MaxT',
    'MinT',
    'RH1(%)',
    'RH2(%)',
    'RF(mm)',
    'WS(kmph)',
    'SSH(hrs)',
    'EVP(mm)',
]

<div style='font-size:150%'><h1 style='text-align:center;font-family:cursive;'> <b>General data distribution of numerical features</b> </h1>
    </div>

In [None]:
# One histogram (with a KDE overlay) per numeric feature.
# The grid is sized from len(numericals), so adding or removing a feature can
# neither index past the end of the list nor leave a feature undrawn.
n_cols = 4
n_rows = max(1, (len(numericals) + n_cols - 1) // n_cols)  # ceiling division
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,
                         figsize=(15, 5 * n_rows), squeeze=False)

for ax, column in zip(axes.flat, numericals):
    # Pass the column *name*; seaborn looks it up in `data`.
    sns.histplot(data=df, x=column, kde=True, color='g', ax=ax)

# Hide grid cells left over when the feature count is not a multiple of n_cols.
for ax in axes.flat[len(numericals):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

<div style='font-size:150%'><h1 style='text-align:center;font-family:cursive;'> <b>Numerical data correlations</b> </h1></div>

In [None]:
# Pairwise correlation (pandas default: Pearson) between the numeric features.
to_corr = df[numericals]
corr = to_corr.corr()

fig, ax = plt.subplots(figsize=(10, 8))
# Pin the colour scale to the full [-1, 1] range, centred on 0, so that colour
# intensity means the same thing for positive and negative correlations
# instead of being stretched to whatever range the data happens to span.
sns.heatmap(corr, annot=True, fmt='.2f', vmin=-1, vmax=1, center=0,
            cmap='coolwarm', ax=ax)
ax.set_title("Correlation between numerical features")
fig.tight_layout()
plt.show()

In [None]:
def plots(df, x):
    """Draw three side-by-side views of one numeric column, split by pest.

    Panels, left to right:
      1. histogram of ``x`` coloured by ``'PEST NAME'``;
      2. bar chart of the per-pest mean of ``x``, each bar annotated with its value;
      3. box plot of ``x`` for each pest.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'PEST NAME'`` column and the column named by ``x``.
    x : str
        Name of the numeric column to plot.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Compute the group means once; their index defines the pest order that
    # both categorical panels share.
    means = df.groupby('PEST NAME')[x].mean()
    pest_order = list(means.index)

    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 9))
    hist_ax, bar_ax, box_ax = axes

    sns.histplot(df, x=x, hue='PEST NAME', ax=hist_ax)

    sns.barplot(x=means.index, y=means, order=pest_order, ax=bar_ax)
    for container in bar_ax.containers:
        bar_ax.bar_label(container, size=15, color='black')

    # `order` pins the boxes to the same pest order as the bars. Without it
    # the boxes follow order of appearance in `df`, which need not match the
    # sorted groupby order.
    sns.boxplot(x=df['PEST NAME'], y=df[x], order=pest_order, ax=box_ax)

    # Rotate each axis' *own* tick labels; copying the bar chart's labels onto
    # the box plot could attach the wrong pest name to a box.
    for ax in (bar_ax, box_ax):
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

    # Use the column name verbatim: str.capitalize() lower-cases everything
    # after the first character and would turn 'MaxT' into 'Maxt'.
    fig.suptitle("{} data distributions grouped by pest types".format(x), size=15)

    fig.tight_layout()
    plt.show()

<div style='font-size:150%'><h1 style='text-align:center;
           font-family:cursive;'> 
    <b>Categorical data pie charts</b>
</h1>
    </div>

In [None]:
# Share of each category, drawn as one pie chart per categorical column.
cats = ['Collection Type', 'PEST NAME', 'Location']
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 11))
for panel, column in zip(axes, cats):
    # Count once and reuse the result for both the wedge sizes and the labels.
    counts = df[column].value_counts()
    panel.pie(counts, labels=counts.index, autopct='%0.2f%%')
    panel.set_title("The most popular and frequent {}".format(column))
plt.tight_layout()
plt.show()

<div style='font-size:150%'><h1 style='text-align:center;
           font-family:cursive;'> <b>Numerical features grouped by Pest Types</b>
</h1>
    </div>

<div style='font-size:150%;
            font-family:cursive;'>
    <p style='text-align:center;'>Approximate weather conditions in which the listed pests survive</p>
    </div>

In [None]:
# Draw the per-pest figure for every numeric feature in turn.
for feature in numericals:
    plots(df, feature)

<div style='font-size:150%'><h1 style='text-align:center;
           font-family:cursive'>
    <b>Most common pests by region</b>
</h1>
    </div>

In [None]:
# One pie chart per location showing the share of each pest recorded there.
# The grid is derived from the number of distinct locations rather than fixed
# at 2x3: the fixed grid raised IndexError with fewer than six locations and
# silently skipped any location beyond the sixth.
# NaN is dropped because `df['Location'] == NaN` never matches any row.
locations = df['Location'].dropna().unique()
n_cols = 3
n_rows = max(1, (len(locations) + n_cols - 1) // n_cols)  # ceiling division
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,
                         figsize=(15, 5 * n_rows), squeeze=False)

for ax, location in zip(axes.flat, locations):
    # Count once and reuse the result for both the wedge sizes and the labels.
    pest_counts = df.loc[df['Location'] == location, 'PEST NAME'].value_counts()
    ax.pie(pest_counts, labels=pest_counts.index, autopct='%0.2f%%')
    ax.set_title("Pests that are popular in {}".format(location))

# Hide grid cells left over when the location count is not a multiple of n_cols.
for ax in axes.flat[len(locations):]:
    ax.set_visible(False)

fig.suptitle("Popular and most common pests in each region", size=15)
fig.tight_layout()
plt.show()