In [None]:
import pandas as pd
import numpy as np
from scipy import stats

In [None]:
# Read the raw crop/irrigation sheet from the Colab working directory and
# preview the resulting frame.
df = pd.read_csv(filepath_or_buffer="/content/dataset - Sheet1.csv")
df

Unnamed: 0,CropType,CropDays,SoilMoisture,temperature,Humidity,Irrigation
0,Wheat,10.0,400.0,30.0,15.0,0.0
1,Wheat,7.0,200.0,30.0,32.0,0.0
2,Wheat,9.0,300.0,21.0,28.0,0.0
3,Wheat,3.0,500.0,40.0,22.0,0.0
4,Wheat,2.0,700.0,23.0,34.0,0.0
...,...,...,...,...,...,...
519,Coffee,93.0,675.0,25.0,19.0,1.0
520,Coffee,95.0,210.0,23.0,17.0,0.0
521,Coffee,97.0,398.0,25.0,18.0,0.0
522,Coffee,99.0,678.0,24.0,18.0,1.0


In [None]:
# Clean `df` in place: discard every row that has a missing value, then
# discard exact duplicate rows (the first occurrence of each is kept).
df.dropna(axis=0, how="any", inplace=True)
df.drop_duplicates(keep="first", inplace=True)
df

Unnamed: 0,CropType,CropDays,SoilMoisture,temperature,Humidity,Irrigation
0,Wheat,10.0,400.0,30.0,15.0,0.0
1,Wheat,7.0,200.0,30.0,32.0,0.0
2,Wheat,9.0,300.0,21.0,28.0,0.0
3,Wheat,3.0,500.0,40.0,22.0,0.0
4,Wheat,2.0,700.0,23.0,34.0,0.0
...,...,...,...,...,...,...
519,Coffee,93.0,675.0,25.0,19.0,1.0
520,Coffee,95.0,210.0,23.0,17.0,0.0
521,Coffee,97.0,398.0,25.0,18.0,0.0
522,Coffee,99.0,678.0,24.0,18.0,1.0


In [None]:
# Partition `df` by crop: one sub-frame per distinct "CropType" value, keyed
# by that value. Each sub-frame keeps the row labels it had in `df`.
crop_types = df["CropType"].unique()
dfs = {
  crop_type: df.loc[df["CropType"].eq(crop_type)]
  for crop_type in crop_types
}

In [None]:
# Bind each crop's sub-frame to a short alias (df1 .. df9, in the order
# listed below), then preview the wheat rows.
df1, df2, df3, df4, df5, df6, df7, df8, df9 = (
  dfs[crop_name]
  for crop_name in (
    'Wheat', 'Groundnuts', 'Garden Flowers', 'Maize', 'Paddy',
    'Potato', 'Pulse', 'Sugarcane', 'Coffee',
  )
)
df1

Unnamed: 0,CropType,CropDays,SoilMoisture,temperature,Humidity,Irrigation
0,Wheat,10.0,400.0,30.0,15.0,0.0
1,Wheat,7.0,200.0,30.0,32.0,0.0
2,Wheat,9.0,300.0,21.0,28.0,0.0
3,Wheat,3.0,500.0,40.0,22.0,0.0
4,Wheat,2.0,700.0,23.0,34.0,0.0
...,...,...,...,...,...,...
76,Wheat,156.0,880.0,34.0,33.0,0.0
77,Wheat,160.0,980.0,23.0,22.0,0.0
78,Wheat,165.0,775.0,32.0,21.0,0.0
79,Wheat,166.0,832.0,33.0,32.0,0.0


In [None]:
def check_outliers(df, column_name, threshold=3):
  """Return the rows of ``df`` whose ``column_name`` value is a z-score outlier.

  A row is reported when the absolute z-score of its value, computed over
  the whole column with ``scipy.stats.zscore``, is strictly greater than
  ``threshold``.

  Args:
    df: DataFrame to inspect; it is not modified.
    column_name: Name of a numeric column in ``df``.
    threshold: Absolute z-score cut-off. Defaults to 3, the value that was
      previously hard-coded.

  Returns:
    DataFrame holding the outlier rows with their original index labels;
    empty when no value exceeds the threshold.

  Raises:
    KeyError: If ``column_name`` is not a column of ``df``.
  """
  values = df[column_name]
  if len(values) == 0:
    # Nothing to score; hand back an empty frame with the same columns.
    return df.iloc[0:0]
  # nan_policy='omit' keeps a single missing value from turning every z-score
  # into NaN, which would silently report "no outliers" for the whole column.
  z_scores = np.asarray(stats.zscore(values, nan_policy='omit'), dtype=float)
  # NaN z-scores (missing values, zero-variance columns) compare False here,
  # so those rows are never reported as outliers.
  is_outlier = np.abs(z_scores) > threshold
  return df[is_outlier]

In [None]:
# Remove per-crop outliers. Each attribute is screened in turn and flagged
# rows are dropped before the next attribute is scored, so later z-scores are
# computed on the already-trimmed crop frame.
crop_types = df["CropType"].unique()

# One frame of removed rows per (crop, attribute) pair that had outliers.
arr = []
attributes = ['CropDays', 'SoilMoisture', 'temperature', 'Humidity']

for crop_type in crop_types:
  # Start from the untrimmed rows of `df` rather than the current `dfs`
  # entry: re-running this cell then gives the same result instead of
  # stripping a further layer of "outliers" from already-cleaned data.
  crop_df = df[df["CropType"] == crop_type]
  for j in attributes:
    outliers = check_outliers(crop_df, j)
    if outliers.shape[0] > 0:
      arr.append(outliers)
      crop_df = crop_df.drop(outliers.index)
  dfs[crop_type] = crop_df

# Show every removed row as one table (rich display) instead of printing a
# list of frames; fall back to an empty frame when nothing was removed.
pd.concat(arr) if arr else df.iloc[0:0]

[   CropType  CropDays  SoilMoisture  temperature  Humidity  Irrigation
16    Wheat      20.0         330.0         31.0      80.0         0.0,            CropType  CropDays  SoilMoisture  temperature  Humidity  Irrigation
176  Garden Flowers     200.0         200.0         19.0      69.0         0.0,            CropType  CropDays  SoilMoisture  temperature  Humidity  Irrigation
136  Garden Flowers      13.0         350.0         80.0      64.0         1.0,     CropType  CropDays  SoilMoisture  temperature  Humidity  Irrigation
227    Maize      96.0         220.0        263.0      12.0         0.0,     CropType  CropDays  SoilMoisture  temperature  Humidity  Irrigation
201    Maize      44.0          14.0         20.0     120.0         0.0,     CropType  CropDays  SoilMoisture  temperature  Humidity  Irrigation
235    Paddy       7.0         250.0         63.0      19.0         0.0,     CropType  CropDays  SoilMoisture  temperature  Humidity  Irrigation
255    Paddy      36.0         

In [None]:
# Stitch the per-crop frames back together, in `crop_types` order, and
# renumber the rows from 0.
cleaned_df = pd.concat(
  [dfs[crop_type] for crop_type in crop_types],
  axis=0,
  ignore_index=True,
)
cleaned_df

Unnamed: 0,CropType,CropDays,SoilMoisture,temperature,Humidity,Irrigation
0,Wheat,10.0,400.0,30.0,15.0,0.0
1,Wheat,7.0,200.0,30.0,32.0,0.0
2,Wheat,9.0,300.0,21.0,28.0,0.0
3,Wheat,3.0,500.0,40.0,22.0,0.0
4,Wheat,2.0,700.0,23.0,34.0,0.0
...,...,...,...,...,...,...
495,Coffee,93.0,675.0,25.0,19.0,1.0
496,Coffee,95.0,210.0,23.0,17.0,0.0
497,Coffee,97.0,398.0,25.0,18.0,0.0
498,Coffee,99.0,678.0,24.0,18.0,1.0


In [None]:
# Write the cleaned data to cropdata.csv, leaving out the row-index column.
cleaned_df.to_csv(path_or_buf='/content/cropdata.csv', index=False)