# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Reading the CSV Files

In [2]:
# Read the train and test CSVs (in that order) into two DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../datasets/train.csv', '../datasets/test.csv')
)

In [3]:
# (rows, columns) of the train frame, then of the test frame.
tuple(frame.shape for frame in (train_df, test_df))

((750000, 10), (250000, 9))

In [11]:
# Preview the first five training rows.
train_df.head(n = 5)

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,0,37,70,36,Clayey,Sugarcane,36,4,5,28-28
1,1,27,69,65,Sandy,Millets,30,6,18,28-28
2,2,29,63,32,Sandy,Millets,24,12,16,17-17-17
3,3,35,62,54,Sandy,Barley,39,12,4,10-26-26
4,4,35,58,43,Red,Paddy,37,2,16,DAP


In [12]:
# Preview the first five test rows.
test_df.head(n = 5)

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous
0,750000,31,70,52,Sandy,Wheat,34,11,24
1,750001,27,62,45,Red,Sugarcane,30,14,15
2,750002,28,72,28,Clayey,Ground Nuts,14,15,4
3,750003,37,53,57,Black,Ground Nuts,18,17,36
4,750004,31,55,32,Red,Pulses,13,19,14


In [13]:
# Per-column count of missing values, train first and test second.
train_df.isna().sum(), test_df.isna().sum()

(id                 0
 Temparature        0
 Humidity           0
 Moisture           0
 Soil Type          0
 Crop Type          0
 Nitrogen           0
 Potassium          0
 Phosphorous        0
 Fertilizer Name    0
 dtype: int64,
 id             0
 Temparature    0
 Humidity       0
 Moisture       0
 Soil Type      0
 Crop Type      0
 Nitrogen       0
 Potassium      0
 Phosphorous    0
 dtype: int64)

In [14]:
# Count repeated rows while ignoring `id`. In the head() previews `id` increases
# row by row, so it is presumably a unique row identifier (TODO confirm); with it
# included, whole-row comparison could never flag a duplicate and the check
# would always report 0.
train_df.drop(columns = 'id').duplicated().sum(), test_df.drop(columns = 'id').duplicated().sum()

(np.int64(0), np.int64(0))

In [15]:
# Stack the train features (target column removed) on top of the test rows.
# pd.concat is used instead of an outer pd.merge: a merge without `on=` joins on
# every shared column, orders the result by those keys, and folds rows that are
# identical in both frames into a single row, so neither the row order nor the
# row count is guaranteed. concat keeps all train rows first, then all test rows.
df_merged = pd.concat(
    [train_df.drop(columns = 'Fertilizer Name'), test_df],
    axis = 0,
    ignore_index = True,
)

# Every input row must survive the stacking.
assert len(df_merged) == len(train_df) + len(test_df)

In [16]:
df_merged.shape

(1000000, 9)

# Encoding

In [22]:
# Labels of the object-dtype columns in the combined frame.
cat_cols = df_merged.select_dtypes(include = 'object').columns

# Print the frequency table of each of those columns, followed by a
# separator line and a blank line.
for col_name in cat_cols:
    category_counts = df_merged[col_name].value_counts()
    print(category_counts, '______________________', '', sep = '\n')

Soil Type
Sandy     209033
Black     201103
Clayey    198174
Red       197461
Loamy     194229
Name: count, dtype: int64
______________________

Crop Type
Paddy          114086
Pulses         104401
Cotton          92262
Tobacco         90728
Wheat           88722
Millets         87180
Barley          86761
Sugarcane       85836
Oil seeds       85711
Maize           83472
Ground Nuts     80841
Name: count, dtype: int64
______________________



## One-Hot Encoding

In [23]:
from sklearn.preprocessing import OneHotEncoder

# NOTE: this cell overwrites df_merged with the encoded frame, so it cannot be
# re-run on its own (the columns in cat_cols are gone afterwards); re-run from
# the cell that builds df_merged instead.

# Dense output so the encoded matrix can be wrapped in a DataFrame directly.
encoder = OneHotEncoder(sparse_output = False)

encoded = encoder.fit_transform(df_merged[cat_cols])

# Reuse df_merged's index: pd.concat(axis = 1) aligns on index labels, so giving
# encoded_df a fresh 0..n-1 index would misalign rows (and introduce NaNs)
# whenever df_merged does not carry the default index.
encoded_df = pd.DataFrame(
    encoded,
    columns = encoder.get_feature_names_out(cat_cols),
    index = df_merged.index,
)

# Swap the original categorical columns for their one-hot indicator columns.
df_merged = pd.concat([df_merged.drop(columns = cat_cols), encoded_df], axis = 1)

In [24]:
# Preview the first five rows of the encoded frame.
df_merged.head(n = 5)

Unnamed: 0,id,Temparature,Humidity,Moisture,Nitrogen,Potassium,Phosphorous,Soil Type_Black,Soil Type_Clayey,Soil Type_Loamy,...,Crop Type_Cotton,Crop Type_Ground Nuts,Crop Type_Maize,Crop Type_Millets,Crop Type_Oil seeds,Crop Type_Paddy,Crop Type_Pulses,Crop Type_Sugarcane,Crop Type_Tobacco,Crop Type_Wheat
0,0,37,70,36,36,4,5,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1,27,69,65,30,6,18,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,29,63,32,24,12,16,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,35,62,54,39,12,4,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,35,58,43,37,2,16,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [25]:
# List the column labels of df_merged to check the one-hot columns are present.
df_merged.columns

Index(['id', 'Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium',
       'Phosphorous', 'Soil Type_Black', 'Soil Type_Clayey', 'Soil Type_Loamy',
       'Soil Type_Red', 'Soil Type_Sandy', 'Crop Type_Barley',
       'Crop Type_Cotton', 'Crop Type_Ground Nuts', 'Crop Type_Maize',
       'Crop Type_Millets', 'Crop Type_Oil seeds', 'Crop Type_Paddy',
       'Crop Type_Pulses', 'Crop Type_Sugarcane', 'Crop Type_Tobacco',
       'Crop Type_Wheat'],
      dtype='object')