<a href="https://colab.research.google.com/github/biruk-tafese/MachineLearingIntern/blob/main/Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Dataset uploading**

In [8]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [10]:
crop_data = pd.read_csv("crop_recommendation.csv")

In [None]:
crop_data.head()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice


# **Data cleaning:**

*   handle missing values:
    -imputation
    -dropping
*   handle duplicates



In [None]:
crop_data.shape

(2200, 8)

In [None]:
crop_data.isnull().sum()

N              0
P              0
K              0
temperature    0
humidity       0
ph             0
rainfall       0
label          0
dtype: int64

In [None]:
crop_data.duplicated().sum()

0

# **Data Transformation**

1.Feature Scaling - Normalization and Standardization:

- **Normalization** scales the data to a range of [0, 1]
- standardization transforms data to have zero mean and unit variance.

In [11]:
from sklearn.preprocessing import MinMaxScaler

In [12]:
# Extract numerical columns for scaling
numerical_columns = ['N','P','K','temperature','humidity','ph','rainfall']

# Perform normalization
scaler_minmax = MinMaxScaler()
crop_data[numerical_columns] = scaler_minmax.fit_transform(crop_data[numerical_columns])

print("Normalized DataFrame:\n", crop_data)

Normalized DataFrame:
              N         P      K  temperature  humidity        ph  rainfall  \
0     0.642857  0.264286  0.190     0.345886  0.790267  0.466264  0.656458   
1     0.607143  0.378571  0.180     0.371445  0.770633  0.549480  0.741675   
2     0.428571  0.357143  0.195     0.406854  0.793977  0.674219  0.875710   
3     0.528571  0.214286  0.175     0.506901  0.768751  0.540508  0.799905   
4     0.557143  0.264286  0.185     0.324378  0.785626  0.641291  0.871231   
...        ...       ...    ...          ...       ...       ...       ...   
2195  0.764286  0.207143  0.135     0.515037  0.608410  0.509353  0.566064   
2196  0.707143  0.071429  0.110     0.533473  0.494359  0.401561  0.386972   
2197  0.842857  0.200000  0.125     0.439202  0.617880  0.444433  0.550071   
2198  0.835714  0.192857  0.145     0.500627  0.441760  0.506045  0.384280   
2199  0.742857  0.092857  0.125     0.424029  0.538222  0.509317  0.433721   

       label  
0       rice  
1       ri

2.Feature Encoding:

- One-Hot Encoding creates binary columns for each category
- **Label Encoding** assigns integer labels to categories

In [14]:
from sklearn.preprocessing import LabelEncoder

In [15]:
dict_crop = {
    'rice': 1,'maize': 2,'jute': 3,'cotton': 4,'coconut': 5,'papaya': 6,'orange': 7,'apple': 8,'muskmelon': 9,'watermelon': 10,'grapes': 11,'mango': 12,'banana': 13,'pomegranate': 14,'lentil': 15,'blackgram': 16,'mungbean': 17,'mothbeans': 18,'pigeonpeas': 19,'kidneybeans': 20, 'chickpea': 21,'coffee': 22
}
crop_data['num_crop'] = crop_data['label'].map(dict_crop)


In [16]:
crop_data['num_crop'].value_counts()

1     100
2     100
3     100
4     100
5     100
6     100
7     100
8     100
9     100
10    100
11    100
12    100
13    100
14    100
15    100
16    100
17    100
18    100
19    100
20    100
21    100
22    100
Name: num_crop, dtype: int64

# **Data Splitting**

In [17]:
independent = crop_data.drop(['num_crop', 'label'], axis=1)
dependent = crop_data['num_crop']

In [None]:
independent

In [18]:
dependent

0        1
1        1
2        1
3        1
4        1
        ..
2195    22
2196    22
2197    22
2198    22
2199    22
Name: num_crop, Length: 2200, dtype: int64

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
dependent_train, dependent_test, independent_train, independent_test = train_test_split(dependent, independent, test_size=0.2, random_state=42)

In [22]:
dependent_train.shape


(1760,)

In [None]:
dependent_test.shape

In [None]:
independent_train.shape

In [23]:
dependent_train

1656     7
752     16
892     15
1041    13
1179    12
        ..
1638     7
1095    13
1130    12
1294    11
860     15
Name: num_crop, Length: 1760, dtype: int64

In [24]:
independent_train

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall
1656,0.121429,0.078571,0.045,0.217234,0.909006,0.485322,0.297227
752,0.264286,0.528571,0.070,0.537110,0.642643,0.565941,0.176748
892,0.050000,0.485714,0.100,0.536479,0.570134,0.588352,0.089805
1041,0.721429,0.464286,0.215,0.474462,0.708950,0.390017,0.346119
1179,0.000000,0.085714,0.125,0.764684,0.393289,0.431452,0.278713
...,...,...,...,...,...,...,...
1638,0.071429,0.000000,0.000,0.355451,0.899347,0.670741,0.333295
1095,0.771429,0.635714,0.210,0.531809,0.819938,0.448294,0.253643
1130,0.078571,0.221429,0.130,0.547921,0.437703,0.461981,0.287579
1294,0.078571,0.850000,0.995,0.132116,0.767678,0.444205,0.183903


# **Data Correlation**


In [25]:
cor = crop_data.corr()
cor

  cor = crop_data.corr()


Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,num_crop
N,1.0,-0.23146,-0.140512,0.026504,0.190688,0.096683,0.05902,-0.317076
P,-0.23146,1.0,0.736232,-0.127541,-0.118734,-0.138019,-0.063839,0.109993
K,-0.140512,0.736232,1.0,-0.160387,0.190859,-0.169503,-0.053461,-0.089209
temperature,0.026504,-0.127541,-0.160387,1.0,0.20532,-0.017795,-0.030084,-0.031383
humidity,0.190688,-0.118734,0.190859,0.20532,1.0,-0.008483,0.094423,-0.640925
ph,0.096683,-0.138019,-0.169503,-0.017795,-0.008483,1.0,-0.109069,0.057527
rainfall,0.05902,-0.063839,-0.053461,-0.030084,0.094423,-0.109069,1.0,-0.311053
num_crop,-0.317076,0.109993,-0.089209,-0.031383,-0.640925,0.057527,-0.311053,1.0


In [None]:
#import seaborn as sns
sns.heatmap(cor,annot=True,cbar=True, cmap='coolwarm')

# Feature Selection part

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Your existing code
dict_crop = {
    'rice': 1,'maize': 2,'jute': 3,'cotton': 4,'coconut': 5,'papaya': 6,'orange': 7,'apple': 8,'muskmelon': 9,'watermelon': 10,'grapes': 11,'mango': 12,'banana': 13,'pomegranate': 14,'lentil': 15,'blackgram': 16,'mungbean': 17,'mothbeans': 18,'pigeonpeas': 19,'kidneybeans': 20, 'chickpea': 21,'coffee': 22
}
crop_data['num_crop'] = crop_data['label'].map(dict_crop)

# Additional code for one-hot encoding
numeric_features = crop_data.iloc[:, :-1]  # Separate numeric columns
categorical_feature = crop_data.iloc[:, -1]  # Get the categorical column

# Label encode the categorical column
label_encoder = LabelEncoder()
categorical_encoded = label_encoder.fit_transform(categorical_feature)

# One-hot encode the categorical data
onehot_encoder = OneHotEncoder()
categorical_encoded = categorical_encoded.reshape(-1, 1)
onehot_encoded = onehot_encoder.fit_transform(categorical_encoded).toarray()

# Combine the encoded categorical data with the numeric data
encoded_data = pd.concat([numeric_features, pd.DataFrame(onehot_encoded)], axis=1)

print(encoded_data)