In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the dataset
file_path = r"C:\Users\adyas\Desktop\crop yield prediction\CROP-YIELD-ONLY-KHARIF-FINAL.csv"
crop_data = pd.read_csv(file_path)

In [3]:
# Defining features (X) and target variable (y)
X = crop_data.drop(columns=['Yield'])  # Assuming 'Yield' is the target variable
y = crop_data['Yield']

In [4]:
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# One-hot encoding categorical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns
encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
X_train_encoded = pd.DataFrame(encoder.fit_transform(X_train[categorical_cols]).toarray())
X_test_encoded = pd.DataFrame(encoder.transform(X_test[categorical_cols]).toarray())



In [6]:
# Reassign column names
encoded_column_names = encoder.get_feature_names_out(input_features=categorical_cols)
X_train_encoded.columns = encoded_column_names
X_test_encoded.columns = encoded_column_names

In [7]:
# Drop original categorical columns
X_train = X_train.drop(columns=categorical_cols)
X_test = X_test.drop(columns=categorical_cols)

In [8]:
# Concatenate encoded categorical columns with the remaining features
X_train = pd.concat([X_train, X_train_encoded], axis=1)
X_test = pd.concat([X_test, X_test_encoded], axis=1)

In [9]:
# Clipping extreme values in numerical features
max_threshold = 1e6  # Choose an appropriate threshold based on your data
numerical_features = X_train.select_dtypes(include=['float64', 'int64']).columns
X_train[numerical_features] = X_train[numerical_features].clip(upper=max_threshold, axis=1)
X_test[numerical_features] = X_test[numerical_features].clip(upper=max_threshold, axis=1)

In [10]:
# Applying RobustScaler to numerical features
scaler = RobustScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

In [11]:
# Define preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols)],
    remainder='passthrough'
)

# Define the model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])


In [12]:
# Train the model
model.fit(X_train, y_train)

# Predictions on the testing set
y_pred = model.predict(X_test)

ValueError: A given column is not a column of the dataframe

In [13]:
# Inspect column names
print(crop_data.columns)

# Check if 'Crop' is in column names
print('Crop' in crop_data.columns)


Index(['Crop', 'Season', 'State', 'Area', 'Production', 'Annual_Rainfall',
       'Fertilizer', 'Pesticide', 'Yield'],
      dtype='object')
True


In [14]:

# Defining features (X) and target variable (y)
X = crop_data.drop(columns=['Yield'])  # Assuming 'Yield' is the target variable
y = crop_data['Yield']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [15]:
# Check if identified categorical columns are present in the dataset
for col in categorical_cols:
    if col not in crop_data.columns:
        print(f"Column '{col}' is not present in the dataset.")


In [16]:
# One-hot encode categorical columns
encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
X_train_encoded = pd.DataFrame(encoder.fit_transform(X_train[categorical_cols]).toarray())
X_test_encoded = pd.DataFrame(encoder.transform(X_test[categorical_cols]).toarray())



In [17]:
# Reassign column names
encoded_column_names = encoder.get_feature_names_out(input_features=categorical_cols)
X_train_encoded.columns = encoded_column_names
X_test_encoded.columns = encoded_column_names

In [18]:
# Drop original categorical columns
X_train = X_train.drop(columns=categorical_cols)
X_test = X_test.drop(columns=categorical_cols)


In [19]:
# Concatenate encoded categorical columns with the remaining features
X_train = pd.concat([X_train, X_train_encoded], axis=1)
X_test = pd.concat([X_test, X_test_encoded], axis=1)

In [20]:
# Clipping extreme values in numerical features
max_threshold = 1e6  # Choose an appropriate threshold based on your data
X_train[numerical_features] = X_train[numerical_features].clip(upper=max_threshold, axis=1)
X_test[numerical_features] = X_test[numerical_features].clip(upper=max_threshold, axis=1)

In [21]:
# Applying RobustScaler to numerical features
scaler = RobustScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Initialize the Linear Regression model
linear_reg = LinearRegression()

# Train the model on the preprocessed training data
linear_reg.fit(X_train, y_train)

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [23]:
from sklearn.impute import SimpleImputer

# Initialize SimpleImputer with mean strategy
imputer = SimpleImputer(strategy='mean')

# Fit and transform imputer on training data
X_train_imputed = imputer.fit_transform(X_train)

# Transform test data using the same imputer
X_test_imputed = imputer.transform(X_test)

# Now, train your model using X_train_imputed and y_train


In [24]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Initialize the Linear Regression model
linear_reg = LinearRegression()

# Train the model on the imputed training data
linear_reg.fit(X_train_imputed, y_train)

# Predictions on the testing set
y_pred = linear_reg.predict(X_test_imputed)

# Model evaluation
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R-squared:", r2_score(y_test, y_pred))


ValueError: Found input variables with inconsistent numbers of samples: [7887, 6585]

In [25]:
# Reuse the same encoder to transform the test data
X_test_encoded = pd.DataFrame(encoder.transform(X_test[categorical_cols]).toarray())
encoded_column_names = encoder.get_feature_names_out(input_features=categorical_cols)
X_test_encoded.columns = encoded_column_names

# Check if all expected categorical columns are present in the test data
missing_cols = set(X_train.columns) - set(X_test.columns)
for col in missing_cols:
    X_test_encoded[col] = 0  # Add missing columns with all zeros

# Ensure that column names are correctly identified
print("Training set column names:", X_train.columns)
print("Test set column names:", X_test.columns)

# Verify the shapes of X_train_imputed and y_train
print("Shape of X_train_imputed:", X_train_imputed.shape)
print("Shape of y_train:", y_train.shape)

KeyError: "None of [Index(['Crop', 'Season', 'State'], dtype='object')] are in the [columns]"

In [26]:
# Print column names in X_test
print("Column names in X_test:", X_test.columns)

# Verify categorical_cols variable
print("Categorical columns:", categorical_cols)


Column names in X_test: Index(['Area', 'Production', 'Annual_Rainfall', 'Fertilizer', 'Pesticide',
       'Crop_Arhar/Tur', 'Crop_Bajra', 'Crop_Banana', 'Crop_Barley',
       'Crop_Black pepper', 'Crop_Cardamom', 'Crop_Cashewnut',
       'Crop_Castor seed', 'Crop_Coconut ', 'Crop_Coriander',
       'Crop_Cotton(lint)', 'Crop_Cowpea(Lobia)', 'Crop_Dry chillies',
       'Crop_Garlic', 'Crop_Ginger', 'Crop_Gram', 'Crop_Groundnut',
       'Crop_Guar seed', 'Crop_Horse-gram', 'Crop_Jowar', 'Crop_Jute',
       'Crop_Linseed', 'Crop_Maize', 'Crop_Masoor', 'Crop_Mesta',
       'Crop_Moong(Green Gram)', 'Crop_Moth', 'Crop_Niger seed',
       'Crop_Oilseeds total', 'Crop_Onion', 'Crop_Other Cereals',
       'Crop_Other Kharif pulses', 'Crop_Peas & beans (Pulses)', 'Crop_Potato',
       'Crop_Ragi', 'Crop_Rapeseed &Mustard', 'Crop_Rice', 'Crop_Safflower',
       'Crop_Sannhamp', 'Crop_Sesamum', 'Crop_Small millets', 'Crop_Soyabean',
       'Crop_Sugarcane', 'Crop_Sunflower', 'Crop_Sweet potato', 

In [27]:
# Update categorical_cols variable to match the columns in X_test
categorical_cols = X_test.columns[X_test.columns.str.startswith('Crop_') | X_test.columns.str.startswith('State_')]


In [28]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Define numerical and categorical features
numerical_features = ['Area', 'Production', 'Annual_Rainfall', 'Fertilizer', 'Pesticide']
categorical_features = [col for col in X_train.columns if col not in numerical_features]

# Define preprocessing pipeline
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features)
    ], 
    remainder='passthrough'
)

# Preprocess training and test data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Define and train the model
model = LinearRegression()
model.fit(X_train_preprocessed, y_train)

# Make predictions
y_pred = model.predict(X_test_preprocessed)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [29]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Define numerical and categorical features
numerical_features = ['Area', 'Production', 'Annual_Rainfall', 'Fertilizer', 'Pesticide']
categorical_features = [col for col in X_train.columns if col not in numerical_features]

# Define preprocessing pipeline
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features)
    ], 
    remainder='passthrough'
)

# Preprocess training and test data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Define and train the model
model = LinearRegression()
model.fit(X_train_preprocessed, y_train)

# Make predictions
y_pred = model.predict(X_test_preprocessed)

#


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [30]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Define numerical and categorical features
numerical_features = ['Area', 'Production', 'Annual_Rainfall', 'Fertilizer', 'Pesticide']
categorical_features = [col for col in X_train.columns if col not in numerical_features]

# Define preprocessing pipeline
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Preprocess training and test data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Define and train the model
model = LinearRegression()
model.fit(X_train_preprocessed, y_train)

# Make predictions
y_pred = model.predict(X_test_preprocessed)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)


ValueError: Found input variables with inconsistent numbers of samples: [7887, 6585]

In [31]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Define numerical and categorical features
numerical_features = ['Area', 'Production', 'Annual_Rainfall', 'Fertilizer', 'Pesticide']
categorical_features = [col for col in X_train.columns if col not in numerical_features]

# Define preprocessing pipeline
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Preprocess training and test data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Define and train the model
model = LinearRegression()
model.fit(X_train_preprocessed, y_train)

# Make predictions
y_pred = model.predict(X_test_preprocessed)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)


ValueError: Found input variables with inconsistent numbers of samples: [7887, 6585]

In [32]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Define numerical and categorical features
numerical_features = ['Area', 'Production', 'Annual_Rainfall', 'Fertilizer', 'Pesticide']
categorical_features = [col for col in X_train.columns if col not in numerical_features]

# Define preprocessing pipeline
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Preprocess training and test data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Define and train the model
model = LinearRegression()
model.fit(X_train_preprocessed, y_train)

# Make predictions
y_pred = model.predict(X_test_preprocessed)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)


ValueError: Found input variables with inconsistent numbers of samples: [7887, 6585]