In [1]:
# from pyspark.sql import SparkSession
# from pyngrok import ngrok
# !ngrok config add-authtoken 2od7Q3DgoNRsx9n0LjNOwHGFzT9_4hYE1ar5x3u7hj5Dfsw9f
# # Create a Spark session
# spark = SparkSession.builder.appName("CropYieldAnalysis").config("spark.ui.port","4050").getOrCreate()

# public_url = ngrok.connect(4050, "http")
# print(f"Access Spark UI at: {public_url}")

In [None]:
from pyspark.sql import SparkSession
# Initialize SparkSession
spark = SparkSession.builder.appName("CropYieldAnalysis").getOrCreate()

# Load the CSV file
df = spark.read.csv("uncleaned_crop_yield.csv", header=True, inferSchema=False)
# Display initial data
print("Initial Data:")
df.show(5)

Initial Data:
+------+---------+-------+-----------------+-------------------+---------------+---------------+-----------------+---------------+----------------------+
|Region|Soil_Type|   Crop|      Rainfall_mm|Temperature_Celsius|Fertilizer_Used|Irrigation_Used|Weather_Condition|Days_to_Harvest|Yield_tons_per_hectare|
+------+---------+-------+-----------------+-------------------+---------------+---------------+-----------------+---------------+----------------------+
|  West|    Sandy| Cotton|897.0772391101236| 27.676966373377603|          False|           True|           Cloudy|          122.0|     6.555816258223593|
| South|     Clay|   Rice|992.6732816189208|  18.02614225436302|           True|           True|          Rainy  |          140.0|       8.5273409063236|
| North|     Loam| Barley|147.9980252926104|  29.79404241557257|          False|          False|            Sunny|          106.0|                  NULL|
| noRTH|    Sandy|Soybean|986.8663313367324|  16.6441901913772

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 52489)
Traceback (most recent call last):
  File "c:\Users\DARSHAN SONAWANE\AppData\Local\Programs\Python\Python311\Lib\socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "c:\Users\DARSHAN SONAWANE\AppData\Local\Programs\Python\Python311\Lib\socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "c:\Users\DARSHAN SONAWANE\AppData\Local\Programs\Python\Python311\Lib\socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "c:\Users\DARSHAN SONAWANE\AppData\Local\Programs\Python\Python311\Lib\socketserver.py", line 755, in __init__
    self.handle()
  File "c:\Users\DARSHAN SONAWANE\AppData\Local\Programs\Python\Python311\Lib\site-packages\pyspark\accumulators.py", line 295, in handle
    poll(accum_updates)
  File "c:\

In [3]:
# Drop duplicate rows
df = df.dropDuplicates()

# Display schema and information about the DataFrame
print("DataFrame Info:")
df.printSchema()

# Show a summary of the DataFrame (similar to .info() in Pandas)
print("Data Summary:")
df.describe().show()

DataFrame Info:
root
 |-- Region: string (nullable = true)
 |-- Soil_Type: string (nullable = true)
 |-- Crop: string (nullable = true)
 |-- Rainfall_mm: string (nullable = true)
 |-- Temperature_Celsius: string (nullable = true)
 |-- Fertilizer_Used: string (nullable = true)
 |-- Irrigation_Used: string (nullable = true)
 |-- Weather_Condition: string (nullable = true)
 |-- Days_to_Harvest: string (nullable = true)
 |-- Yield_tons_per_hectare: string (nullable = true)

Data Summary:
+-------+--------+----------+----------+------------------+-------------------+---------------+---------------+-----------------+------------------+----------------------+
|summary|  Region| Soil_Type|      Crop|       Rainfall_mm|Temperature_Celsius|Fertilizer_Used|Irrigation_Used|Weather_Condition|   Days_to_Harvest|Yield_tons_per_hectare|
+-------+--------+----------+----------+------------------+-------------------+---------------+---------------+-----------------+------------------+-------------------

In [4]:
from pyspark.sql.functions import col, sum

# Calculate the sum of null values for each column
null_counts = df.select([
    sum(col(c).isNull().cast("int")).alias(c) for c in df.columns
])

# Show the result
null_counts.show()


+------+---------+----+-----------+-------------------+---------------+---------------+-----------------+---------------+----------------------+
|Region|Soil_Type|Crop|Rainfall_mm|Temperature_Celsius|Fertilizer_Used|Irrigation_Used|Weather_Condition|Days_to_Harvest|Yield_tons_per_hectare|
+------+---------+----+-----------+-------------------+---------------+---------------+-----------------+---------------+----------------------+
|     0|        0|   0|      50000|              49999|              0|              0|                0|          50000|                 50000|
+------+---------+----+-----------+-------------------+---------------+---------------+-----------------+---------------+----------------------+



In [5]:
from pyspark.sql.functions import col

# List of columns to change datatype (all columns or specify desired ones)
columns_to_convert = ["Rainfall_mm", "Temperature_Celsius", "Yield_tons_per_hectare"]

# Convert columns from string to double
for col_name in columns_to_convert:
    df = df.withColumn(col_name, col(col_name).cast("double"))

# Verify the schema after conversion
df.printSchema()


root
 |-- Region: string (nullable = true)
 |-- Soil_Type: string (nullable = true)
 |-- Crop: string (nullable = true)
 |-- Rainfall_mm: double (nullable = true)
 |-- Temperature_Celsius: double (nullable = true)
 |-- Fertilizer_Used: string (nullable = true)
 |-- Irrigation_Used: string (nullable = true)
 |-- Weather_Condition: string (nullable = true)
 |-- Days_to_Harvest: string (nullable = true)
 |-- Yield_tons_per_hectare: double (nullable = true)



In [6]:
# Handle missing values
# Fill numeric columns with their mean
from pyspark.sql.functions import col

numeric_columns = [col_name for col_name, dtype in df.dtypes if dtype in ("int", "double")]
for col_name in numeric_columns:
    mean_value = df.select(col_name).groupBy().avg(col_name).first()[0]
    df = df.fillna({col_name: mean_value})

# Fill categorical columns with a placeholder
categorical_columns = [col_name for col_name, dtype in df.dtypes if dtype == "string"]
df = df.fillna({col_name: "Unknown" for col_name in categorical_columns})

# Verify missing values
print("Missing values after filling:")
df.select([col(c).isNull().alias(c) for c in df.columns]).show()

Missing values after filling:
+------+---------+-----+-----------+-------------------+---------------+---------------+-----------------+---------------+----------------------+
|Region|Soil_Type| Crop|Rainfall_mm|Temperature_Celsius|Fertilizer_Used|Irrigation_Used|Weather_Condition|Days_to_Harvest|Yield_tons_per_hectare|
+------+---------+-----+-----------+-------------------+---------------+---------------+-----------------+---------------+----------------------+
| false|    false|false|      false|              false|          false|          false|            false|          false|                 false|
| false|    false|false|      false|              false|          false|          false|            false|          false|                 false|
| false|    false|false|      false|              false|          false|          false|            false|          false|                 false|
| false|    false|false|      false|              false|          false|          false|      

In [7]:

# Calculate the sum of null values for each column
null_counts = df.select([
    sum(col(c).isNull().cast("int")).alias(c) for c in df.columns
])

# Show the result
null_counts.show()

+------+---------+----+-----------+-------------------+---------------+---------------+-----------------+---------------+----------------------+
|Region|Soil_Type|Crop|Rainfall_mm|Temperature_Celsius|Fertilizer_Used|Irrigation_Used|Weather_Condition|Days_to_Harvest|Yield_tons_per_hectare|
+------+---------+----+-----------+-------------------+---------------+---------------+-----------------+---------------+----------------------+
|     0|        0|   0|          0|                  0|              0|              0|                0|              0|                     0|
+------+---------+----+-----------+-------------------+---------------+---------------+-----------------+---------------+----------------------+



In [8]:
from pyspark.sql.functions import trim

# Strip whitespace from string columns
for col_name in categorical_columns:
    df = df.withColumn(col_name, trim(col(col_name)))

# Display the DataFrame to verify the result
print("DataFrame after trimming whitespace from string columns:")
df.show(truncate=False)


DataFrame after trimming whitespace from string columns:
+------+---------+-------+------------------+-------------------+---------------+---------------+-----------------+---------------+----------------------+
|Region|Soil_Type|Crop   |Rainfall_mm       |Temperature_Celsius|Fertilizer_Used|Irrigation_Used|Weather_Condition|Days_to_Harvest|Yield_tons_per_hectare|
+------+---------+-------+------------------+-------------------+---------------+---------------+-----------------+---------------+----------------------+
|North |Silt     |Soybean|199.38193686127573|24.40539970164948  |False          |False          |Rainy            |77.0           |2.094286585378294     |
|North |Clay     |Soybean|124.59555905272148|24.381639466718735 |False          |False          |Rainy            |125.0          |1.1297512768654394    |
|East  |Sandy    |Wheat  |983.47603209473   |28.71469155885488  |False          |False          |Cloudy           |104.0          |5.076176030917903     |
|West  |Loam 

In [9]:
from pyspark.sql.functions import col, when, upper, isnan

# Assume `df` is the PySpark DataFrame

# Convert all string columns to uppercase
string_columns = [col_name for col_name, dtype in df.dtypes if dtype == "string"]
for col_name in string_columns:
    df = df.withColumn(col_name, upper(col(col_name)))

# Display a sample of the data after converting strings to uppercase
print("Sample data after converting strings to uppercase:")
df.show(5, truncate=False)

# Count null values for all columns after the transformation
null_counts_after_upper = (
    df.select([when(col(c).isNull() | isnan(c), 1).alias(c) for c in df.columns])
    .groupBy()
    .sum()
)

print("Null counts per column after converting strings to uppercase:")
null_counts_after_upper.show()

# Count the number of rows after the transformation
row_count_after_upper = df.count()
print(f"Number of rows after converting strings to uppercase: {row_count_after_upper}")

# Collect and display a sample of the processed data
processed_data_after_upper = df.take(5)  # Collect the first 5 rows
for row in processed_data_after_upper:
    print(row)


Sample data after converting strings to uppercase:
+------+---------+-------+------------------+-------------------+---------------+---------------+-----------------+---------------+----------------------+
|Region|Soil_Type|Crop   |Rainfall_mm       |Temperature_Celsius|Fertilizer_Used|Irrigation_Used|Weather_Condition|Days_to_Harvest|Yield_tons_per_hectare|
+------+---------+-------+------------------+-------------------+---------------+---------------+-----------------+---------------+----------------------+
|NORTH |SILT     |SOYBEAN|199.38193686127573|24.40539970164948  |FALSE          |FALSE          |RAINY            |77.0           |2.094286585378294     |
|NORTH |CLAY     |SOYBEAN|124.59555905272148|24.381639466718735 |FALSE          |FALSE          |RAINY            |125.0          |1.1297512768654394    |
|EAST  |SANDY    |WHEAT  |983.47603209473   |28.71469155885488  |FALSE          |FALSE          |CLOUDY           |104.0          |5.076176030917903     |
|WEST  |LOAM     |M

In [10]:
from pyspark.sql.types import StringType

# Convert 'Region' column to StringType (already default for strings in PySpark)
df = df.withColumn("Region", col("Region").cast(StringType()))

# Convert 'Crop' column to StringType
df = df.withColumn("Crop", col("Crop").cast(StringType()))

# Display schema to confirm the datatype
df.printSchema()

# Show the updated DataFrame
df.select("Region", "Crop").show()

root
 |-- Region: string (nullable = false)
 |-- Soil_Type: string (nullable = false)
 |-- Crop: string (nullable = false)
 |-- Rainfall_mm: double (nullable = false)
 |-- Temperature_Celsius: double (nullable = false)
 |-- Fertilizer_Used: string (nullable = false)
 |-- Irrigation_Used: string (nullable = false)
 |-- Weather_Condition: string (nullable = false)
 |-- Days_to_Harvest: string (nullable = false)
 |-- Yield_tons_per_hectare: double (nullable = false)

+------+-------+
|Region|   Crop|
+------+-------+
| NORTH|SOYBEAN|
| NORTH|SOYBEAN|
|  EAST|  WHEAT|
|  WEST|  MAIZE|
| SOUTH| COTTON|
|  WEST|  MAIZE|
| NORTH|SOYBEAN|
| NORTH| BARLEY|
| NORTH| BARLEY|
| SOUTH|SOYBEAN|
| NORTH| BARLEY|
|  WEST| COTTON|
|  EAST| COTTON|
|  EAST| COTTON|
| SOUTH|  MAIZE|
| SOUTH| BARLEY|
| SOUTH|   RICE|
| NORTH| BARLEY|
| NORTH| BARLEY|
| SOUTH| BARLEY|
+------+-------+
only showing top 20 rows



In [11]:
from pyspark.sql.functions import round

# Round the 'Rainfall_mm' column to 2 decimal places
df = df.withColumn("Rainfall_mm", round(col("Rainfall_mm"), 2))

# Display the updated DataFrame
df.show(5)

+------+---------+-------+-----------+-------------------+---------------+---------------+-----------------+---------------+----------------------+
|Region|Soil_Type|   Crop|Rainfall_mm|Temperature_Celsius|Fertilizer_Used|Irrigation_Used|Weather_Condition|Days_to_Harvest|Yield_tons_per_hectare|
+------+---------+-------+-----------+-------------------+---------------+---------------+-----------------+---------------+----------------------+
| NORTH|     SILT|SOYBEAN|     199.38|  24.40539970164948|          FALSE|          FALSE|            RAINY|           77.0|     2.094286585378294|
| NORTH|     CLAY|SOYBEAN|      124.6| 24.381639466718735|          FALSE|          FALSE|            RAINY|          125.0|    1.1297512768654394|
|  EAST|    SANDY|  WHEAT|     983.48|  28.71469155885488|          FALSE|          FALSE|           CLOUDY|          104.0|     5.076176030917903|
|  WEST|     LOAM|  MAIZE|     966.01|  30.05385797634982|           TRUE|          FALSE|           CLOUDY|    

In [12]:
from pyspark.sql.functions import col

# Identify numeric columns
numeric_columns = [col_name for col_name, dtype in df.dtypes if dtype in ("int", "double")]

# Loop through numeric columns to filter outliers
for col_name in numeric_columns:
    # Calculate Q1 and Q3 using approxQuantile
    Q1, Q3 = df.approxQuantile(col_name, [0.25, 0.75], relativeError=0.01)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Filter rows within the bounds
    df = df.filter((col(col_name) >= lower_bound) & (col(col_name) <= upper_bound))

# Show the DataFrame information
print("Filtered DataFrame:")
df.printSchema()
df.show()


Filtered DataFrame:
root
 |-- Region: string (nullable = false)
 |-- Soil_Type: string (nullable = false)
 |-- Crop: string (nullable = false)
 |-- Rainfall_mm: double (nullable = true)
 |-- Temperature_Celsius: double (nullable = false)
 |-- Fertilizer_Used: string (nullable = false)
 |-- Irrigation_Used: string (nullable = false)
 |-- Weather_Condition: string (nullable = false)
 |-- Days_to_Harvest: string (nullable = false)
 |-- Yield_tons_per_hectare: double (nullable = false)

+------+---------+-------+-----------+-------------------+---------------+---------------+-----------------+---------------+----------------------+
|Region|Soil_Type|   Crop|Rainfall_mm|Temperature_Celsius|Fertilizer_Used|Irrigation_Used|Weather_Condition|Days_to_Harvest|Yield_tons_per_hectare|
+------+---------+-------+-----------+-------------------+---------------+---------------+-----------------+---------------+----------------------+
| NORTH|     SILT|SOYBEAN|     199.38|  24.40539970164948|          

In [13]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf, col
from pyspark.sql.types import FloatType

# Define categorical columns to encode
categorical_columns = ['Region', 'Soil_Type', 'Crop', 'Weather_Condition', 'Irrigation_Used', 'Fertilizer_Used']

# Step 1: Remove existing index/encoded columns if present
columns_to_remove = [f"{col}_Index" for col in categorical_columns] + [f"{col}_Encoded" for col in categorical_columns]
df = df.drop(*[c for c in columns_to_remove if c in df.columns])

# Step 2: Apply StringIndexer and OneHotEncoder
indexers = [StringIndexer(inputCol=col_name, outputCol=f"{col_name}_Index") for col_name in categorical_columns]
encoders = [OneHotEncoder(inputCol=f"{col_name}_Index", outputCol=f"{col_name}_Encoded", dropLast=False) for col_name in categorical_columns]

# Combine into a pipeline
pipeline = Pipeline(stages=indexers + encoders)

# Fit and transform the DataFrame
pipeline_model = pipeline.fit(df)
encoded_df = pipeline_model.transform(df)

# Step 3: Convert OneHotEncoded vectors to separate binary columns
def extract_dense_vector(vector, index):
    return float(vector[index]) if vector is not None else 0.0

for col_name in categorical_columns:
    num_categories = encoded_df.select(f"{col_name}_Encoded").first()[0].size  # Get number of categories
    for i in range(num_categories):
        extract_udf = udf(lambda vector: extract_dense_vector(vector, i), FloatType())
        encoded_df = encoded_df.withColumn(f"{col_name}_{i}", extract_udf(col(f"{col_name}_Encoded")))

# Step 4: Drop original, indexed, and encoded columns
columns_to_drop = categorical_columns + [f"{col}_Index" for col in categorical_columns] + [f"{col}_Encoded" for col in categorical_columns]
encoded_df = encoded_df.drop(*columns_to_drop)

# Show the final DataFrame
encoded_df.show(truncate=False)
encoded_df.printSchema()


+-----------+-------------------+---------------+----------------------+--------+--------+--------+--------+-----------+-----------+-----------+-----------+-----------+-----------+------+------+------+------+------+------+-------------------+-------------------+-------------------+-----------------+-----------------+-----------------+-----------------+
|Rainfall_mm|Temperature_Celsius|Days_to_Harvest|Yield_tons_per_hectare|Region_0|Region_1|Region_2|Region_3|Soil_Type_0|Soil_Type_1|Soil_Type_2|Soil_Type_3|Soil_Type_4|Soil_Type_5|Crop_0|Crop_1|Crop_2|Crop_3|Crop_4|Crop_5|Weather_Condition_0|Weather_Condition_1|Weather_Condition_2|Irrigation_Used_0|Irrigation_Used_1|Fertilizer_Used_0|Fertilizer_Used_1|
+-----------+-------------------+---------------+----------------------+--------+--------+--------+--------+-----------+-----------+-----------+-----------+-----------+-----------+------+------+------+------+------+------+-------------------+-------------------+-------------------+--------

In [14]:
# Define the columns to scale
columns_to_convert = [ "Days_to_Harvest", "Yield_tons_per_hectare"]

# Step 1: Replace null values with 0.0 and cast to double
for col_name in columns_to_convert:
    encoded_df = encoded_df.withColumn(col_name,
                                       when(col(col_name).isNull(), 0.0).otherwise(col(col_name).cast("double")))


In [15]:
from pyspark.ml.feature import MinMaxScaler, VectorAssembler
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import col, when

# Define the columns to scale
columns_to_scale = ["Rainfall_mm", "Temperature_Celsius"]

# Step 1: Replace null values with 0.0 and cast to double
for col_name in columns_to_scale:
    encoded_df = encoded_df.withColumn(
        col_name,
        when(col(col_name).isNull(), 0.0).otherwise(col(col_name).cast("double"))
    )

# Verify data types (optional)
encoded_df.printSchema()

# Step 2: Create a VectorAssembler to combine the columns into a single vector
assembler = VectorAssembler(inputCols=columns_to_scale, outputCol="features_to_scale")

# Transform the DataFrame to add the 'features_to_scale' column
df = assembler.transform(encoded_df)

# Step 3: Initialize the MinMaxScaler
scaler = MinMaxScaler(inputCol="features_to_scale", outputCol="scaled_features")

# Step 4: Fit and transform the scaler to scale the features
scaler_model = scaler.fit(df)
scaled_df = scaler_model.transform(df)

# Step 5: Convert the vector column to an array to extract individual elements
scaled_df = scaled_df.withColumn("scaled_features_array", vector_to_array(col("scaled_features")))

# Step 6: Extract the scaled features back into individual columns
for i, col_name in enumerate(columns_to_scale):
    scaled_df = scaled_df.withColumn(col_name + "_scaled", col("scaled_features_array")[i])

# Step 7: Drop intermediate columns (optional)
scaled_df = scaled_df.drop("features_to_scale", "scaled_features", "scaled_features_array")

# Show the resulting DataFrame
scaled_df.show(truncate=False)

# Print schema to verify new scaled columns
scaled_df.printSchema()


root
 |-- Rainfall_mm: double (nullable = true)
 |-- Temperature_Celsius: double (nullable = false)
 |-- Days_to_Harvest: double (nullable = true)
 |-- Yield_tons_per_hectare: double (nullable = false)
 |-- Region_0: float (nullable = true)
 |-- Region_1: float (nullable = true)
 |-- Region_2: float (nullable = true)
 |-- Region_3: float (nullable = true)
 |-- Soil_Type_0: float (nullable = true)
 |-- Soil_Type_1: float (nullable = true)
 |-- Soil_Type_2: float (nullable = true)
 |-- Soil_Type_3: float (nullable = true)
 |-- Soil_Type_4: float (nullable = true)
 |-- Soil_Type_5: float (nullable = true)
 |-- Crop_0: float (nullable = true)
 |-- Crop_1: float (nullable = true)
 |-- Crop_2: float (nullable = true)
 |-- Crop_3: float (nullable = true)
 |-- Crop_4: float (nullable = true)
 |-- Crop_5: float (nullable = true)
 |-- Weather_Condition_0: float (nullable = true)
 |-- Weather_Condition_1: float (nullable = true)
 |-- Weather_Condition_2: float (nullable = true)
 |-- Irrigation_Us

In [16]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

window_spec = Window.partitionBy("Region_0").orderBy(F.desc("Yield_tons_per_hectare"))
df = scaled_df.withColumn("Rank", F.rank().over(window_spec))
df.show()


+-----------+-------------------+---------------+----------------------+--------+--------+--------+--------+-----------+-----------+-----------+-----------+-----------+-----------+------+------+------+------+------+------+-------------------+-------------------+-------------------+-----------------+-----------------+-----------------+-----------------+------------------+--------------------------+----+
|Rainfall_mm|Temperature_Celsius|Days_to_Harvest|Yield_tons_per_hectare|Region_0|Region_1|Region_2|Region_3|Soil_Type_0|Soil_Type_1|Soil_Type_2|Soil_Type_3|Soil_Type_4|Soil_Type_5|Crop_0|Crop_1|Crop_2|Crop_3|Crop_4|Crop_5|Weather_Condition_0|Weather_Condition_1|Weather_Condition_2|Irrigation_Used_0|Irrigation_Used_1|Fertilizer_Used_0|Fertilizer_Used_1|Rainfall_mm_scaled|Temperature_Celsius_scaled|Rank|
+-----------+-------------------+---------------+----------------------+--------+--------+--------+--------+-----------+-----------+-----------+-----------+-----------+-----------+------+-

Part 2

In [17]:
from pyspark.sql.functions import col

# Separate features (X) by dropping the target column
X = scaled_df.drop("Yield_tons_per_hectare")

# Select the target column (Y)
Y = scaled_df.select("Yield_tons_per_hectare")

# Show summary statistics for features
print("Feature Summary:")
X.describe().show()

# Show summary statistics for the target variable
print("Target Summary:")
Y.describe().show()

Feature Summary:
+-------+------------------+-------------------+------------------+------------------+-------------------+------------------+------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+------------------+--------------------------+
|summary|       Rainfall_mm|Temperature_Celsius|   Days_to_Harvest|          Region_0|           Region_1|          Region_2|          Region_3|        Soil_Type_0|        Soil_Type_1|        Soil_Type_2|        Soil_Type_3|        Soil_Type_4|        Soil_Type_5|             Crop_0|            Crop_1|            Crop_2|             Crop_3|             Crop_4|             Crop_5|Weather_Condition_0|Weather_Condit

In [18]:
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.sql.functions import col, when

# Step 1: Ensure all feature columns have valid numeric data
feature_columns = [col for col in scaled_df.columns if col != "Yield_tons_per_hectare"]
for feature in feature_columns:
    scaled_df = scaled_df.withColumn(feature, when(col(feature).isNull(), 0.0).otherwise(col(feature).cast("double")))

# Step 2: Combine features into a single vector column
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
dataset = assembler.transform(scaled_df).select("features", col("Yield_tons_per_hectare").alias("label"))

# Step 3: Split the dataset into training and testing sets
train_df, test_df = dataset.randomSplit([0.8, 0.2], seed=42)

# Step 4: Scale the features
scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)

# Fit the scaler on the training set
scaler_model = scaler.fit(train_df)

# Transform the training and testing datasets
train_df = scaler_model.transform(train_df).select("scaled_features", "label")
test_df = scaler_model.transform(test_df).select("scaled_features", "label")

# Display the processed training dataset
train_df.show(truncate=False)


+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+
|scaled_features                                                                                                                                                                                                                                                                                                                                                                                                                

In [19]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col

# Debug: Check schema of train_df and test_df
print("Train DataFrame Schema:")
train_df.printSchema()
print("Test DataFrame Schema:")
test_df.printSchema()

# Debug: Show a sample of the training and testing data
print("Train DataFrame Sample:")
train_df.show(5, truncate=False)
print("Test DataFrame Sample:")
test_df.show(5, truncate=False)

# Step 4: Train Linear Regression model
linear_reg = LinearRegression(featuresCol="scaled_features", labelCol="label", predictionCol="prediction")
linear_model = linear_reg.fit(train_df)

# Step 5: Make predictions on the test data
predictions = linear_model.transform(test_df)

# Step 6: Evaluate the model
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction")
mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})

# Output results
print(f"Mean Squared Error (MSE): {mse}")
print(f"R Squared (R²): {r2}")


Train DataFrame Schema:
root
 |-- scaled_features: vector (nullable = true)
 |-- label: double (nullable = false)

Test DataFrame Schema:
root
 |-- scaled_features: vector (nullable = true)
 |-- label: double (nullable = false)

Train DataFrame Sample:
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+
|scaled_features                                                                                                                                                         

In [20]:
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Step 1: Initialize the Generalized Linear Regression (GLR) model
glr = GeneralizedLinearRegression(featuresCol="scaled_features",
                                  labelCol="label",
                                  predictionCol="prediction",
                                  family="gaussian",  # You can set this to "poisson", "gamma", etc. based on the target variable's distribution
                                  link="identity")   # Common link functions: "log", "identity", "inverse"

# Step 2: Fit the GLR model on the training dataset
glr_model = glr.fit(train_df)

# Step 3: Make predictions on the test dataset
predictions = glr_model.transform(test_df)

# Step 4: Evaluate the model using MSE and R² metrics
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction")

# Calculate Mean Squared Error (MSE)
mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})

# Calculate R-squared (R²)
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})

# Step 5: Output results
print(f"Mean Squared Error (MSE): {mse}")
print(f"R Squared (R²): {r2}")

# Step 6: Display the model summary
summary = glr_model.summary
print("\nModel Summary:")
print(f"Coefficients: {glr_model.coefficients}")
print(f"Intercept: {glr_model.intercept}")
print(f"Deviance Residuals: {summary.residuals().show(5)}")


Mean Squared Error (MSE): 0.43812148514757643
R Squared (R²): 0.8394501839003956

Model Summary:
Coefficients: [0.6002735511598194,0.06638619731465019,-0.0006526940177461589,0.0012281067410129562,-0.0007245538695789818,-0.00095434489270439,0.000450085611738269,-0.00024573199436658246,-9.76185334030066e-05,-0.000743296011464675,-0.0006922038480599438,0.0014544682059179977,0.00032628804849871366,-0.0004918632391310292,0.0016195688644274637,0.00010838066552501012,-9.520240668075435e-05,-0.00036083316287776235,-0.0007816008717983226,-0.00039287048177802543,0.0009314797181383419,-0.0005387813117502888,-0.28468197497095293,0.28468197497095565,-0.3562991888869114,0.35629918888690754,0.6002735511598145,0.06638619731465112]
Intercept: 4.65107498564581
+-------------------+
|  devianceResiduals|
+-------------------+
| 0.1622250037249362|
|-0.3651342149654542|
|-0.1988389202941978|
|-0.5480463007165359|
|-0.2610726004741608|
+-------------------+
only showing top 5 rows

Deviance Residuals: None

In [21]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Step 4: Train a Decision Tree Regressor model
decision_tree_reg = DecisionTreeRegressor(featuresCol="scaled_features", labelCol="label", predictionCol="prediction")
decision_tree_model = decision_tree_reg.fit(train_df)

# Step 5: Make predictions on the test data
predictions = decision_tree_model.transform(test_df)

# Step 6: Evaluate the model
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction")
mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})

# Output results
print(f"Mean Squared Error (MSE): {mse}")
print(f"R Squared (R²): {r2}")

# (Optional) Display feature importance if needed
print("Feature Importances:", decision_tree_model.featureImportances)


Mean Squared Error (MSE): 0.4808651520078428
R Squared (R²): 0.8237867478752781
Feature Importances: (28,[0,22,23,24,25],[0.6301366933473066,0.11037449288933474,0.03329791724530636,0.1052680483032503,0.12092284821480206])


In [22]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Step 4: Initialize the Random Forest Regressor
random_forest = RandomForestRegressor(featuresCol="scaled_features", labelCol="label", predictionCol="prediction",
                                      numTrees=20, maxDepth=5, seed=42)  # You can tune these parameters as needed

# Step 5: Train the Random Forest Regressor
rf_model = random_forest.fit(train_df)

# Step 6: Make predictions on the test data
predictions = rf_model.transform(test_df)

# Step 7: Evaluate the model
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction")

# Calculate Mean Squared Error (MSE) and R-Squared (R²)
mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})

# Output results
print(f"Mean Squared Error (MSE): {mse}")
print(f"R Squared (R²): {r2}")


Mean Squared Error (MSE): 0.4927737654977338
R Squared (R²): 0.819422831083662


In [23]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Step 4: Train a Gradient-Boosted Tree Regressor model
gbt_regressor = GBTRegressor(featuresCol="scaled_features", labelCol="label", predictionCol="prediction", maxIter=100)
gbt_model = gbt_regressor.fit(train_df)

# Step 5: Make predictions on the test data
predictions = gbt_model.transform(test_df)

# Step 6: Evaluate the model
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction")
mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})

# Output results
print(f"Mean Squared Error (MSE): {mse}")
print(f"R Squared (R²): {r2}")


Mean Squared Error (MSE): 0.441874210008113
R Squared (R²): 0.8380749961804219
