In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'farm-connect-soil-water-content-prediction:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F68435%2F7600466%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240223%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240223T181949Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D62cf01db165247f16cf729fe3faa674266f9233c80fd46af0841da0b93c9171cc6f93511021be3cb2f87835857848e67ed3e3e77bbf031f220255349ff01c0355b4677dfacdc59a8be0bcf7552433b2d3f776bb156a8613d026db4975d3dfee41602b69cf503a2d1925bc1f9ac2106a99e0eaebc2a1b4a6087bc8ae811712a546aa9ff39e3ea8aa106ef22c098f80e292e99131f1b709a5b746643f6267b9d56d4cf6e802ded33a76d5b460ce96942a781587d9cbd2653d89c61d06aa6d34c290d525bf7f0b0c7bf4f3b82295a4a0ce7f5ebf031021e77af92d028c15c2f48aa85da5bcc0244b450aa2312a8577b478110068c5d5225ad4badbc48e282dac604'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Expose the Kaggle input and working directories as ../input and ../working.
# A link that already exists is left untouched.
for link_target, link_name in ((KAGGLE_INPUT_PATH, 'input'), (KAGGLE_WORKING_PATH, 'working')):
  try:
    os.symlink(link_target, os.path.join("..", link_name), target_is_directory=True)
  except FileExistsError:
    pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING and
# unpack the archive into KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a stray ':' in the URL part cannot break unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without it only the byte counter advances.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_length)) if total_length else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # The tar branch reopens the file by name, so buffered bytes must be on disk.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Do not bind the archive to the name `tarfile`: that would replace the
                # module and break the next iteration.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it has to be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
!pip install --upgrade scikit-learn
!pip install pycaret

In [None]:
!pip install --upgrade scipy

In [None]:
from pycaret.regression import *

In [None]:
import sklearn.model_selection
import sklearn.metrics
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

In [None]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# Data Preprocessing

In [None]:
def plot_distribution(df):
    """Plot a histogram with a KDE overlay for every column of ``df``.

    The plots are laid out on a grid three columns wide, one subplot per
    DataFrame column, each titled ``Distribution of <column>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted. Nothing is drawn when it has no columns.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    num_plots = len(df.columns)
    if num_plots == 0:
        # A subplot grid needs at least one row; there is nothing to plot anyway.
        return

    num_rows = (num_plots + 2) // 3  # ceil(num_plots / 3)

    # squeeze=False always yields a 2-D array of axes, even for a single row.
    fig, axes = plt.subplots(num_rows, 3, figsize=(15, 5 * num_rows), squeeze=False)
    flat_axes = axes.ravel()

    for ax, col in zip(flat_axes, df.columns):
        sns.histplot(df[col], kde=True, ax=ax)
        ax.set_title(f'Distribution of {col}')

    # Hide the unused cells of the last row instead of showing empty frames.
    for ax in flat_axes[num_plots:]:
        ax.set_visible(False)

    # Adjust layout to prevent overlap
    plt.tight_layout()
    plt.show()


## Train

In [None]:
# Read every file in the training folder and stack them into one frame.
src = '/kaggle/input/farm-connect-soil-water-content-prediction/train/train'
# os.listdir returns names in arbitrary order; sorting keeps the row order, and
# therefore the seeded train/validation split below, reproducible across runs.
df = pd.concat([pd.read_csv(os.path.join(src, file)) for file in sorted(os.listdir(src))])
df.shape

In [None]:
# Keep only the columns used for modelling: the feature readings plus the
# '23SoilWC ค.ชื้นดิน' column that is predicted later.
train_columns = [
    '1Air Humidity', '1AirTemperature', '1LightHigh', '1LightLow',
    '12WindSpeed', '13WindDirection', '23SoilWC ค.ชื้นดิน', '23SoilTemp.อุณหภูมิดิน',
    '23SoilEC', '23SoilpH', '26N', '26P', '26K', '29SoilTension', 'flow1',
]
df = df[train_columns]

### Outliers

In [None]:
# Drop every row holding at least one value that lies more than three standard
# deviations away from its column mean.
z_scores = df.sub(df.mean()).div(df.std())
outliers = z_scores.gt(3) | z_scores.lt(-3)
keep_rows = ~outliers.any(axis=1)
df = df[keep_rows]
df.shape

### Missing Values

In [None]:
df[['23SoilWC ค.ชื้นดิน']].isnull().sum()

In [None]:
# Rows without a target value cannot be used for training.
df = df.dropna(subset=['23SoilWC ค.ชื้นดิน'])
df.shape

In [None]:
# Fill the remaining gaps with the per-column mean.
# NOTE(review): '13WindDirection' and 'flow1' are one-hot encoded later; confirm
# they have no missing values, since a mean would create a new category.
df_means = df.mean(numeric_only=True)
df = df.fillna(df_means)

In [None]:
df.isnull().sum()

In [None]:
# Rebuild a clean 0..n-1 index after the concatenation and row filtering above.
df = df.reset_index(drop=True)

### EDA_1

In [None]:
plot_distribution(df.drop(['23SoilWC ค.ชื้นดิน'], axis=1))

In [None]:
# Pairwise correlation between the feature columns (target excluded).
feature_corr = df.drop(columns=['23SoilWC ค.ชื้นดิน']).corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(feature_corr, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title('Heatmap of Correlation Matrix')
plt.show()

### Scaler

In [None]:
df.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the continuous feature columns; the target and the two
# categorical columns ('13WindDirection', 'flow1') are left as they are.
scaled_columns = [
    '1Air Humidity', '1AirTemperature', '1LightHigh', '1LightLow', '12WindSpeed',
    '23SoilTemp.อุณหภูมิดิน', '23SoilEC', '23SoilpH', '26N', '26P', '26K', '29SoilTension',
]
scaler = StandardScaler()
df[scaled_columns] = scaler.fit_transform(df[scaled_columns])
df.head()

### Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode 'flow1'; the encoder expects a 2-D column vector.
flow_values = df['flow1'].to_numpy().reshape(-1, 1)
encoding_1 = OneHotEncoder()
dummies_1 = encoding_1.fit_transform(flow_values).toarray()

In [None]:
# Column names for the frame once the two 'flow1' dummy columns are appended.
cols = [*df.columns, 'flow_0', 'flow_ 1']
cols

In [None]:
# Append the 'flow1' dummy columns to the right of the existing columns.
combined_values = np.concatenate([df.to_numpy(), dummies_1], axis=1)
df = pd.DataFrame(combined_values, columns=list(cols))
df.head()

In [None]:
# One-hot encode '13WindDirection' and register the names of its dummy columns.
# NOTE(review): eight names are hard-coded; confirm the column has exactly
# eight distinct values.
wind_values = df['13WindDirection'].to_numpy().reshape(-1, 1)
encoding_2 = OneHotEncoder()
dummies_2 = encoding_2.fit_transform(wind_values).toarray()
cols += ['wd_0', 'wd_1', 'wd_2', 'wd_3', 'wd_4', 'wd_5', 'wd_6', 'wd_7']

In [None]:
# Append the wind-direction dummy columns to the right of the existing columns.
combined_values = np.concatenate([df.to_numpy(), dummies_2], axis=1)
df = pd.DataFrame(combined_values, columns=list(cols))
df.head()

In [None]:
# The raw categorical columns are now represented by their dummy columns.
df = df.drop(columns=["13WindDirection", "flow1"])
df.columns

#### EDA_2

In [None]:
plot_distribution(df.drop(['23SoilWC ค.ชื้นดิน'], axis=1))

### TrainVal Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation; the seed keeps the split repeatable.
target_column = '23SoilWC ค.ชื้นดิน'
features = df.drop(columns=[target_column])
labels = df[[target_column]]
X_train, X_val, y_train, y_val = train_test_split(
    features, labels, test_size=0.1, random_state=888
)
print("The shape of X_train is:", X_train.shape)
print("The shape of X_val is:", X_val.shape)
print("The shape of y_train is:", y_train.shape)
print("The shape of y_val is:", y_val.shape)

## Test

In [None]:
# Read the two test files and stack them in the order set02, set09.
test_paths = [
    f'/kaggle/input/farm-connect-soil-water-content-prediction/test/test/set{set_id}.csv'
    for set_id in ['02', '09']
]
x_test = pd.concat([pd.read_csv(path) for path in test_paths])
x_test.head()

In [None]:
# Keep the same feature columns as the training frame (the test files carry no
# target column in this selection) and rebuild a clean 0..n-1 index.
test_columns = [
    '1Air Humidity', '1AirTemperature', '1LightHigh', '1LightLow',
    '12WindSpeed', '13WindDirection', '23SoilTemp.อุณหภูมิดิน',
    '23SoilEC', '23SoilpH', '26N', '26P', '26K', '29SoilTension', 'flow1',
]
x_test = x_test[test_columns].reset_index(drop=True)

### missing values

In [None]:
# Fill gaps in the test frame with its own per-column means, then confirm that
# nothing is left missing.
test_means = x_test.mean(numeric_only=True)
x_test = x_test.fillna(test_means)
x_test.isnull().sum()

### EDA_1

In [None]:
plot_distribution(x_test)

### Scaler

In [None]:
from sklearn.preprocessing import StandardScaler

# Apply the StandardScaler that was fitted on the training frame. Fitting a new
# scaler on the test set would put the test features on a different scale than
# the one the models were trained on, so the learned split thresholds would no
# longer line up.
scaled_columns = [
    '1Air Humidity', '1AirTemperature', '1LightHigh', '1LightLow', '12WindSpeed',
    '23SoilTemp.อุณหภูมิดิน', '23SoilEC', '23SoilpH', '26N', '26P', '26K', '29SoilTension',
]
x_test[scaled_columns] = scaler.transform(x_test[scaled_columns])
x_test.head()

In [None]:
x_test.reset_index(drop=True, inplace=True)

### Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode 'flow1' of the test frame.
# NOTE(review): the encoder is refitted on the test data; confirm the test set
# holds the same categories as the training set, otherwise the dummy columns
# will not line up with the training columns.
flow_values = x_test['flow1'].to_numpy().reshape(-1, 1)
encoding_1 = OneHotEncoder()
dummies_1 = encoding_1.fit_transform(flow_values).toarray()

In [None]:
# Column names for the test frame once the two 'flow1' dummy columns are appended.
cols = [*x_test.columns, 'flow_0', 'flow_ 1']
cols

In [None]:
# Append the 'flow1' dummy columns to the right of the existing test columns.
combined_values = np.concatenate([x_test.to_numpy(), dummies_1], axis=1)
x_test = pd.DataFrame(combined_values, columns=list(cols))
x_test.head()

In [None]:
# One-hot encode '13WindDirection' of the test frame and register its dummy names.
# NOTE(review): eight names are hard-coded; confirm the test column has exactly
# eight distinct values.
wind_values = x_test['13WindDirection'].to_numpy().reshape(-1, 1)
encoding_2 = OneHotEncoder()
dummies_2 = encoding_2.fit_transform(wind_values).toarray()
cols += ['wd_0', 'wd_1', 'wd_2', 'wd_3', 'wd_4', 'wd_5', 'wd_6', 'wd_7']

In [None]:
# Append the wind-direction dummy columns to the right of the existing test columns.
combined_values = np.concatenate([x_test.to_numpy(), dummies_2], axis=1)
x_test = pd.DataFrame(combined_values, columns=list(cols))
x_test.head()

In [None]:
# The raw categorical columns are now represented by their dummy columns.
x_test = x_test.drop(columns=["13WindDirection", "flow1"])
x_test.columns

#### EDA_2

In [None]:
plot_distribution(x_test)

## data for pycaret

In [None]:
# Reload the raw training files for the PyCaret workflow and keep the modelling
# columns (features plus the '23SoilWC ค.ชื้นดิน' target).
src = '/kaggle/input/farm-connect-soil-water-content-prediction/train/train'
# os.listdir returns names in arbitrary order; sorting keeps the row order
# reproducible across runs and machines.
ogf = pd.concat([pd.read_csv(os.path.join(src, file)) for file in sorted(os.listdir(src))])
ogf = ogf[['1Air Humidity', '1AirTemperature', '1LightHigh',
       '1LightLow',  '12WindSpeed','13WindDirection', '23SoilWC ค.ชื้นดิน', '23SoilTemp.อุณหภูมิดิน',
       '23SoilEC', '23SoilpH', '26N', '26P', '26K', '29SoilTension', 'flow1']]

In [None]:
# Reload the raw test files (set02, then set09) for the PyCaret workflow and
# keep the feature columns with a clean 0..n-1 index.
og_test_paths = [
    f'/kaggle/input/farm-connect-soil-water-content-prediction/test/test/set{set_id}.csv'
    for set_id in ['02', '09']
]
og_test_columns = [
    '1Air Humidity', '1AirTemperature', '1LightHigh', '1LightLow',
    '12WindSpeed', '13WindDirection', '23SoilTemp.อุณหภูมิดิน',
    '23SoilEC', '23SoilpH', '26N', '26P', '26K', '29SoilTension', 'flow1',
]
og_test = pd.concat([pd.read_csv(path) for path in og_test_paths])
og_test = og_test[og_test_columns].reset_index(drop=True)

### outliers

In [None]:
# Drop every row holding at least one value that lies more than three standard
# deviations away from its column mean, then rebuild the index.
z_scores = ogf.sub(ogf.mean()).div(ogf.std())
outliers = z_scores.gt(3) | z_scores.lt(-3)
ogf = ogf[~outliers.any(axis=1)].reset_index(drop=True)
ogf.shape

### missing values

In [None]:
# Set the target aside so train and test features can be imputed together.
target_column = '23SoilWC ค.ชื้นดิน'
target = ogf[target_column]
ogf = ogf.drop(columns=[target_column])
target

In [None]:
# Stack train features on top of test features under one continuous index.
og = pd.concat([ogf, og_test], ignore_index=True)
og

In [None]:
# Fill gaps with per-column means computed over train and test rows together,
# then confirm that nothing is left missing.
means = og.mean(numeric_only=True)
og = og.fillna(means)
og.isnull().sum()

### split data, select feature

In [None]:
# The first len(ogf) rows of the combined frame are the training rows.
ogf = og.iloc[:len(ogf)].reset_index(drop=True)
ogf

In [None]:
# The test rows are the LAST len(og_test) rows of the combined frame, because
# `og` was built as [train rows, test rows]. Slicing from len(og_test) instead
# would start at the wrong offset and mix training rows into the test set.
og_test = og.iloc[len(og) - len(og_test):]
og_test = og_test.reset_index(drop=True)
og_test

In [None]:
# Put the target back next to the imputed features and drop the rows that
# still contain a missing value, then confirm that none remain.
ogf = ogf.assign(**{'23SoilWC ค.ชื้นดิน': target}).dropna()
ogf.isnull().sum()

In [None]:
ogf = ogf.reset_index(drop=True)
ogf.shape

In [None]:
# Final separation of the target from the feature frame.
target_column = '23SoilWC ค.ชื้นดิน'
target = ogf[target_column]
ogf = ogf.drop(columns=[target_column])
target

# Tree model



In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

# Base estimators for the recursive feature elimination below. They are seeded
# with the same value used for the final models so that the selected feature
# set is the same on every run.
regr = RandomForestRegressor(random_state=888)
etr = ExtraTreesRegressor(random_state=888)

## Feature Selection

In [None]:
# Cross-validated recursive feature elimination with the random forest:
# 3 folds, five features removed per step. Shows the columns that are kept.
reg_mod = RFECV(estimator=regr, step=5, cv=3)
reg_mod.fit(X_train, y_train)
X_train.columns[reg_mod.support_]

In [None]:
# Same feature elimination with the extra-trees regressor:
# 3 folds, five features removed per step. Shows the columns that are kept.
reg_mod = RFECV(estimator=etr, step=5, cv=3)
reg_mod.fit(X_train, y_train)
X_train.columns[reg_mod.support_]

## training

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_absolute_error

# Extra-trees model trained on the five hand-picked features only,
# scored on the validation split.
selected_features = ['23SoilEC', '23SoilpH', '26N', '26P', '29SoilTension']
etr = ExtraTreesRegressor(n_estimators=300, random_state=888)
etr.fit(X_train[selected_features], y_train)
y_etr = etr.predict(X_val[selected_features])
print("Mean Absolute Error (MAE):", mean_absolute_error(y_val, y_etr))

In [None]:
# Extra-trees model trained on every feature column, scored on the validation
# split. This is the fitted `etr` used for the test predictions further down.
etr = ExtraTreesRegressor(n_estimators=300, random_state=888).fit(X_train, y_train)
y_etr = etr.predict(X_val)
print("Mean Absolute Error (MAE):", mean_absolute_error(y_val, y_etr))

In [None]:
# Random forest wrapped in RFE, trained on the five hand-picked features only.
# RFE is asked to keep five features, which is all of the columns it is given.
selected_features = ['23SoilEC', '23SoilpH', '26N', '26P', '29SoilTension']
regr = RFE(
    estimator=RandomForestRegressor(n_estimators=300, random_state=888),
    n_features_to_select=5,
    step=5,
)
regr.fit(X_train[selected_features], y_train)
y_regr = regr.predict(X_val[selected_features])
print("Mean Absolute Error (MAE):", mean_absolute_error(y_val, y_regr))

In [None]:
# Random forest trained on every feature column, scored on the validation
# split. This is the fitted `regr` used for the test predictions further down.
regr = RandomForestRegressor(n_estimators=300, random_state=888).fit(X_train, y_train)
y_regr = regr.predict(X_val)
print("Mean Absolute Error (MAE):", mean_absolute_error(y_val, y_regr))

In [None]:
y_pred_e = etr.predict(x_test)
y_pred_e

In [None]:
y_pred_r = regr.predict(x_test)
y_pred_r

## Submission

In [None]:
submission = pd.read_csv('/kaggle/input/farm-connect-soil-water-content-prediction/sample_submission.csv')
submission.head()

In [None]:
# Write the extra-trees predictions into the submission frame with one
# positional assignment. Chained indexing (frame[col][1:] = ...) may write to a
# temporary copy and is not supported under pandas Copy-on-Write.
# NOTE(review): row 0 keeps the value from sample_submission.csv, as in the
# original code; confirm that skipping the first prediction is intended.
target_column = '23SoilWC ค.ชื้นดิน'
submission.iloc[1:, submission.columns.get_loc(target_column)] = y_pred_e[1:]
submission

In [None]:
submission.to_csv('etr_superprocess.csv', index=False)

# Pycaret

## train

In [None]:
from pycaret.regression import *

# Initialise the PyCaret regression experiment on the five selected features,
# with a 90/10 train/test split and a fixed session seed.
selected_features = ['23SoilEC', '23SoilpH', '26N', '26P', '29SoilTension']
reg = setup(
    data=ogf[selected_features],
    target=target,
    train_size=0.9,
    session_id=999,
)

In [None]:
best_model = compare_models()

## stacking

In [None]:
# Build the three base models (extra trees, random forest, XGBoost) and pair
# each with its identifier for use as stacking estimators.
model_ids = ('et', 'rf', 'xgboost')
t_1, t_2, t_3 = (create_model(model_id) for model_id in model_ids)

estimators = list(zip(model_ids, (t_1, t_2, t_3)))

In [None]:
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import train_test_split

# Stack the three base models with scikit-learn's default final estimator.
# Note that this rebinds `reg` and the X_train/X_val/y_train/y_val names that
# were used earlier in the notebook.
reg = StackingRegressor(estimators=estimators)

X_train, X_val, y_train, y_val = train_test_split(
    ogf,
    target,
    random_state=42,
)

In [None]:
reg.fit(X_train, y_train).score(X_val, y_val)

In [None]:
reg.predict(og_test)

## LinearReg

In [None]:
lng = create_model('lr')

In [None]:
predict_model(lng, data=og_test)

## extratree

In [None]:
tune = create_model('et')

In [None]:
predictions = predict_model(tune, data=og_test)
predictions

### submit and compare

In [None]:
# Start from the sample submission and write the PyCaret predictions into it
# with one positional assignment. Chained indexing (frame[col][1:] = ...) may
# write to a temporary copy and is not supported under pandas Copy-on-Write.
# NOTE(review): row 0 keeps the value from sample_submission.csv, as in the
# original code; confirm that skipping the first prediction is intended.
submission = pd.read_csv('/kaggle/input/farm-connect-soil-water-content-prediction/sample_submission.csv')
target_column = '23SoilWC ค.ชื้นดิน'
# to_numpy() keeps the assignment positional, matching the original behaviour.
submission.iloc[1:, submission.columns.get_loc(target_column)] = (
    predictions['prediction_label'].to_numpy()[1:]
)
submission

In [None]:
from sklearn.metrics import mean_absolute_error

# Compare the new submission with an earlier one: the mean absolute difference
# between the two prediction columns.
# NOTE(review): this file is read from an absolute /content path and is not
# written anywhere in this notebook; the cell fails unless it is supplied by hand.
df_old = pd.read_csv('/content/ExtraTreesRegressor_Outlier_estimators300_Fx888.csv')
mean_absolute_error(df_old['23SoilWC ค.ชื้นดิน'], submission['23SoilWC ค.ชื้นดิน'])

In [None]:
submission.to_csv('pycaret_eklao.csv', index=False)