<a href="https://www.kaggle.com/code/averma111/pytorch-ps-s3e14?scriptVersionId=128451865" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


### Global Setting

In [None]:
# Plot theme and pandas options shared by every cell below.
sns.set_style(style="darkgrid")
# Turn off pandas' chained-assignment warning.
pd.options.mode.chained_assignment = None

###  Assigning the directory and file paths

In [None]:
# Directory that holds the competition CSV files.
ROOT_PATH='/kaggle/input/playground-series-s3e14'
# File names that are joined onto ROOT_PATH when the data is read.
train_file = 'train.csv'
test_file = 'test.csv'
# NOTE(review): `sample` is not referenced by any cell visible here.
sample = 'sample_submission.csv'
# Full path to a second CSV that is later concatenated with the training frame.
original = '/kaggle/input/wild-blueberry-yield-prediction/Data in Brief/Data in Brief/WildBlueberryPollinationSimulationData.csv'

### Reading the train data

In [None]:
# Read the competition training file and preview its first rows.
train = pd.read_csv(f'{ROOT_PATH}/{train_file}')
train.head()

In [None]:
# The row identifier carries no signal for modelling, so remove it.
train = train.drop(columns=['id'])

In [None]:
# Read the competition test file and preview its first rows.
test = pd.read_csv(f'{ROOT_PATH}/{test_file}')
test.head()

In [None]:
# Read the additional CSV referenced by `original` and preview its first rows.
original_df = pd.read_csv(filepath_or_buffer=original)
original_df.head()

In [None]:
# Remove the row-number column from the additional data.
original_df = original_df.drop(columns=['Row#'])

In [None]:
original_df.head()

In [None]:
train.head()

In [None]:
# Stack the competition rows on top of the additional rows.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# frame keeps its own 0-based labels and the combined index has duplicates.
df_full = pd.concat([train, original_df], ignore_index=True)
# Cast every column that also exists in the additional data to float64 in a
# single call instead of reassigning the columns one by one.
df_full = df_full.astype({col: 'float64' for col in original_df.columns})

### Defining the summary function

In [None]:
def summary(text, df):
    """Print the shape of ``df`` and return a per-column overview table.

    Parameters
    ----------
    text : str
        Label printed in front of the shape.
    df : pandas.DataFrame
        Frame to describe. ``np.isinf`` is applied to the whole frame, so the
        columns are expected to be numeric.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with its dtype, null count, number of
        distinct values, min / median / max / mean / std, the number of
        infinite entries in that column, and the number of duplicated rows
        in the whole frame (the same value on every line).
    """
    print(f'{text} shape: {df.shape}')
    summ = df.dtypes.to_frame('dtypes')
    summ['null'] = df.isnull().sum()
    summ['unique'] = df.nunique()
    summ['min'] = df.min()
    summ['median'] = df.median()
    summ['max'] = df.max()
    summ['mean'] = df.mean()
    summ['std'] = df.std()
    # Count infinities per column. The earlier `.sum().sum()` collapsed the
    # counts to one frame-wide total and wrote that total into every row.
    summ['inf'] = np.isinf(df).sum()
    # A row-level statistic: the scalar is deliberately broadcast to all rows.
    summ['duplicate'] = df.duplicated().sum()
    return summ

### Summary of the training data

In [None]:
summary('train',df_full)


* No null values, so no imputation is needed
* Categorical data ==> No Categorical data
* Data types are all float values excluding the target (integer)
* Data is reasonably small with only 15289 datapoints
* Duplicates: 7 duplicate rows

In [None]:
# Keep the first occurrence of every repeated row and discard the rest.
df_full = df_full.drop_duplicates(keep='first')

In [None]:
summary('full',df_full)

In [None]:
# Add the fruitset * seeds interaction feature to both frames.
for frame in (df_full, test):
    frame["fruit_seed"] = frame["fruitset"].mul(frame["seeds"])

### Distribution of the target label 

* Distribution looks fairly normal, with negative skewness

In [None]:
sns.displot(df_full['yield'])


## Pairplot of the train dataset

### Distribution of Train vs Test data

* The distribution of Test and Train datasets seem to align

In [None]:
# Separate the target from the predictors: pop() returns the column and
# removes it from df_full, so the copy below holds the features only.
y = df_full.pop('yield')
X = df_full.copy()

### Distributions of Train and Test look similar

In [None]:

# One KDE panel per feature, four panels per row, overlaying the training
# features (X) and the test frame.
fig, ax = plt.subplots(int(np.ceil(len(X.columns) / 4)), 4, figsize=(30, 25))
# Flatten the axes grid once so that panel i can be addressed as ax[i].
ax = np.ravel(ax)

for i, col in enumerate(X.columns):
    sns.kdeplot(x=X[col], label='Train', ax=ax[i])
    sns.kdeplot(x=test[col], label='Test', ax=ax[i])

    ax[i].legend()
    # Title each panel with the feature name (previously the literal "col").
    ax[i].set_title(col)

plt.suptitle("Distribution of Train vs Test Dataset", fontsize=30)
plt.tight_layout(pad=3)
plt.show()

### Correlation Matrix

* Dataset looks highly correlated with target field

In [None]:
# Heatmap of the pairwise correlations in the competition training frame.
# The upper triangle, diagonal included, is masked so each pair shows once.
plt.figure(figsize=(25, 12))

corr = train.corr()
upper_triangle = np.triu(np.ones(corr.shape, dtype=bool))

sns.heatmap(
    data=corr,
    mask=upper_triangle,
    annot=True,
    cmap="Spectral",
    vmin=-1,
    vmax=1,
)
plt.title("Correlation of all features and target", fontsize=18)
plt.show()

### Outlier Analysis

In [None]:
# One box plot per feature, laid out four panels per row.
fig, ax = plt.subplots(int(np.ceil(len(X.columns) / 4)), 4, figsize=(30, 15))
ax = np.ravel(ax)

# Pair every feature with its own panel.
for axis, col in zip(ax, X.columns):
    sns.boxplot(x=X[col], color="red", ax=axis)

fig.suptitle("Box plots of all data ", fontsize=20)
plt.tight_layout(pad=3)
plt.show()

### Standardize the numerical features in the dataset

In [None]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler
# Names of the int/float columns of X, in frame order.
numerical_cols = X.select_dtypes(include=['int', 'float']).columns.tolist()
# Fit the standardiser on those columns and keep the scaled values as an array.
scaler = StandardScaler()
X_numerical = scaler.fit_transform(X[numerical_cols].to_numpy())


### Convert target to numpy array

In [None]:
# Work with the raw NumPy values of the target from here on.
y = y.values

### Split the train data into train test datasets for modelling

In [None]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_numerical,
    y,
    test_size=0.2,
    random_state=42,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

### Generic Hyper parameters

In [None]:
# Training hyper-parameters shared by the cells below.
EPOCHS = 100  # number of passes over the training loader
BATCH_SIZE = 32  # samples per mini-batch
LEARNING_RATE = 0.1  # step size handed to the optimizer

### Defining the Dataset and Dataloader classes for test and train

In [None]:
# Define the DataLoader for train and test data
from torch.utils.data import Dataset, DataLoader
# Train Data
class TrainData(Dataset):
    """Dataset that serves (features, target) pairs from two parallel containers."""

    def __init__(self, X_data, y_data):
        # Both containers are stored as given and are indexed in lockstep.
        self.X_data, self.y_data = X_data, y_data

    def __len__(self):
        """Return the number of samples, taken from the feature container."""
        return len(self.X_data)

    def __getitem__(self, index):
        """Return the features and the target stored at ``index``."""
        features = self.X_data[index]
        target = self.y_data[index]
        return features, target
    
# Test Data   
class TestData(Dataset):
    """Dataset that serves feature rows only (no targets)."""

    def __init__(self, X_data):
        # The container is stored as given.
        self.X_data = X_data

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.X_data)

    def __getitem__(self, index):
        """Return the features stored at ``index``."""
        row = self.X_data[index]
        return row
    

### Converting the dataset into torch tensor format

In [None]:
# Instantiate the Train and Test data class
import torch
# Wrap the NumPy splits as float32 tensors.
# Features and targets are fixed data, not learnable parameters, so they are
# created without requires_grad. Asking autograd to track them only adds a
# gradient buffer for the data tensors and extra work in every backward pass;
# the gradients of the model parameters are the same either way.
train_data = TrainData(torch.tensor(X_train, dtype=torch.float32),
                       torch.tensor(y_train, dtype=torch.float32))
test_data = TestData(torch.tensor(X_test, dtype=torch.float32))

### Defining the iterator Dataloader

In [None]:
# Initialize the DataLoader 
# Training batches hold BATCH_SIZE samples and are reshuffled every epoch.
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
# The hold-out loader yields one sample per batch, without shuffling.
test_loader = DataLoader(dataset=test_data, batch_size=1)

### Defining the pytorch regression class

In [None]:
import torch

class RegressionBlueBerryNNet(torch.nn.Module):
    """Single linear layer followed by a ReLU, giving one value per input row."""

    def __init__(self,input_features):
        super().__init__()
        # Maps the `input_features` values of a row to a single output.
        self.input_layer = torch.nn.Linear(in_features=input_features, out_features=1)
        # Applied to the linear output in forward(), so predictions are never negative.
        self.relu = torch.nn.ReLU()

    def forward(self,inputs):
        """Return ``relu(input_layer(inputs))``."""
        projected = self.input_layer(inputs)
        return self.relu(projected)

### Defining the code to run both on CPU and GPU

In [None]:
# Use the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

### Instantiating the model

In [None]:
# Size the network from the number of feature columns, then move it to the
# selected device and show its layers.
model = RegressionBlueBerryNNet(input_features=X_train.shape[1])
model = model.to(device)
print(model)

### Defining the Loss and Optimizer 

In [None]:
# Define the loss and optimizer 
# Mean absolute error as the training objective.
criterion = torch.nn.L1Loss()
# Stochastic gradient descent with momentum over all model parameters.
optimizer = torch.optim.SGD(
    params=model.parameters(),
    lr=LEARNING_RATE,
    momentum=0.9,
)

### Training the Model :)

In [None]:
# Train the model
from tqdm.notebook import tqdm

model.train()  # put the module in training mode before optimising
for e in tqdm(range(1, EPOCHS+1)):
    # Running sum of the per-batch losses for this epoch.
    epoch_loss = 0.0

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        # Clear the gradients left over from the previous step.
        optimizer.zero_grad()
        y_pred = model(X_batch)

        # unsqueeze(1) adds a trailing axis to the targets so that they line
        # up with the one-column output of the model.
        loss = criterion(y_pred, y_batch.unsqueeze(1))

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Report the mean batch loss every tenth epoch.
    if e % 10 == 0:
        print(f'Epoch {e:03}: | Loss: {epoch_loss/len(train_loader):.5f}')
        

In [None]:
# Evaluate the model

# Collect the rounded predictions for every hold-out batch.
y_pred_list = []
model.eval()  # inference mode for the module
with torch.no_grad():  # no gradient tracking is needed while predicting
    for X_batch in test_loader:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.cpu().numpy())

# Stack the per-batch arrays and flatten them into one list of floats.
# Unlike squeeze() on each batch, this gives a flat list for any batch size,
# not only when the loader yields a single sample per batch.
y_pred_list = np.concatenate(y_pred_list).ravel().tolist()
#y_pred_list

In [None]:
from sklearn.metrics import mean_absolute_error
# NOTE: despite its name, `accuracy` holds the mean absolute error between the
# hold-out targets and the predictions, so lower is better.
accuracy=mean_absolute_error(y_test, y_pred_list)
print(accuracy)

### Executing model on test data

In [None]:
test.head()

### Checking the summary of the test data

In [None]:
summary('test',test)

### Creating the Test variable 

In [None]:
# Every test column except the row identifier.
X_val = test.drop(columns=['id'], errors='ignore')

### Filtering the continuous columns

In [None]:
# Names of the int/float columns of the test features, in frame order.
numerical_cols_test = X_val.select_dtypes(include=['int', 'float']).columns.tolist()

### Standardizing the test data as train data

In [None]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler
# Reuse the scaler that was fitted on the training features instead of fitting
# a new one on the test set. The model was trained on inputs standardised with
# the training mean and variance; a scaler refitted on the test data would put
# the test features on a different scale from the one the weights were learned on.
# Selecting `numerical_cols` (built from the training frame) keeps the columns
# in the order the scaler was fitted with.
X_numerical_test = scaler.transform(X_val[numerical_cols].values)

### Defining the Test Dataset class

In [None]:
# Test Data   
class TestingData(Dataset):
    """Dataset that serves the feature rows used to build the submission."""

    def __init__(self, X_test_data):
        # The container is stored as given.
        self.X_test_data = X_test_data

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.X_test_data)

    def __getitem__(self, index):
        """Return the features stored at ``index``."""
        row = self.X_test_data[index]
        return row

### Creating the DataLoader class

In [None]:
# Wrap the scaled test features as a float32 tensor dataset and batch it.
test_data_model = TestingData(
    X_test_data=torch.tensor(X_numerical_test, dtype=torch.float32)
)
test_data_model_loader = DataLoader(test_data_model, batch_size=BATCH_SIZE)

### Generating the yield values for test data

In [None]:
# Validating the model on test data
# Collect the rounded predictions for every batch of the competition test set.
yield_target = []
model.eval()  # inference mode for the module
with torch.no_grad():  # no gradient tracking is needed while predicting
    for X_batch_test in test_data_model_loader:
        X_batch_test = X_batch_test.to(device)
        y_test_pred = model(X_batch_test)
        y_pred_tag = torch.round(y_test_pred)
        yield_target.append(y_pred_tag.cpu().numpy())

# Turn each batch into a flat Python list. reshape(-1) always yields a 1-D
# array, whereas squeeze() collapses a final batch of exactly one sample to a
# scalar, which the flattening loop in the next cell cannot iterate over.
yield_target = [a.reshape(-1).tolist() for a in yield_target]

### Creating the submission file

In [None]:
# Flatten the per-batch prediction lists into one list, in test-row order.
yield_submission = [val for col in yield_target for val in col]
# Pair every prediction with its test id and write the submission file.
df_test = pd.DataFrame(data={'id': test['id'], 'yield': yield_submission})
df_test.to_csv('submission.csv', index=False)

In [None]:
# Candidate columns to drop: 'MinOfUpperTRange', 'AverageOfUpperTRange', 'AverageOfLowerTRange', 'MaxOfUpperTRange', 'MaxOfLowerTRange'