<a href="https://www.kaggle.com/code/averma111/pytorch-ps-s3e14?scriptVersionId=129403866" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import torch
from tqdm.notebook import tqdm
import torch.nn.functional as F
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
warnings.filterwarnings('ignore')
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

sns.set_style("darkgrid")
pd.set_option('mode.chained_assignment',None)


## Assigning the directory and file paths

In [None]:
# Input CSV locations under /kaggle/input: the playground-series-s3e14 train and
# test files, plus the wild-blueberry-yield-prediction simulation data.
train_file = '/kaggle/input/playground-series-s3e14/train.csv'
test_file = '/kaggle/input/playground-series-s3e14/test.csv'
original = '/kaggle/input/wild-blueberry-yield-prediction/Data in Brief/Data in Brief/WildBlueberryPollinationSimulationData.csv'

## Reading the datasets

In [None]:
def get_datasets(filename):
    """Read the CSV file at ``filename`` and return its contents as a DataFrame."""
    return pd.read_csv(filename)

In [None]:
get_datasets(train_file).head()

In [None]:
get_datasets(test_file).head()

In [None]:
get_datasets(original).head()

## Drop unwanted columns

In [None]:
def drop_columns(col_name, dataframe):
    """Remove ``col_name`` from ``dataframe`` in place.

    The frame passed in is modified directly; the same object is returned so
    the call can be used inline in an assignment.
    """
    dataframe.drop(columns=col_name, inplace=True)
    return dataframe

# Load the training CSV and drop its 'id' column.
train = drop_columns('id',get_datasets(train_file))
# Load the original CSV and drop its 'Row#' column.
# NOTE: this rebinds `original` from the file path (str) to the loaded
# DataFrame, so re-running this line would pass a DataFrame to get_datasets().
original = drop_columns('Row#',get_datasets(original))

## Concatenate the original and train datasets

In [None]:
def concat_dataframe(df1, df2):
    """Stack ``df2`` below ``df1`` and return the combined DataFrame.

    Row labels of both inputs are kept as they are (no re-indexing).
    """
    frames = [df1, df2]
    stacked = pd.concat(frames)
    return stacked

df_full = concat_dataframe(train, original)

## Defining the summary function

In [None]:
def summary(text, df):
    """Print the shape of ``df`` and return a per-column summary table.

    Parameters:
        text: label printed in front of the shape.
        df: DataFrame to describe.

    Returns:
        DataFrame indexed by the columns of ``df`` with the dtype, null count,
        unique count, min/median/max/mean/std and the number of infinite
        values of each column, plus a ``duplicate`` column holding the number
        of duplicated rows in the whole frame (the same value on every row).
    """
    print(f'{text} shape: {df.shape}')
    summ = pd.DataFrame(df.dtypes, columns=['dtypes'])
    summ['null'] = df.isnull().sum()
    summ['unique'] = df.nunique()
    summ['min'] = df.min()
    summ['median'] = df.median()
    summ['max'] = df.max()
    summ['mean'] = df.mean()
    summ['std'] = df.std()
    # Count infinities per column; a single .sum() keeps the column index so
    # each row of the summary reports its own column rather than the grand total.
    summ['inf'] = np.isinf(df).sum()
    # Duplicates are a row-level property, so this is one scalar for the frame.
    summ['duplicate'] = df.duplicated().sum()
    return summ

## Summary of the full training data

In [None]:
summary('full_dataset',df_full)


* No null values, so we don't need to use imputation
* Categorical data ==> No Categorical data
* Data types are all float values excluding the target (integer)
* Data is reasonably small with only 15289 datapoints
* Duplicates: 7 duplicate rows

## Drop duplicates from dataframe

In [None]:
def drop_dups(df):
    """Return a copy of ``df`` with fully duplicated rows removed."""
    deduplicated = df.drop_duplicates()
    return deduplicated
df_full = drop_dups(df_full)

## Feature Engineering

In [None]:
def generate_features(df):
    """Add three derived columns to ``df`` in place and return it.

    Columns added:
        fruit_seed: ``fruitset`` multiplied by ``seeds``.
        insects: sum of ``honeybee``, ``bumbles``, ``andrena`` and ``osmia``.
        AverageTRange: mean of ``AverageOfUpperTRange`` and
            ``AverageOfLowerTRange``.
    """
    upper_range = df["AverageOfUpperTRange"]
    lower_range = df["AverageOfLowerTRange"]
    pollinators = df['honeybee'] + df['bumbles'] + df['andrena'] + df['osmia']

    df["fruit_seed"] = df["fruitset"] * df["seeds"]
    df['insects'] = pollinators
    df["AverageTRange"] = (upper_range + lower_range) / 2
    return df



df_full=generate_features(df_full)
test = generate_features(get_datasets(test_file))

## Distribution plot  of the full dataset

* Distribution looks fairly normal with negative skewness

In [None]:
sns.kdeplot(df_full,x='yield',color='r')


## Distribution of Train vs Test data

* The distribution of Test and Train datasets seem to align

In [None]:
def generate_features_labels(df, target_name):
    """Split ``df`` into model inputs (and labels, for training data).

    * ``target_name == 'yield'``: returns ``(features, label)``. The target
      column is dropped from ``df`` itself via ``drop_columns``, so the
      returned features are the same object as ``df``.
    * ``target_name == 'test'``: returns every column except ``id``.
    * Any other value: returns ``None``.
    """
    if target_name == 'test':
        keep = df.columns != 'id'
        return df.loc[:, keep]
    if target_name == 'yield':
        # Grab the label before the column is removed from the frame.
        label = df[target_name]
        return drop_columns(target_name, df), label

X,y = generate_features_labels(df_full,'yield')


## Distributions of Train and Test look similar

In [None]:
def plot_kde_train_test(features,test_df):
    """Overlay train and test KDE curves for every column of ``features``.

    Parameters:
        features: training feature DataFrame; one subplot is drawn per column.
        test_df: test DataFrame; must contain the same column names.

    The subplots are arranged in a grid four columns wide and each one is
    titled with the name of the feature it shows.
    """
    n_rows = int(np.ceil(len(features.columns)/4))
    fig, ax = plt.subplots(n_rows, 4, figsize=(30, 25))
    # Flatten once so axes can be addressed by the feature's position.
    ax = np.ravel(ax)
    for i, col in enumerate(features.columns):
        sns.kdeplot(x=features[col], label='Train', ax=ax[i], color='r')
        sns.kdeplot(x=test_df[col], label='Test', ax=ax[i], color='b')
        ax[i].legend()
        # Interpolate the column name (previously the literal text "col").
        ax[i].set_title(f"{col}")

    plt.suptitle("Distribution of Train vs Test Dataset",fontsize = 30)
    plt.tight_layout(pad =3)
    plt.show()
    
plot_kde_train_test(X,test)

## Correlation Matrix

* Dataset looks highly correlated with target field

In [None]:
def plot_correlation_dataset(df):
    """Draw an annotated heatmap of the pairwise correlations in ``df``.

    The upper triangle (diagonal included) is masked so each pair appears once.
    """
    plt.figure(figsize=(25, 12))
    corr = df.corr()
    upper_triangle = np.triu(np.ones_like(corr, dtype=bool))
    heatmap_options = {
        "vmin": -1,
        "vmax": 1,
        "cmap": "rocket",
        "annot": True,
        "mask": upper_triangle,
    }
    sns.heatmap(corr, **heatmap_options)
    plt.title("Correlation of all features and target", fontsize=18)
    plt.show()
    
plot_correlation_dataset(X)

## Outlier Analysis

In [None]:
def show_outlier(features):
    """Draw one box plot per column of ``features`` in a four-wide grid.

    Parameters:
        features: DataFrame whose columns are plotted. The argument itself is
            used (the function no longer reaches for the global ``X``).
    """
    columns = features.columns
    fig, ax = plt.subplots(int(np.ceil(len(columns)/4)), 4, figsize=(30, 15))
    ax = np.ravel(ax)
    for i, col in enumerate(columns):
        sns.boxplot(ax=ax[i], x=features[col], color="red")
    fig.suptitle("Box plots of all data ",fontsize = 20)
    plt.tight_layout(pad=3)
    plt.show()

show_outlier(X)

## Preprocessing of the datasets

In [None]:
def preprocessing(features,label=None,scaler=None):
    """Standardise the numeric columns of ``features`` and optionally split.

    Parameters:
        features: DataFrame of model inputs; only int/float columns are used.
        label: optional Series of targets. When given, the scaled features and
            the labels are split 80/20 with ``random_state=42``.
        scaler: optional ``StandardScaler`` to reuse. With ``label`` given it
            is fitted on ``features`` (so the caller keeps the fitted object);
            without ``label`` it is only applied, which lets inference data be
            scaled with the statistics learned from the training data. When
            omitted, a new scaler is fitted on ``features`` as before.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` when ``label`` is given,
        otherwise the scaled feature array.
    """
    num_cols = list(features.select_dtypes(include=['int','float']))
    values = features[num_cols].values
    if scaler is None:
        # Original behaviour: a throw-away scaler fitted on this data.
        scaled = StandardScaler().fit_transform(values)
    elif label is not None:
        scaled = scaler.fit_transform(values)
    else:
        # Inference path: reuse already-learned statistics, do not refit.
        scaled = scaler.transform(values)

    if label is not None:
        return train_test_split(scaled,label.to_numpy(),test_size=0.2,random_state=42)
    return scaled

# Scale the features and split features/labels into train and hold-out parts
# (preprocessing uses test_size=0.2 and random_state=42), then show the shapes.
X_train,X_test,y_train,y_test = preprocessing(X,y)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

## Converting dataframe to tensors

In [None]:
def convert_to_torch(value, requires_grad=False):
    """Copy ``value`` into a new ``float32`` tensor.

    Parameters:
        value: array-like data (e.g. a NumPy array or nested list).
        requires_grad: whether autograd should track the tensor. Defaults to
            False: inputs and labels are data, not trainable parameters, and
            tracking them makes every indexing and batching operation record
            a graph that is never used.

    Returns:
        A ``torch.float32`` tensor holding a copy of ``value``.
    """
    return torch.tensor(data=value,dtype=torch.float32,requires_grad=requires_grad)

## Data class for datasets

In [None]:
class Data(Dataset):
    """Dataset pairing each feature row with the label at the same index."""

    def __init__(self, X_data, y_data):
        self.X_data, self.y_data = X_data, y_data

    def __len__(self):
        # The number of samples is defined by the feature container.
        return len(self.X_data)

    def __getitem__(self, index):
        sample = self.X_data[index]
        target = self.y_data[index]
        return sample, target

# Convert the train split to float32 tensors under new names.
X_data = convert_to_torch(X_train)
y_data = convert_to_torch(y_train)
# The hold-out split is converted too; X_test / y_test are rebound from the
# arrays returned by preprocessing() to tensors.
X_test = convert_to_torch(X_test)
y_test = convert_to_torch(y_test)
# Wrap each split as a (features, label) Dataset.
train_data = Data(X_data,y_data)
test_data = Data(X_test,y_test)

## Creating the baseline model

In [None]:
class RegressionBaseModel(torch.nn.Module):
    """Base class providing L1-loss train/test steps and epoch reporting.

    Subclasses implement ``forward``; it is expected to return one prediction
    per sample with a trailing dimension of size 1, since labels are compared
    after ``unsqueeze(1)``.
    """

    def _batch_on_model_device(self, batch):
        """Unpack ``batch`` and move it to the device holding the parameters."""
        features, labels = batch
        first_param = next(self.parameters(), None)
        if first_param is not None:
            # Without this a model moved to the GPU fails on batches coming
            # from the DataLoader, which are never moved explicitly.
            features = features.to(first_param.device)
            labels = labels.to(first_param.device)
        return features, labels

    def _l1_batch_loss(self, batch):
        """Return the mean absolute error of the model on one batch."""
        features, labels = self._batch_on_model_device(batch)
        out = self(features)
        return F.l1_loss(out, labels.unsqueeze(1))

    def training_step(self,batch):
        """Return the differentiable L1 loss for ``batch``."""
        return self._l1_batch_loss(batch)

    def test_step(self, batch):
        """Return ``{'test_loss': loss}`` for ``batch``, detached from the graph."""
        return {'test_loss': self._l1_batch_loss(batch).detach()}

    def test_epoch_end(self, outputs):
        """Average the per-batch ``test_loss`` values into one Python float."""
        batch_losses = [x['test_loss'] for x in outputs]
        epoch_loss = torch.stack(batch_losses).mean()
        return {'test_loss': epoch_loss.item()}

    def epoch_end(self, epoch, result):
        """Print the losses after every tenth epoch.

        ``epoch`` is zero-based; the printed number is one-based, so the line
        labelled "Epoch [10]" reports the tenth completed epoch (previously
        the first epoch was reported under that label).
        """
        epoch_number = epoch + 1
        if epoch_number % 10 == 0:
            print("Epoch [{}], train_loss: {:.5f}, test_loss: {:.5f}".format(
                epoch_number, result['train_loss'], result['test_loss']))


## Pytorch Model

In [None]:
class RegressionBlueBerryNNet(RegressionBaseModel):
    """Single linear layer followed by a ReLU, producing one value per sample."""

    def __init__(self, input_features):
        super().__init__()
        layers = [
            torch.nn.Linear(input_features, 1),
            torch.nn.ReLU(),
        ]
        self.network = torch.nn.Sequential(*layers)

    def forward(self, inputs):
        prediction = self.network(inputs)
        return prediction

## Defining the code to run both on CPU and GPU

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

## Instantiating the model

In [None]:
model = RegressionBlueBerryNNet(X_train.shape[1])
model.to(device)

## Hyperparameter settings

In [None]:
# Batch size handed to every DataLoader built in this notebook.
BATCH_SIZE = 32
# Number of passes over the training data and the learning rate, both passed to fit().
EPOCHS = 100
LEARNING_RATE = 0.1
# Read as a global inside fit() and passed as the optimizer's third positional argument.
MOMENTUM = 0.9
# Optimizer class that fit() instantiates.
OPT_FUNC= torch.optim.SGD


## Creating the Dataloader for train and test 

In [None]:
def get_dataloaders(dataset_type, batch, shuffle):
    """Wrap ``dataset_type`` in a DataLoader.

    Parameters:
        dataset_type: the dataset to iterate over.
        batch: batch size.
        shuffle: any truthy value enables shuffling, any falsy value disables it.
    """
    return DataLoader(dataset=dataset_type, batch_size=batch, shuffle=bool(shuffle))

## Training the Model :)

In [None]:
@torch.no_grad()
def evaluate(model, test_loader):
    model.eval()
    outputs = [model.test_step(batch) for batch in test_loader]
    return model.test_epoch_end(outputs)

  
def fit(epochs, lr, model, train_loader, test_loader, opt_func):
    """Train ``model`` and evaluate it after every epoch.

    Parameters:
        epochs: number of passes over ``train_loader``.
        lr: learning rate passed to the optimizer.
        model: module exposing ``training_step`` and ``epoch_end`` (and the
            methods ``evaluate`` calls).
        train_loader: iterable of training batches.
        test_loader: iterable of evaluation batches.
        opt_func: optimizer class, called as
            ``opt_func(params, lr, MOMENTUM)`` with the module-level
            ``MOMENTUM`` as third positional argument.

    Returns:
        A list with one dict per epoch containing ``test_loss`` and
        ``train_loss`` as Python floats.
    """
    history = []
    optimizer = opt_func(model.parameters(),lr,MOMENTUM)
    for epoch in tqdm(range(epochs)):
        # evaluate() leaves the model in eval mode, so training mode has to
        # be restored at the start of every epoch, not just the first one.
        model.train()
        train_losses = []
        for batch in train_loader:
            optimizer.zero_grad()
            loss = model.training_step(batch)
            loss.backward()
            optimizer.step()
            # Keep only the value; holding the attached loss would keep each
            # batch's autograd graph alive until the end of the epoch.
            train_losses.append(loss.detach())

        result = evaluate(model, test_loader)
        result['train_loss'] = torch.stack(train_losses).mean().item()
        model.epoch_end(epoch, result)
        history.append(result)
    print('Training is completed!!')
    return history

## Fitting the model

In [None]:
# Training batches are shuffled; evaluation batches keep their order.
train_dataloader = get_dataloaders(train_data,BATCH_SIZE,True)
test_dataloader = get_dataloaders(test_data,BATCH_SIZE,False)
# Train the model; history holds one {'test_loss', 'train_loss'} dict per epoch.
history = fit(EPOCHS, LEARNING_RATE, model, train_dataloader, test_dataloader,OPT_FUNC)

## Plot Loss vs Epoch Curve

In [None]:
def plot_losses(history):
    """Plot the train and test loss recorded for each epoch in ``history``."""
    train_losses, test_losses = [], []
    for record in history:
        # A missing 'train_loss' becomes None; a missing 'test_loss' raises.
        train_losses.append(record.get('train_loss'))
        test_losses.append(record['test_loss'])

    for series, line_style in ((train_losses, '-bx'), (test_losses, '-rx')):
        plt.plot(series, line_style)
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend(['Training', 'Testing'])
    plt.title('Loss vs. No. of epochs')

plot_losses(history)

## Saving the model to Output Directory

In [None]:
torch.save(model,'/kaggle/working/RegressionBB.pt')

## Summary of the test dataset

In [None]:
summary('test',test)

## Preprocessing the test dataset 

In [None]:
# Keep every test column except 'id' as model input.
X_val=generate_features_labels(test,'test')
# NOTE(review): preprocessing() builds a new StandardScaler on each call, so
# these features are standardised with the test set's own statistics rather
# than the ones used for the training data -- confirm this is intended.
X_num_test=preprocessing(X_val)

## Loading the saved model

In [None]:
# The checkpoint is a fully pickled module written by this notebook (trusted
# input), so weights_only=False is requested explicitly instead of relying on
# the version-dependent default. map_location lets a checkpoint saved on GPU
# be loaded on a CPU-only session.
model_test = torch.load(
    '/kaggle/working/RegressionBB.pt',
    map_location=device,
    weights_only=False,
)
model_test.to(device)

## Defining the Test Dataset class

In [None]:
class Data_Test(Dataset):
    """Feature-only dataset used at inference time (no labels)."""

    def __init__(self, X_test_data):
        self.X_test_data = X_test_data

    def __len__(self):
        return len(self.X_test_data)

    def __getitem__(self, index):
        row = self.X_test_data[index]
        return row

## Generating the yield values for test data

In [None]:
def eval_test_data(model,testing_data_dl):
    """Predict on every batch of ``testing_data_dl`` and round the outputs.

    Parameters:
        model: trained module; switched to eval mode here.
        testing_data_dl: iterable yielding feature batches (tensors).

    Returns:
        A list with one entry per batch; each entry is a flat list of rounded
        predictions. A batch containing a single sample still yields a
        one-element list.
    """
    model.eval()
    # Run on whichever device the model lives on; fall back to the notebook's
    # global ``device`` for a module without parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    batch_predictions = []
    with torch.no_grad():
        for X_batch_test in testing_data_dl:
            y_test_pred = model(X_batch_test.to(target_device))
            y_pred_tag = torch.round(y_test_pred)
            # reshape(-1) instead of squeeze(): squeeze collapses a batch of
            # one to a 0-d array whose tolist() is a bare float, not a list.
            batch_predictions.append(y_pred_tag.cpu().numpy().reshape(-1).tolist())
    return batch_predictions

## Converting test data into Dataloaders 

In [None]:
# Wrap the scaled test features as a label-free dataset of float32 tensors.
testing_data = Data_Test(convert_to_torch(X_num_test))
# No shuffle argument is passed, so the DataLoader's default ordering applies.
testing_data_loader = DataLoader(dataset=testing_data, batch_size=BATCH_SIZE)

## Creating prediction on test data 

In [None]:
def submit_test_data():
    """Return the test-set predictions as one flat list.

    Runs ``eval_test_data`` on the global ``model_test`` and
    ``testing_data_loader`` and concatenates the per-batch results in order.
    """
    per_batch = eval_test_data(model_test, testing_data_loader)
    return [field for elements in per_batch for field in elements]

## Saving the file for evaluation

In [None]:
# Predict on the test set and pair each prediction with its row id.
yhat = submit_test_data()
df_submit = pd.DataFrame(data={'id': test['id'],'yield': yhat})
# Write the submission to disk; previously the frame was built but never saved.
df_submit.to_csv('submission.csv', index=False)
print('Submission Completed!!')