In [None]:
import random
import warnings

import matplotlib.pyplot as plt
import missingno as mn
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from torch import optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Single seed shared by every random number generator the notebook imports.
RANDOM_SEED = 42

# Seed each library's global generator so "Restart Kernel -> Run All" is repeatable.
# `random` is imported above, so it is seeded alongside NumPy and PyTorch.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from pandas / seaborn / scikit-learn.
warnings.filterwarnings('ignore')

In [None]:
# Read the three competition files from the working directory, in this order:
# training rows, test rows, and the sample submission (its ID column is reused
# when the submission file is written).
train, test, ss = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'SampleSubmission.csv')
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Preview the sample submission to see the expected output layout.
ss.head()

In [None]:
# Column dtypes and non-null counts for the training frame.
train.info()

In [None]:
# Column dtypes and non-null counts for the test frame.
test.info()

In [None]:
# Missing-value matrix (missingno) for the training frame.
mn.matrix(train)

In [None]:
# Remove the `ID` column from both frames before modelling; the submission
# takes its IDs from `ss` instead.
train = train.drop(columns='ID')
test = test.drop(columns='ID')

In [None]:
# Summary statistics for the columns from Sensor1_PM2.5 through Offset_fault
# (one row per column), with a bar on the mean and colour gradients on the
# standard deviation and the median.
(
    train.loc[:, 'Sensor1_PM2.5':'Offset_fault']
    .describe()
    .T
    .style
    .bar(subset=['mean'], color='#206ff2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='coolwarm')
)

In [None]:
# Class balance of the target column.
# The series is passed as `x=`: recent seaborn releases accept only `data`
# positionally, so a bare positional series is no longer read as the x variable.
sns.countplot(x=train.Offset_fault)

In [None]:
def converte_dates(df):
    """Parse ``Datetime`` and add calendar-part columns derived from it.

    The frame is modified in place and also returned. ``Datetime`` is converted
    with ``pd.to_datetime``; the added columns are ``Datetime_day``,
    ``Datetime_month``, ``Datetime_year``, ``Datetime_hour``,
    ``Datetime_minute`` and ``Datetime_dayofweek`` (``.dt.weekday``).
    """
    df['Datetime'] = pd.to_datetime(df['Datetime'])
    stamp = df['Datetime'].dt

    # Insertion order of this mapping is the order the columns are appended in.
    calendar_parts = {
        'Datetime_day': stamp.day,
        'Datetime_month': stamp.month,
        'Datetime_year': stamp.year,
        'Datetime_hour': stamp.hour,
        'Datetime_minute': stamp.minute,
        'Datetime_dayofweek': stamp.weekday,
    }
    for column_name, values in calendar_parts.items():
        df[column_name] = values

    return df


# Add the calendar-part columns to both frames (train first, then test).
train, test = converte_dates(train), converte_dates(test)

In [None]:
def more_features(df):
    """Add boolean part-of-day flags computed from ``Datetime_hour``.

    The frame is modified in place and also returned. Windows:
    morning 6 <= h < 12, afternoon 12 <= h < 18, evening 18 <= h <= 23,
    night 0 <= h < 6.
    """
    hour = df['Datetime_hour']
    df['is_morning'] = hour.ge(6) & hour.lt(12)
    df['is_afternoon'] = hour.ge(12) & hour.lt(18)
    df['is_evening'] = hour.ge(18) & hour.le(23)
    df['is_night'] = hour.ge(0) & hour.lt(6)
    return df


# Add the part-of-day flags to both frames (train first, then test).
train, test = more_features(train), more_features(test)

In [None]:
# Pairwise correlations between the numeric and boolean columns.
# `numeric_only=True` leaves out the raw `Datetime` column, which is still in
# the frame at this point; pandas >= 2.0 no longer drops non-numeric columns
# silently and would fail on it.
corr = train.corr(numeric_only=True)
corr.style.background_gradient()

In [None]:
# Check the training frame after the date and part-of-day columns were added.
train.head()

In [None]:
# (rows, columns) of the training and test frames.
train.shape,test.shape

In [None]:
# Check the test frame after the date and part-of-day columns were added.
test.head()

In [None]:
# The timestamp has been expanded into separate columns, so drop the raw
# `Datetime` column from both frames.
train = train.drop(columns='Datetime')
test = test.drop(columns='Datetime')

In [None]:
# Confirm the raw `Datetime` column is gone.
train.head()

In [None]:
# Separate the predictors from the `Offset_fault` target.
X = train.drop('Offset_fault', axis=1)
y = train['Offset_fault']

# Call `copy()` so X_test is an independent frame; without the parentheses the
# name was bound to the method object itself.
X_test = test.copy()



Here I count the NaN values in each row and store that count as a new feature.


In [None]:
def feature_engineering(df):
    """Add per-row summary columns; modifies ``df`` in place and returns it.

    ``NaN_row`` is the number of missing values in the row. ``std`` is the
    row-wise standard deviation, computed after ``NaN_row`` has been added, so
    that count is part of the values it is taken over.
    """
    df['NaN_row'] = df.isnull().sum(axis='columns')
    df['std'] = df.std(axis='columns')
    return df

# Add the per-row missing-value count and standard deviation to the training
# features and to the test frame.
X, test = feature_engineering(X), feature_engineering(test)

# The cell that follows was added later and engineers further columns.

In [None]:
def feature_engineering(df):
    """Add frame-level averages and the combined PM2.5 reading.

    Modifies ``df`` in place and returns it. Added columns:

    * ``AverageTemperature`` - mean of ``Temperature`` over the whole frame.
    * ``AverageRelativeHumidity`` - mean of ``Relative_Humidity`` over the
      whole frame.
    * ``Total_sensor`` - ``Sensor1_PM2.5 + Sensor2_PM2.5`` per row.
    """
    df['AverageTemperature'] = df['Temperature'].mean()
    # Store the humidity mean under its own name: assigning it back to
    # `Relative_Humidity` replaced every real reading with one constant.
    df['AverageRelativeHumidity'] = df['Relative_Humidity'].mean()
    # NOTE(review): both averages hold a single value for every row of a frame,
    # so on their own they cannot separate rows - confirm they are wanted.
    df['Total_sensor'] = df['Sensor1_PM2.5'] + df['Sensor2_PM2.5']
    return df

# Apply the averages / combined-sensor features to the training features and
# to the test frame, then preview the result.
X, test = feature_engineering(X), feature_engineering(test)

X.head()

In [None]:
# Preprocessing: fill missing values with the column mean, then standardise.
pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler()),
])

# Fit on the training features only and reuse the fitted transform for test.
# The arrays are wrapped back into frames that keep the original column names.
X = pd.DataFrame(pipeline.fit_transform(X), columns=X.columns)
test = pd.DataFrame(pipeline.transform(test), columns=test.columns)

In [None]:
# Preview the imputed and scaled training features.
X.head()

In [None]:
# Preview the imputed and scaled test features.
test.head()

In [None]:
# Hold out 1% of the labelled rows for monitoring during training.
# The split reuses the notebook-wide RANDOM_SEED instead of repeating the
# literal, so changing the seed in one place changes it everywhere.
X_train, X_test, y_train, y_test = train_test_split(
    X.values,
    y.values,
    test_size=0.01,
    random_state=RANDOM_SEED,
)

In [None]:
# (rows, columns) of the full preprocessed feature frame.
X.shape

In [None]:
# Convert the NumPy splits into float32 tensors for PyTorch.
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

# From here on `test` is a tensor holding the values of the test frame.
test = torch.tensor(test.values, dtype=torch.float32)

In [None]:


# Shapes of the training tensors (features, labels).
print(X_train.shape, y_train.shape)

# Shapes of the hold-out tensors (features, labels).
print(X_test.shape, y_test.shape)

In [None]:


# Unpack the training tensor's two dimensions; the bare name displays the feature count.
n_samples, n_features = X_train.shape
n_features

In [None]:
class Net(nn.Module):
    """Small fully connected binary classifier: n_features -> 5 -> 3 -> 1.

    The two hidden layers use ReLU; the single output unit goes through a
    sigmoid, so ``forward`` returns values in (0, 1).
    """

    def __init__(self, n_features):
        super().__init__()
        # Layers are created in this order, which also fixes the order of
        # their random initialisation.
        self.fc1 = nn.Linear(n_features, 5)
        self.fc2 = nn.Linear(5, 3)
        self.fc3 = nn.Linear(3, 1)

    def forward(self, x):
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return torch.sigmoid(self.fc3(hidden))

# Instantiate the network with one input unit per feature column of X_train.
net = Net(X_train.shape[1])

In [None]:
# Binary cross-entropy on the sigmoid outputs, optimised with Adam.
criterion = nn.BCELoss()
optimizer = optim.Adam(params=net.parameters(), lr=1e-3)

def calculate_accuracy(y_true, y_pred):
    """Return the fraction of predictions that match ``y_true`` as a 0-d float tensor.

    ``y_pred`` is thresholded at 0.5 (values >= 0.5 count as the positive
    class) and flattened to one dimension before the comparison.
    """
    hard_labels = torch.ge(y_pred, .5).view(-1)
    n_correct = torch.eq(y_true, hard_labels).sum()
    return n_correct.float() / len(y_true)

In [None]:
def round_tensor(t, decimal_places=3):
    """Return the single value held by tensor ``t`` as a rounded Python number."""
    scalar = t.item()
    return round(scalar, decimal_places)

# Full-batch training: every epoch pushes the whole training tensor through the
# network once and takes a single optimiser step.
for epoch in range(1000):

    # Drop only the trailing size-1 output dimension so predictions line up
    # with the 1-d label tensor; a bare squeeze() would also collapse a
    # one-row batch to a 0-d tensor.
    y_pred = net(X_train).squeeze(-1)

    train_loss = criterion(y_pred, y_train)

    # Every 100 epochs, report metrics measured before this epoch's update.
    if epoch % 100 == 0:

        # Metrics are read-only, so no autograd graph is needed for them.
        with torch.no_grad():

            train_acc = calculate_accuracy(y_train, y_pred)

            y_test_pred = net(X_test).squeeze(-1)

            test_loss = criterion(y_test_pred, y_test)

            test_acc = calculate_accuracy(y_test, y_test_pred)

        print(

f'''epoch {epoch}

Train set - loss: {round_tensor(train_loss)}, accuracy: {round_tensor(train_acc)}

Test  set - loss: {round_tensor(test_loss)}, accuracy: {round_tensor(test_acc)}

''')

    # Standard update: clear old gradients, backpropagate, step.
    optimizer.zero_grad()

    train_loss.backward()

    optimizer.step()

In [None]:
from sklearn.metrics import classification_report

# Display names for the two target classes, in label order.
classes = ['0', '1']

# Score the hold-out rows and turn the sigmoid outputs into hard labels
# (>= 0.5 is the positive class).
y_pred = net(X_test).ge(.5).view(-1).cpu()
y_test = y_test.cpu()

# Per-class precision / recall / F1 on the hold-out rows.
print(classification_report(y_test, y_pred, target_names=classes))

In [None]:
# Score the competition test tensor and threshold at 0.5 to get one boolean
# prediction per row.
y_pred = net(test).ge(.5).view(-1).cpu()

In [None]:
# Number of predictions produced.
y_pred.shape

In [None]:
# Shape of the test tensor, for comparison with the number of predictions.
test.shape

In [None]:
# Convert the prediction tensor to a NumPy array. NumPy is already imported in
# the first cell, so the repeated import was dropped.
y_pred = y_pred.numpy()

In [None]:
# Wrap the prediction array in a DataFrame (the single column is labelled 0).
y_pred = pd.DataFrame(y_pred)

In [None]:
# Display the prediction frame.
y_pred

In [None]:
# Take the single prediction column (label 0) out as a Series.
pred = y_pred[0]

In [None]:
# Convert the boolean predictions to 0/1 integers for the submission file.
# The previous replace() used the strings 'True'/'False' as keys, which never
# match boolean values, so the column was left as True/False.
pred_l = pred.astype(int)

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Fit a label encoder on the prediction column and encode it as integer codes.
# NOTE(review): `pred_lg` is not what the submission cell writes (it uses
# `pred_l`) - confirm whether this step is still needed.
encode = LabelEncoder()

pred_lg = encode.fit_transform(y_pred[0])

In [None]:
# Display the label-encoded predictions.
pred_lg

In [None]:
# Build the submission: IDs come from the sample submission file, predictions
# from `pred_l`. Written without the index column.
sub = pd.DataFrame(data=dict(ID=ss['ID'], Offset_fault=pred_l))
sub.to_csv('pytorch.csv', index=False)

In [None]:
# Read the written file back and preview it as a sanity check.
sub = pd.read_csv('pytorch.csv')
sub.head()

In [None]:
# NOTE(review): an unfinished statement (`encode = pd.`) was removed from this
# cell; it was a SyntaxError that stopped "Run All" here. Restore the intended
# expression if it is still needed.

In [None]:
class BinaryNetwork(nn.Module):
    """Fully connected classifier: input_size -> 64 -> 32 -> 16 -> output_size.

    Each hidden layer is followed by ReLU; the output layer is followed by a
    sigmoid, so ``forward`` returns values between 0 and 1.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        self.l1 = nn.Linear(input_size, 64)
        self.l2 = nn.Linear(64, 32)
        self.l3 = nn.Linear(32, 16)
        self.out = nn.Linear(16, output_size)

    def forward(self, x):
        # Hidden stack: linear layer followed by ReLU, three times over.
        for hidden_layer in (self.l1, self.l2, self.l3):
            x = F.relu(hidden_layer(x))
        # Sigmoid scales the output to the (0, 1) range.
        return torch.sigmoid(self.out(x))

In [None]:
# Build the deeper binary classifier together with its loss and optimiser.
input_size = X.shape[1] # one input unit per column of the preprocessed feature frame
output_size = 1
model = BinaryNetwork(input_size, output_size)
loss_fn = nn.BCELoss() # Binary Cross Entropy
# NOTE(review): this rebinds `optim`, shadowing the `torch.optim` module imported
# in the first cell; re-running the earlier `optim.Adam(...)` cell after this
# one would fail.
optim = torch.optim.Adam(model.parameters(), lr=1e-3)
model

In [None]:
epochs = 500
BATCH_SIZE = 64  # rows per optimiser step; tune as needed

# Pair every feature row with its label and serve them in shuffled mini-batches.
# Iterating X_train directly yields bare feature rows, which cannot be unpacked
# into (feat, target).
train_loader = DataLoader(
    TensorDataset(X_train, y_train),
    batch_size=BATCH_SIZE,
    shuffle=True,
)

losses = []  # summed mini-batch loss for each epoch
for i in range(epochs):
    epoch_loss = 0
    for feat, target in train_loader:
        optim.zero_grad()
        out = model(feat)
        # Labels arrive as (batch,); add a trailing dimension so they line up
        # with the (batch, 1) network output.
        loss = loss_fn(out, target.unsqueeze(1))
        epoch_loss += loss.item()
        loss.backward()
        optim.step()
    losses.append(epoch_loss)
    # Every 10 epochs, print the loss of that epoch's last mini-batch.
    if i % 10 == 0:
        print(f"Epoch: {i}/{epochs}, Loss = {loss.item():.5f}")