In [None]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display settings: short tables, one decimal place for floats.
pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", "{:.1f}".format)

# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas/PyTorch deprecation notices.
import warnings
warnings.filterwarnings("ignore")

# Seed PyTorch's global RNG so later random operations are repeatable.
torch.manual_seed(1)

## Exploring Dataset

In [None]:
# Load the tab-separated IMDB export and shorten the verbose column headers.
df = pd.read_csv('./../data/imdbtop1000/imdb_data.csv', sep='\t')
df = df.rename(columns={
    'User Votes': 'Votes',
    'Imdb Rating': 'Rating',
    'Gross(in Million Dollars)': 'Earnings',
    'Runtime(Minutes)': 'Runtime',
})

# Correlate the numeric columns only. Recent pandas releases no longer drop
# non-numeric columns silently in DataFrame.corr(), so any text column in the
# file would make a bare df.corr() raise. Selecting by dtype works on both old
# and new pandas versions.
df.select_dtypes(include='number').corr()

In [None]:
# Tunable preprocessing constants.
VOTES_SCALE = 1000000      # raw vote counts are divided by this (i.e. expressed in millions)
RATING_THRESHOLD = 7.6     # ratings <= this become class 0.0, everything else class 1.0

# Build the modelling frame in one expression. .assign() returns a new frame,
# so nothing is written into a slice of `df` (no chained-assignment warning)
# and re-running the cell always starts from the untouched `df`.
# Scaling the feature to a small range keeps the optimisation numerically tame.
dataframe = df[['Votes', 'Rating']].assign(
    Votes=lambda d: d['Votes'] / VOTES_SCALE,
    # np.where keeps the original rule exactly: anything that is not
    # "<= threshold" (including missing ratings) maps to 1.0.
    Rating=lambda d: np.where(d['Rating'] <= RATING_THRESHOLD, 0.0, 1.0),
)

dataframe.describe()

In [None]:
# One figure with two side-by-side panels. The title is set on the figure
# itself so it spans both panels instead of being attached to an implicit
# axes created before the subplots exist.
fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(11, 5))
fig.suptitle("Analysis of data points Votes Vs Rating")

# Left panel: distribution of (scaled) votes within each rating class.
sns.swarmplot(x=dataframe.Rating, y=dataframe.Votes, ax=ax0)
ax0.set_ylabel('User Votes')
ax0.set_xlabel('IMDB Rating')

# Right panel: rating class against votes, coloured by class.
sns.scatterplot(x=dataframe.Votes, y=dataframe.Rating, hue=dataframe.Rating, ax=ax1)
ax1.set_yticks([0, 1])  # the label is binary, so only these two ticks are meaningful
ax1.set_xlabel('User Votes')
ax1.set_ylabel('IMDB Rating')

plt.show()

## Creating Dataset 

In [None]:
from torch.utils.data import Dataset, DataLoader, random_split

In [None]:
class ratingData(Dataset):
    """Map-style dataset exposing a two-column frame as (feature, label) pairs.

    Args:
        df: DataFrame whose first column holds the feature and whose second
            column holds the label. Columns are read by position, not name.
        transform: Optional callable applied to the ``(feature, label)``
            tuple before it is returned.
    """

    def __init__(self, df, transform=None):
        self.data = df
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return row ``idx`` as a ``(feature, label)`` pair of 0-d float32 tensors.

        The values are converted explicitly so that batches built by a
        ``DataLoader`` are always float32 (the dtype of ``nn.Linear``
        parameters), regardless of the column dtypes in the frame.
        """
        feature = torch.as_tensor(self.data.iloc[idx, 0], dtype=torch.float32)
        target = torch.as_tensor(self.data.iloc[idx, 1], dtype=torch.float32)
        sample = (feature, target)
        if self.transform:
            sample = self.transform(sample)
        return sample

In [None]:
# Wrap the prepared two-column frame (feature first, label second) as a PyTorch dataset.
dataset = ratingData(dataframe)

In [None]:
# Hold out a fixed fraction of the rows for testing. Sizes are derived from the
# dataset length (800/200 for 1000 rows) because random_split raises when the
# requested lengths do not add up to len(dataset).
TEST_FRACTION = 0.2
n_test = round(len(dataset) * TEST_FRACTION)
train, test = random_split(dataset, lengths=[len(dataset) - n_test, n_test])
len(train), len(test)

## Building model

In [None]:
from torch import nn, optim

## Logistic Regression using nn.Sequential class

In [None]:
# Logistic regression as a two-stage pipeline: one affine unit, then a sigmoid.
model_seq = nn.Sequential(
    nn.Linear(in_features=1, out_features=1),
    nn.Sigmoid(),
)

In [None]:
# Smoke test on the untrained model: three single-feature rows in, one sigmoid output per row.
model_seq(torch.tensor([[0.3], [2.5], [1.0]]))

In [None]:
# Inspect the learnable parameters held by the Sequential model.
list(model_seq.parameters())

## Logistic Regression using custom class

In [None]:
class LogisticRegression(nn.Module):
    """Logistic-regression model: one linear unit followed by a sigmoid.

    Args:
        input_size: Number of input features per sample.
    """

    def __init__(self, input_size):
        super().__init__()
        # Single output unit; its weight and bias are the only parameters.
        self.linear = nn.Linear(in_features=input_size, out_features=1)

    def forward(self, x):
        """Return ``sigmoid(linear(x))`` for the input batch ``x``."""
        logits = self.linear(x)
        return logits.sigmoid()

In [None]:
# Same model as the Sequential version, built from the custom class with one input feature.
model_custom = LogisticRegression(1)

In [None]:
# Smoke test on the untrained custom model with three single-feature rows.
model_custom(torch.tensor([[0.3], [2.5], [1.0]]))

In [None]:
# Inspect the learnable parameters of the custom model.
list(model_custom.parameters())

In [None]:
# The custom class takes the feature count as an argument; here it is built for two features.
model_custom_multi = LogisticRegression(2)

In [None]:
# Smoke test on the untrained two-feature model: three rows of two features each.
model_custom_multi(torch.tensor([[0.3, 2.1], 
                           [2.5, 1.3],
                           [1.0, 0.9]]))

### Initialising criterion / loss function

In [None]:
# Binary cross-entropy loss; read as a notebook-level name inside train_model.
criterion = nn.BCELoss()

### Training model

In [None]:
learning_rate = 2  # SGD step size handed to train_model
epochs = 300  # number of passes over trainloader
# batch_size=len(train) puts the whole training split into a single batch,
# so every epoch performs exactly one optimizer step.
trainloader = DataLoader(train, shuffle=True, batch_size=len(train))
LOSS = []  # train_model appends one mean-loss value per epoch

In [None]:
def train_model(dataloader, lr, epochs):
    """Train a fresh one-feature ``LogisticRegression`` with plain SGD.

    The notebook-level ``criterion`` is used as the loss. The mean loss of
    every epoch is stored (as a Python float) in the notebook-level ``LOSS``
    list, which is cleared first so that re-running the training cell does
    not stack the curves of earlier runs.

    Args:
        dataloader: Iterable of ``(x, y)`` batches. Both are reshaped to
            ``(batch, 1)`` float32 columns before use.
        lr: Learning rate for ``optim.SGD``.
        epochs: Number of full passes over ``dataloader``.

    Returns:
        The trained ``LogisticRegression`` model.
    """
    model = LogisticRegression(1)
    optimizer = optim.SGD(model.parameters(), lr=lr)
    LOSS.clear()

    for epoch in range(epochs):
        batch_losses = []
        for x, y in dataloader:
            # The model emits (batch, 1); the targets must have the same shape
            # (and dtype) or the loss refuses the mismatched sizes.
            features = x.view(-1, 1).float()
            targets = y.view(-1, 1).float()

            pred = model(features)
            loss = criterion(pred, targets)
            # .item() stores a plain float and lets the autograd graph be freed.
            batch_losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # An empty dataloader yields NaN rather than a division error.
        epoch_loss = sum(batch_losses) / len(batch_losses) if batch_losses else float("nan")
        print("Epoch :: {},  Loss :: {}".format(epoch, epoch_loss))
        LOSS.append(epoch_loss)

    return model

In [None]:
# Run training with the settings defined above; per-epoch losses end up in LOSS.
model = train_model(trainloader, learning_rate, epochs)

In [None]:
# Training curve: one point per epoch, taken from the LOSS list.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(LOSS, label="LOSS")
ax.legend()
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
plt.show()

### Model's Accuracy

In [None]:

# Score the training split with autograd disabled. Inputs, targets and
# thresholded predictions are gathered across all batches so the accuracy and
# the plot cover the whole split, however many batches trainloader yields.
inputs, targets, predicted = [], [], []
with torch.no_grad():
    for x, y in trainloader:
        pred = model(x.view(-1, 1).float())
        inputs.append(x.view(-1))
        targets.append(y.view(-1).float())
        predicted.append((pred > 0.5).float().view(-1))  # 0.5 is the decision threshold

x, y, label = torch.cat(inputs), torch.cat(targets), torch.cat(predicted)
print("Accuracy of the model on train data :: {:.2f}".format((y == label).float().mean().item()))

# Overlay true and predicted classes against the (scaled) vote count.
fig, ax = plt.subplots()
sns.scatterplot(x=x.numpy(), y=y.numpy(), label='original', alpha=0.5, ax=ax)
sns.scatterplot(x=x.numpy(), y=label.numpy(), label='predicted', alpha=0.5, ax=ax)
ax.set_xlabel('User Votes (millions)')
ax.set_ylabel('Rating class')
ax.set_title('Train split: original vs predicted class')
ax.legend()
plt.show()

In [None]:
# Evaluation does not need shuffling; the whole test split is served as one batch.
testloader = DataLoader(test, shuffle=False, batch_size=len(test))

# Score the test split with autograd disabled. Inputs, targets and thresholded
# predictions are gathered across all batches so the accuracy and the plot
# cover the whole split, however many batches testloader yields.
inputs, targets, predicted = [], [], []
with torch.no_grad():
    for x, y in testloader:
        pred = model(x.view(-1, 1).float())
        inputs.append(x.view(-1))
        targets.append(y.view(-1).float())
        predicted.append((pred > 0.5).float().view(-1))  # 0.5 is the decision threshold

x, y, label = torch.cat(inputs), torch.cat(targets), torch.cat(predicted)
print("Accuracy of the model on test data :: {:.2f}".format((y == label).float().mean().item()))

# Overlay true and predicted classes against the (scaled) vote count.
fig, ax = plt.subplots()
sns.scatterplot(x=x.numpy(), y=y.numpy(), label='original', alpha=0.5, ax=ax)
sns.scatterplot(x=x.numpy(), y=label.numpy(), label='predicted', alpha=0.5, ax=ax)
ax.set_xlabel('User Votes (millions)')
ax.set_ylabel('Rating class')
ax.set_title('Test split: original vs predicted class')
ax.legend()
plt.show()