In [None]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide pandas display settings: show at most 10 rows and format floats with one decimal
pd.options.display.max_rows = 10
pd.options.display.float_format = "{:.1f}".format

# NOTE(review): this silences every Python warning for the rest of the session,
# including library warnings about shape or copy problems in the cells below.
import warnings
warnings.filterwarnings("ignore")

# Seed PyTorch's global random number generator so reruns start from the same state
torch.manual_seed(1)

### Loading dataset using Pandas
For a detailed explanation, click [here](https://github.com/akshayparakh25/Python-for-Data_Science/blob/master/jupyter-notebooks/pandas.ipynb). <br>
The dataset is available [here](https://github.com/akshayparakh25/imdb_1000_scraper/blob/master/imdb_data.csv)

In [None]:
# Short aliases for the verbose column headers found in the CSV
column_aliases = {
    'User Votes': 'Votes',
    'Imdb Rating': 'Rating',
    'Gross(in Million Dollars)': 'Earnings',
    'Runtime(Minutes)': 'Runtime',
}

# The file is tab-separated; read it and rename the columns in a single chain
df = pd.read_csv('./../data/imdbtop1000/imdb_data.csv', sep='\t').rename(columns=column_aliases)
df.head()

#### Statistical analysis of the data to find the best input feature for the target *Rating*

In [None]:
#Summary statistics of the loaded frame, displayed as the cell output
df.describe()

In [None]:
#Correlation between columns to identify the best feature for training a model.
#Only numeric columns are kept: recent pandas versions raise an error when
#corr() meets non-numeric columns instead of silently dropping them.
df.select_dtypes(include='number').corr()

## Linear Regression with one variable

In [None]:
#Scaling the input feature to a small range keeps the gradients (and hence the
#parameter updates) from blowing up: raw vote counts are expressed in millions.
#assign() returns a brand-new frame, so df is never modified through a slice
#and re-running this cell always starts from the raw vote counts.
dataframe = df[['Votes', 'Rating']].assign(Votes=lambda frame: frame['Votes'] / 1_000_000)
dataframe.head()

In [None]:
#Show the rows whose Votes value is missing; an empty result means that column has no nulls
missing_votes = dataframe['Votes'].isna()
dataframe[missing_votes]

#### Visualizing dataset

In [None]:
# Scatter of the scaled vote counts against the rating
votes, ratings = dataframe['Votes'], dataframe['Rating']

plt.figure(figsize=(8, 6))
sns.scatterplot(x=votes, y=ratings)
plt.title("Analysis of data points Votes Vs Rating")
plt.ylabel('IMDB Rating')
plt.xlabel('User Votes')
plt.show()

### Creating custom dataset using Dataset class from torch.utils.data
For a detailed explanation, check [here](https://github.com/akshayparakh25/Python-for-Data_Science/blob/master/jupyter-notebooks/pytorch.ipynb).

In [None]:
from torch.utils.data import Dataset, DataLoader

In [None]:
# x = torch.arange(-4, 4, 0.1).view(-1,1)
# f = -2*x + 3
# df = pd.DataFrame({'X' : x.view(1,-1).numpy()[0], 'Y' : f.view(1,-1).numpy()[0]})
# df.shape[0]

In [None]:
class ratingData(Dataset):
    """Map-style dataset over a two-column frame.

    Each sample is the pair ``(column 0, column 1)`` of one row, selected by
    position, optionally passed through ``transform``.

    Args:
        df: frame whose first column is the input and second the target.
        transform: optional callable applied to the ``(input, target)`` pair.
    """

    def __init__(self, df, transform=None):
        self.transform = transform
        self.data = df

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the (optionally transformed) sample at row position ``idx``."""
        feature = self.data.iloc[idx, 0]
        target = self.data.iloc[idx, 1]
        sample = (feature, target)
        if not self.transform:
            return sample
        return self.transform(sample)

#### Creating instance of Rating Dataset

In [None]:
#Wrap the (Votes, Rating) frame so that samples can be fetched by position
dataset = ratingData(dataframe)

In [None]:
#Spot-check one sample: __getitem__ returns a (Votes, Rating) pair
dataset[123]

In [None]:
#Number of samples, i.e. the number of rows in the wrapped frame
len(dataset)

### Creating instance of Dataloader class to iterate over dataset in batches

In [None]:
#One sample per batch, drawn in shuffled order.
#The experiment cells further down rebuild `dataloader` with their own batch size.
dataloader = DataLoader(dataset, shuffle=True, batch_size=1)

### Creating a Linear Regression model

In [None]:
from torch import nn, optim

In [None]:
class LinearRegression(nn.Module):
    """Linear model made of a single ``nn.Linear`` layer.

    Args:
        inp_size: number of input features per sample.
        out_size: number of values predicted per sample.
    """

    def __init__(self, inp_size, out_size):
        super().__init__()
        # The layer owns the only learnable parameters: one weight matrix and one bias
        self.linear = nn.Linear(in_features=inp_size, out_features=out_size)

    def forward(self, x):
        """Forward pass: apply the linear layer to the batch ``x``."""
        return self.linear(x)

In [None]:
#One input feature and one predicted value per sample
model = LinearRegression(1,1)

In [None]:
#Randomly initialised parameters.
#parameters() yields the tensors lazily, so wrap it in list() to display them.
list(model.parameters())

In [None]:
#state_dict is a method: it has to be called to get the mapping from
#parameter names to tensors (without the call only the bound method is shown)
model.state_dict()

In [None]:
#The nn.Linear layer created in LinearRegression.__init__
model.linear

In [None]:
#Weight of the linear layer
model.linear.weight

In [None]:
#Bias of the linear layer
model.linear.bias

#### Defining loss function

In [None]:
#Mean squared error loss, shown here for illustration;
#train() below creates its own nn.MSELoss instance
criterion = nn.MSELoss()

#### Defining a Stochastic Gradient Descent (SGD) Optimizer

In [None]:
#SGD over the model's parameters, shown here for illustration;
#train() below creates its own optimizer from the lr it is given
optimizer = optim.SGD(model.parameters(), lr=0.02)

In [None]:
#Like models, optimizers also expose a state dictionary
optimizer.state_dict()

Many of these keys correspond to settings used by other, more advanced optimizers.

In [None]:
# #Model weights and bias can also be manually set 
# model.state_dict()['linear.weight'][0] = -10.0
# model.state_dict()['linear.bias'][0] = -15.0

In [None]:
def train(model, dataloader, epochs, lr):
    """Fit ``model`` with SGD on a mean-squared-error objective.

    Args:
        model: ``nn.Module`` that takes a ``(batch, 1)`` tensor of inputs.
        dataloader: iterable yielding ``(x, y)`` tensor pairs, one pair per batch.
        epochs: number of full passes over ``dataloader``.
        lr: learning rate handed to ``optim.SGD``.

    Returns:
        A list with one 0-dim tensor per epoch: the mean of that epoch's batch losses.

    Raises:
        RuntimeError: if a target batch cannot be reshaped to the prediction's shape.
    """
    LOSS = []

    criterion = nn.MSELoss()

    optimizer = optim.SGD(model.parameters(), lr=lr)

    # Batches built from pandas data arrive as float64; the model's own dtype is
    # the one the inputs must be converted to before the forward pass.
    param_dtype = next(model.parameters()).dtype

    for epoch in range(epochs):
        print(" Epoch :: ", epoch)
        epochloss = []
        for x, y in dataloader:
            #Making predictions on a column vector of inputs
            pred = model(x.view(-1, 1).to(param_dtype))

            #Calculating loss.
            #The targets must have exactly the prediction's shape: a (B,) target
            #against a (B, 1) prediction would be broadcast to (B, B) and every
            #prediction would be compared with every target.
            target = y.to(pred.dtype).reshape(pred.shape)
            loss = criterion(pred, target)
            #Store a plain float so the autograd graph of the batch is not kept alive
            epochloss.append(loss.item())

            #Clears the gradients of all optimized tensors
            #Setting gradients to 0
            optimizer.zero_grad()

            #Calculate gradient for loss
            loss.backward()

            #To update the learnable parameters (weight and bias)
            optimizer.step()

        mean_loss = torch.mean(torch.tensor(epochloss))
        LOSS.append(mean_loss)
        print("Total Losses :: ", mean_loss)
    return LOSS

#### Batch Gradient Descent

In [None]:
# Batch gradient descent: each optimisation step sees the whole dataset at once
epochs = 15
batch_size = len(dataset)
learning_rate = 0.1

model = LinearRegression(1,1)
# parameters() is a generator, so materialise it to print the actual tensors
print("Initialised Model's parameters :: ", list(model.parameters()))

dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size)

LOSS = train(model=model, dataloader=dataloader, epochs=epochs, lr=learning_rate)

print(list(model.parameters()))

In [None]:
# Read the fitted slope and intercept out of the linear layer
weight = model.linear.weight[0].item()
bias = model.linear.bias[0].item()
print(weight, bias, sep='\n')

# Fitted line evaluated at every observed vote count
votes, ratings = dataframe['Votes'], dataframe['Rating']
predictions = weight * votes + bias

plt.figure(figsize=(8, 6))
sns.scatterplot(x=votes, y=ratings)
sns.lineplot(x=votes, y=predictions, color='red')
plt.title("Analysis of trained model and data points")
plt.ylabel('IMDB Rating')
plt.xlabel('User Votes')
plt.show()

In [None]:
# Mean training loss per epoch for the full-batch run
plt.figure(figsize=(8, 6))
plt.plot(LOSS, label='BGD')
plt.legend()
plt.ylabel('loss')
plt.xlabel('epochs')
plt.show()

#### Stochastic Gradient Descent

In [None]:
# Stochastic gradient descent: the parameters are updated after every single sample
epochs = 15
batch_size = 1
learning_rate = 0.1

model = LinearRegression(1,1)
# parameters() is a generator, so materialise it to print the actual tensors
print("Initialised Model's parameters :: ", list(model.parameters()))

dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size)

LOSS = train(model=model, dataloader=dataloader, epochs=epochs, lr=learning_rate)

print(list(model.parameters()))

In [None]:
# Read the fitted slope and intercept out of the linear layer
weight = model.linear.weight[0].item()
bias = model.linear.bias[0].item()
print(weight, bias, sep='\n')

# Fitted line evaluated at every observed vote count
votes, ratings = dataframe['Votes'], dataframe['Rating']
predictions = weight * votes + bias

plt.figure(figsize=(8, 6))
sns.scatterplot(x=votes, y=ratings)
sns.lineplot(x=votes, y=predictions, color='red')
plt.title("Analysis of trained model and data points")
plt.ylabel('IMDB Rating')
plt.xlabel('User Votes')
plt.show()

In [None]:
# Mean training loss per epoch for the one-sample-per-step run
plt.figure(figsize=(8, 6))
plt.plot(LOSS, label='SGD')
plt.legend()
plt.ylabel('loss')
plt.xlabel('epochs')
plt.show()

#### Mini-Batch Gradient Descent

In [None]:
# Mini-batch gradient descent: the parameters are updated after every batch of 15 samples
epochs = 15
batch_size = 15
learning_rate = 0.1

model = LinearRegression(1,1)
# parameters() is a generator, so materialise it to print the actual tensors
print("Initialised Model's parameters :: ", list(model.parameters()))

dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size)

LOSS = train(model=model, dataloader=dataloader, epochs=epochs, lr=learning_rate)

print(list(model.parameters()))

In [None]:
# Read the fitted slope and intercept out of the linear layer
weight = model.linear.weight[0].item()
bias = model.linear.bias[0].item()
print(weight, bias, sep='\n')

# Fitted line evaluated at every observed vote count
votes, ratings = dataframe['Votes'], dataframe['Rating']
predictions = weight * votes + bias

plt.figure(figsize=(8, 6))
sns.scatterplot(x=votes, y=ratings)
sns.lineplot(x=votes, y=predictions, color='red')
plt.title("Analysis of trained model and data points")
plt.ylabel('IMDB Rating')
plt.xlabel('User Votes')
plt.show()

In [None]:
# Mean training loss per epoch for the mini-batch run
plt.figure(figsize=(8, 6))
plt.plot(LOSS, label='MBGD')
plt.legend()
plt.ylabel('loss')
plt.xlabel('epochs')
plt.show()