In [1]:
import pandas as pd
import sqlite3
import numpy as np

## Checking the dataframe and choosing features
First, we import the cleaned dataset. The whole dataset has 12 features as well as over 400,000 movie data.

In [2]:
imdb = pd.read_csv('imdb.csv')
imdb

Unnamed: 0.1,Unnamed: 0,tconst,nconst,category,averageRating,numVotes,primaryTitle,isAdult,startYear,runtimeMinutes,genres,primaryName
0,92759,tt0013274,nm0412842,director,6.7,48,Istoriya grazhdanskoy voyny,0,2021,133,Documentary,Nikolai Izvolov
1,92760,tt0013274,nm0895048,director,6.7,48,Istoriya grazhdanskoy voyny,0,2021,133,Documentary,Dziga Vertov
2,522892,tt0062336,nm0815612,actor,6.4,164,The Tango of the Widower and Its Distorting Mi...,0,2020,70,Drama,Rubén Sotoconil
3,522893,tt0062336,nm1860495,actress,6.4,164,The Tango of the Widower and Its Distorting Mi...,0,2020,70,Drama,Claudia Paz
4,522894,tt0062336,nm0016013,actor,6.4,164,The Tango of the Widower and Its Distorting Mi...,0,2020,70,Drama,Luis Alarcón
...,...,...,...,...,...,...,...,...,...,...,...,...
404310,54610727,tt9916730,nm6096005,actor,8.3,10,6 Gunn,0,2017,116,\N,Devadhar Archit
404311,54610728,tt9916730,nm0059461,actor,8.3,10,6 Gunn,0,2017,116,\N,Sunil Barve
404312,54610729,tt9916730,nm13233318,actor,8.3,10,6 Gunn,0,2017,116,\N,Ganesh Vasant Patil
404313,54610730,tt9916730,nm4852679,actor,8.3,10,6 Gunn,0,2017,116,\N,Bhushan Pradhan


In [3]:
df1 = imdb
df1 = df1.drop(['genres'], axis=1).join(df1['genres'].str.split(',', expand=True).stack().reset_index(level=1, drop=True).rename('genres'))
df1

Unnamed: 0.1,Unnamed: 0,tconst,nconst,category,averageRating,numVotes,primaryTitle,isAdult,startYear,runtimeMinutes,primaryName,genres
0,92759,tt0013274,nm0412842,director,6.7,48,Istoriya grazhdanskoy voyny,0,2021,133,Nikolai Izvolov,Documentary
1,92760,tt0013274,nm0895048,director,6.7,48,Istoriya grazhdanskoy voyny,0,2021,133,Dziga Vertov,Documentary
2,522892,tt0062336,nm0815612,actor,6.4,164,The Tango of the Widower and Its Distorting Mi...,0,2020,70,Rubén Sotoconil,Drama
3,522893,tt0062336,nm1860495,actress,6.4,164,The Tango of the Widower and Its Distorting Mi...,0,2020,70,Claudia Paz,Drama
4,522894,tt0062336,nm0016013,actor,6.4,164,The Tango of the Widower and Its Distorting Mi...,0,2020,70,Luis Alarcón,Drama
...,...,...,...,...,...,...,...,...,...,...,...,...
404310,54610727,tt9916730,nm6096005,actor,8.3,10,6 Gunn,0,2017,116,Devadhar Archit,\N
404311,54610728,tt9916730,nm0059461,actor,8.3,10,6 Gunn,0,2017,116,Sunil Barve,\N
404312,54610729,tt9916730,nm13233318,actor,8.3,10,6 Gunn,0,2017,116,Ganesh Vasant Patil,\N
404313,54610730,tt9916730,nm4852679,actor,8.3,10,6 Gunn,0,2017,116,Bhushan Pradhan,\N


In [118]:
drop_columns = ["Unnamed: 0", "tconst", "nconst", "category", "primaryTitle", "primaryName"]
df_model = df1.drop(drop_columns, axis=1)
df_model

Unnamed: 0,averageRating,numVotes,isAdult,startYear,runtimeMinutes,genres
0,6.7,48,0,2021,133,Documentary
1,6.7,48,0,2021,133,Documentary
2,6.4,164,0,2020,70,Drama
3,6.4,164,0,2020,70,Drama
4,6.4,164,0,2020,70,Drama
...,...,...,...,...,...,...
404310,8.3,10,0,2017,116,\N
404311,8.3,10,0,2017,116,\N
404312,8.3,10,0,2017,116,\N
404313,8.3,10,0,2017,116,\N


In [119]:
df_model['genres'] = df_model['genres'].replace({r'\N': None})
df_model['runtimeMinutes'] = df_model['runtimeMinutes'].replace({r'\N': None}).astype(float)

In [120]:
df_model = df_model.dropna()

In [121]:
df_model = df_model.reset_index(drop=True)

In [122]:
from sklearn.preprocessing import LabelEncoder

# creating instances of labelencoder
labelencoder = LabelEncoder()

# Assigning numeric values and convert the non-numeric column
df_model['genres'] = labelencoder.fit_transform(df_model['genres'])
df_model.head()

Unnamed: 0,averageRating,numVotes,isAdult,startYear,runtimeMinutes,genres
0,6.7,48,0,2021,133.0,7
1,6.7,48,0,2021,133.0,7
2,6.4,164,0,2020,70.0,8
3,6.4,164,0,2020,70.0,8
4,6.4,164,0,2020,70.0,8


In [123]:
y = df_model['averageRating']
x = df_model.drop(['averageRating'], axis = 1)

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.model_selection import train_test_split

In [124]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [125]:
# Pipeline
numerical_features = list(X_train)

# construct a pipeline and scale the numeric features
num_pipeline = Pipeline([
        ('std_scaler', StandardScaler())
    ])

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, numerical_features)
    ])

X_prepared_train = full_pipeline.fit_transform(X_train)

# Pipeline the test data
X_prepared_test = full_pipeline.transform(X_test)

In [12]:
from sklearn import decomposition

# First we create a PCA object with the 4 components as a parameter
pca = decomposition.PCA(n_components=4)

# Now we run the fit operation to convert our 
# data to a PCA transformmed data
X_pca_train = pca.fit_transform(X_prepared_train)
X_pca_test = pca.transform(X_prepared_test)

In [126]:
X_train, X_val, y_train, y_val = train_test_split(X_prepared_train, y_train, test_size=0.15, random_state=42)
X_test = X_prepared_test

In [127]:
X_train

array([[-0.13740275, -0.02711564, -1.55371813,  0.17461705, -0.73087019],
       [-0.14639556, -0.02711564, -0.1297384 , -0.07714882, -1.4949215 ],
       [-0.09720795, -0.02711564,  1.65023626,  0.08528077, -0.11962915],
       ...,
       [-0.14176865, -0.02711564,  1.29424133, -0.04466291,  1.56128372],
       [-0.14615829, -0.02711564, -0.84172827, -0.00405551,  2.17252476],
       [-0.13543335, -0.02711564, -0.1297384 , -0.03654143, -0.27243941]])

In [1]:
import nn

In [17]:
# fix random seed
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7fedbddc2550>

In [129]:
# Data Preparation
train_batch_size = 16
test_batch_size = 32

## Load data
X_train, y_train, X_val, y_val, X_test, y_test = create_data(X_train, y_train, X_val, y_val, X_test, y_test)

## Create data loaders
train_loader, val_loader, test_loader = get_dataloaders(X_train, y_train, X_val, y_val, 
                                                        X_test, y_test, train_batch_size=train_batch_size, 
                                                        test_batch_size=test_batch_size)

In [137]:
## Define hyperparameters
lr = 0.001
num_epochs = 20

## Init models
oneLayerNN, criterion, optimizer = init_oneLayerNN(X_train.shape[1], lr=lr)
results_one = train(oneLayerNN, criterion, optimizer, train_loader, val_loader, num_epochs=num_epochs)

Start training model...
| epoch  1 | train loss mean 0.138347 | train lost best 0.270791 | valid loss mean 0.065894 | valid loss best 0.667291 |
| epoch  2 | train loss mean 0.138103 | train lost best 0.270779 | valid loss mean 0.065897 | valid loss best 0.667469 |
| epoch  3 | train loss mean 0.138067 | train lost best 0.270778 | valid loss mean 0.065898 | valid loss best 0.667496 |
| epoch  4 | train loss mean 0.138062 | train lost best 0.270778 | valid loss mean 0.065898 | valid loss best 0.667500 |
| epoch  5 | train loss mean 0.138061 | train lost best 0.270778 | valid loss mean 0.065898 | valid loss best 0.667500 |
| epoch  6 | train loss mean 0.138061 | train lost best 0.270778 | valid loss mean 0.065898 | valid loss best 0.667500 |
| epoch  7 | train loss mean 0.138061 | train lost best 0.270778 | valid loss mean 0.065898 | valid loss best 0.667500 |
| epoch  8 | train loss mean 0.138061 | train lost best 0.270778 | valid loss mean 0.065898 | valid loss best 0.667500 |
| epoch 

In [138]:
## Define hyperparameters
lr = 0.001
hidden_size = 6
activation = 'ReLU'
num_epochs = 20

## Init models
twoLayerNN, criterion, optimizer = init_twoLayerNN(X_train.shape[1], hidden_size, activation, lr=lr)
results_two = train(twoLayerNN, criterion, optimizer, train_loader, val_loader, num_epochs=num_epochs)

Start training model...
| epoch  1 | train loss mean 0.128697 | train lost best 0.277508 | valid loss mean 0.064111 | valid loss best 0.658531 |
| epoch  2 | train loss mean 0.126034 | train lost best 0.265475 | valid loss mean 0.062807 | valid loss best 0.671580 |
| epoch  3 | train loss mean 0.128931 | train lost best 0.264889 | valid loss mean 0.064265 | valid loss best 0.657186 |
| epoch  4 | train loss mean 0.126843 | train lost best 0.296647 | valid loss mean 0.063291 | valid loss best 0.667463 |
| epoch  5 | train loss mean 0.125874 | train lost best 0.294685 | valid loss mean 0.062863 | valid loss best 0.673025 |
| epoch  6 | train loss mean 0.125299 | train lost best 0.288160 | valid loss mean 0.062533 | valid loss best 0.671832 |
| epoch  7 | train loss mean 0.125060 | train lost best 0.300616 | valid loss mean 0.062409 | valid loss best 0.671432 |
| epoch  8 | train loss mean 0.124963 | train lost best 0.304942 | valid loss mean 0.062360 | valid loss best 0.667838 |
| epoch 

In [142]:
test_acc_oneLayer = evaluate_loss(oneLayerNN, torch.nn.MSELoss(), test_loader)
test_acc_twoLayer = evaluate_loss(twoLayerNN, torch.nn.MSELoss(), test_loader)

print("Test loss mean for OneLayerNetwork: ", test_acc_oneLayer)
print("Test loss mean for TwoLayerNetwork: ", test_acc_twoLayer)

Test loss mean for OneLayerNetwork:  0.06895994353409356
Test loss mean for TwoLayerNetwork:  0.062321129022988996
