In [116]:
import duckdb
import pandas as pd
from functools import reduce
import numpy as np
import os
import math


In [104]:
conn = duckdb.connect('db/db.duckdb', read_only=False)


In [105]:
print(conn.execute('PRAGMA show_tables').fetchdf()['name'].tolist())


['end_year_test', 'end_year_train', 'end_year_validation', 'labels_train', 'movielens_ratings_test', 'movielens_ratings_train', 'movielens_ratings_validation', 'num_votes_test', 'num_votes_train', 'num_votes_validation', 'original_title_test', 'original_title_train', 'original_title_validation', 'primary_title_test', 'primary_title_train', 'primary_title_validation', 'runtime_minutes_test', 'runtime_minutes_train', 'runtime_minutes_validation', 'start_year_test', 'start_year_train', 'start_year_validation', 'test_hidden', 'tmbdb_ratings_test', 'tmbdb_ratings_train', 'tmbdb_ratings_validation', 'tmdb_ratings_test', 'tmdb_ratings_train', 'tmdb_ratings_validation', 'train1', 'train2', 'train3', 'train4', 'train5', 'train6', 'train7', 'train8', 'user_ratings_test', 'user_ratings_train', 'user_ratings_validation', 'validation_hidden']


In [107]:
val = "tmdb_ratings"
print(conn.execute(f'SELECT * FROM {val}_train').fetchdf())
print(conn.execute(f'SELECT * FROM {val}_test').fetchdf())
print(conn.execute(f'SELECT * FROM {val}_validation').fetchdf())



         tconst  tmdb_ratings
0     tt0010600           7.3
1     tt0011841           7.0
2     tt0012494           7.5
3     tt0015163           7.4
4     tt0016220           7.2
...         ...           ...
7920  tt9625664           5.7
7921  tt9741310           5.8
7922  tt9742392           5.9
7923  tt9850386           8.0
7924  tt9911196           8.0

[7925 rows x 2 columns]
         tconst  tmdb_ratings
0     tt0014972           7.5
1     tt0015016           6.7
2     tt0015174           7.6
3     tt0015214           6.8
4     tt0015863           7.0
...         ...           ...
1076  tt9430698           7.9
1077  tt9441638           5.3
1078  tt9495690           4.8
1079  tt9519642           6.8
1080  tt9526826           7.6

[1081 rows x 2 columns]
        tconst  tmdb_ratings
0    tt0003740           7.2
1    tt0008663           7.1
2    tt0010307           7.5
3    tt0014429           7.9
4    tt0015175           7.7
..         ...           ...
943  tt9686154           7.

In [242]:
conn.close()

## Connect to duckdb and create dataframe

In [3]:
conn = duckdb.connect('db/db.duckdb', read_only=False)
# Create dataframe from all tables in the db
print(conn.execute('PRAGMA show_tables').fetchdf()['name'].tolist())


['end_year_test', 'end_year_train', 'end_year_validation', 'labels_train', 'num_votes_test', 'num_votes_train', 'num_votes_validation', 'original_title_test', 'original_title_train', 'original_title_validation', 'primary_title_test', 'primary_title_train', 'primary_title_validation', 'runtime_minutes_test', 'runtime_minutes_train', 'runtime_minutes_validation', 'start_year_test', 'start_year_train', 'start_year_validation', 'user_ratings_test', 'user_ratings_train', 'user_ratings_validation']


In [222]:
merged_df = conn.execute('''SELECT end_year_train.tconst, end_year_train.end_year, labels_train.labels, num_votes_train.num_votes, original_title_train.original_title, primary_title_train.primary_title, runtime_minutes_train.runtime_minutes, start_year_train.start_year, user_ratings_train.user_ratings, tmdb_ratings_train.tmdb_ratings  
                         FROM end_year_train 
                         INNER JOIN labels_train ON labels_train.tconst = end_year_train.tconst
                         INNER JOIN num_votes_train ON end_year_train.tconst = num_votes_train.tconst
                         INNER JOIN original_title_train ON end_year_train.tconst = original_title_train.tconst
                         INNER JOIN primary_title_train ON end_year_train.tconst = primary_title_train.tconst
                         INNER JOIN runtime_minutes_train ON end_year_train.tconst = runtime_minutes_train.tconst
                         INNER JOIN start_year_train ON end_year_train.tconst = start_year_train.tconst
                         FULL OUTER JOIN user_ratings_train ON end_year_train.tconst = user_ratings_train.tconst
                         FULL OUTER JOIN tmdb_ratings_train ON end_year_train.tconst = tmdb_ratings_train.tconst
                         ''').fetchdf()

tconst_order = conn.execute('SELECT tconst FROM end_year_train').fetchdf()
tconst_order['order'] = range(len(tconst_order))
merged_df = merged_df.merge(tconst_order, on='tconst').sort_values(by=['order']).drop('order', axis=1)
print(f"Any NaN values in the df: {merged_df.isnull().values.any()}")

Any NaN values in the df: True


In [223]:
print(merged_df)

         tconst  end_year  labels  num_votes                  original_title  \
934   tt0010600         0    True       1898                       Die Puppe   
881   tt0011841         0    True       5376                   Way Down East   
0     tt0012494         0    True       5842                    Der mude Tod   
1     tt0015163         0    True       9652                   The Navigator   
2     tt0016220         0    True      17887        The Phantom of the Opera   
...         ...       ...     ...        ...                             ...   
7693  tt9625664         0   False      12951                                   
7953  tt9741310         0   False       2464                           Slaxx   
7694  tt9742392         0   False       1719                         Kindred   
7695  tt9850386         0    True       4144                                   
7696  tt9911196         0    True       3242  De beentjes van Sint-Hildegard   

                                      p

Add value to indicate wether the title has been changed

In [224]:
renamed = []
for i in range(len(merged_df)):
    curr_original = merged_df.iloc[i]['original_title']
    curr_primary = merged_df.iloc[i]['primary_title']
    if curr_original != "" and curr_primary != curr_original:
        renamed.append(1)
    else:
        renamed.append(0)
    
merged_df['renamed'] = renamed

In [225]:
print(merged_df['user_ratings'].isna().sum())
print(merged_df['tmdb_ratings'].isna().sum())

2419
34


In [220]:
count = 0
for i in range(len(merged_df)):
    curr = merged_df.iloc[i]
    if math.isnan(curr['tmdb_ratings']):
#         merged_df.iloc[i]['user_ratings'] = curr['tmdb_ratings']
        print(curr['user_ratings'])
        merged_df.at[i,'tmdb_ratings'] = curr['user_ratings']
        count += 1
print(count)
# merged_df = merged_df.drop('tmdb_ratings', 1)

nan
nan
nan
nan
7.5
nan
6.0
nan
nan
nan
nan
8.428572
nan
nan
nan
nan
7.0
nan
8.0
nan
nan
nan
nan
nan
nan
7.8
7.571429
nan
nan
nan
nan
5.7
nan
nan
8.2
nan
nan
nan
nan
7.5
5.133333
9.166667
7.5
7.0
nan
nan
46


In [221]:
print(merged_df['user_ratings'].isna().sum())
print(merged_df['tmdb_ratings'].isna().sum())


2419
66


In [200]:
merged_df['user_ratings'] = merged_df['tmdb_ratings']
merged_df = merged_df.drop('tmdb_ratings', 1)
print(merged_df)

         tconst  end_year  labels  num_votes                  original_title  \
934   tt0010600         0    True       1898                       Die Puppe   
881   tt0011841         0    True       5376                   Way Down East   
0     tt0012494         0    True       5842                    Der mude Tod   
1     tt0015163         0    True       9652                   The Navigator   
2     tt0016220         0    True      17887        The Phantom of the Opera   
...         ...       ...     ...        ...                             ...   
7693  tt9625664         0   False      12951                                   
7953  tt9741310         0   False       2464                           Slaxx   
7694  tt9742392         0   False       1719                         Kindred   
7695  tt9850386         0    True       4144                                   
7696  tt9911196         0    True       3242  De beentjes van Sint-Hildegard   

                                      p

  merged_df = merged_df.drop('tmdb_ratings', 1)


## Transform dataframe to ML suited data

Transform dataframe to array

In [201]:
print(merged_df.dtypes)
merged_df = merged_df.drop('original_title', 1)
merged_df = merged_df.drop('primary_title', 1)
merged_df = merged_df.drop('tconst', 1)

tconst              object
end_year             int32
labels                bool
num_votes            int32
original_title      object
primary_title       object
runtime_minutes      int32
start_year           int32
user_ratings       float32
renamed              int64
dtype: object


  merged_df = merged_df.drop('original_title', 1)
  merged_df = merged_df.drop('primary_title', 1)
  merged_df = merged_df.drop('tconst', 1)


In [202]:
print(merged_df)

      end_year  labels  num_votes  runtime_minutes  start_year  user_ratings  \
934          0    True       1898               66        1919           7.3   
881          0    True       5376              145        1920           7.0   
0            0    True       5842               97        1921           7.5   
1            0    True       9652               59        1924           7.4   
2            0    True      17887               93        1925           7.2   
...        ...     ...        ...              ...         ...           ...   
7693         0   False      12951               87        2019           5.7   
7953         0   False       2464               77        2020           5.8   
7694         0   False       1719              101        2020           5.9   
7695         0    True       4144              111        2020           8.0   
7696         0    True       3242              103        2020           8.0   

      renamed  
934         1  
881    

In [203]:
merged_df_without_ratings = merged_df.drop('user_ratings', 1)
merged_df_with_ratings = merged_df.dropna()

  merged_df_without_ratings = merged_df.drop('user_ratings', 1)


In [204]:
labels_without_ratings = merged_df_without_ratings['labels']
labels_without_ratings = np.array(labels_without_ratings.astype('int').tolist())
merged_df_without_ratings = merged_df_without_ratings.drop('labels', 1)

  merged_df_without_ratings = merged_df_without_ratings.drop('labels', 1)


In [205]:
labels_with_ratings = merged_df_with_ratings['labels']
labels_with_ratings = np.array(labels_with_ratings.astype('int').tolist())
merged_df_with_ratings = merged_df_with_ratings.drop('labels', 1)

  merged_df_with_ratings = merged_df_with_ratings.drop('labels', 1)


In [206]:
print(len(merged_df_without_ratings))
print(len(labels_without_ratings))
print(len(merged_df_with_ratings))
print(len(labels_with_ratings))

7959
7959
7925
7925


In [207]:
full_array_without_ratings = merged_df_without_ratings.to_numpy()
full_array_with_ratings = merged_df_with_ratings.to_numpy()

In [208]:
print(len(full_array_without_ratings))
print(len(full_array_with_ratings))


7959
7925


Train / test split

In [209]:
from sklearn.preprocessing import StandardScaler
standardizer = StandardScaler()
full_array_without_ratings = standardizer.fit_transform(full_array_without_ratings)

split = int(len(full_array_without_ratings) * 0.8)

X_train_without_ratings = full_array_without_ratings[:split]
y_train_without_ratings = labels_without_ratings[:split]

X_test_without_ratings = full_array_without_ratings[split:]
y_test_without_ratings = labels_without_ratings[split:]

print(X_train_without_ratings.shape)
print(y_train_without_ratings.shape)
print(X_test_without_ratings.shape)
print(y_test_without_ratings.shape)



(6367, 5)
(6367,)
(1592, 5)
(1592,)


In [210]:
full_array_with_ratings = standardizer.fit_transform(full_array_with_ratings)

split = int(len(full_array_with_ratings) * 0.8)

X_train_with_ratings = full_array_with_ratings[:split]
y_train_with_ratings = labels_with_ratings[:split]

X_test_with_ratings = full_array_with_ratings[split:]
y_test_with_ratings = labels_with_ratings[split:]

print(X_train_with_ratings.shape)
print(y_train_with_ratings.shape)
print(X_test_with_ratings.shape)
print(y_test_with_ratings.shape)

(6340, 6)
(6340,)
(1585, 6)
(1585,)


In [211]:
print(X_train_with_ratings)

[[-0.33132149 -0.22709497 -1.54257308  0.19892365  0.86680574  2.60477874]
 [-0.33132149 -0.19524328  1.53868015  0.20059905  0.63952151 -0.38390977]
 [-0.33132149 -0.19097564 -0.33347372  0.20227445  1.01832833  2.60477874]
 ...
 [-0.33132149 -0.23398182 -1.19154424  0.33295556  0.48799892 -0.38390977]
 [-0.33132149  0.32807492  0.01755513  0.33965715 -0.04233084 -0.38390977]
 [ 3.0199649  -0.24447694 -0.13845769 -3.01616677  0.7152828   2.60477874]]


## Machine Learning

### Logistic Regression

In [212]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)

predictions = model.predict(X_test)

NameError: name 'X_train' is not defined

In [12]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, predictions)

TN, FP, FN, TP = confusion_matrix(y_test, predictions).ravel()

print('True Positive(TP)  = ', TP)
print('False Positive(FP) = ', FP)
print('True Negative(TN)  = ', TN)
print('False Negative(FN) = ', FN)

accuracy =  (TP+TN) /(TP+FP+TN+FN)

print('Accuracy of the binary classification = {:0.3f}'.format(accuracy))

True Positive(TP)  =  483
False Positive(FP) =  82
True Negative(TN)  =  492
False Negative(FN) =  53
Accuracy of the binary classification = 0.878


### Keras

In [36]:
import keras
from keras.models import Sequential   # importing Sequential model
from keras.layers import Dense        # importing Dense layers
import keras.optimizers
import tensorflow as tf

In [40]:
basic_model = Sequential()
# Adding layers to the model
# First layers: 16 neurons/perceptrons that takes the input and uses 'sigmoid' activation function.
basic_model.add(Dense(units = 16 , activation = 'sigmoid', input_shape = (6,))) 
# Second layer: 1 neuron/perceptron that takes the input from the 1st layers and gives output as 0 or 1.Activation used is 'Hard Sigmoid'
basic_model.add(Dense(1, activation = 'hard_sigmoid'))




In [41]:
sgd = tf.keras.optimizers.SGD(learning_rate=0.5, momentum=0.9, nesterov=True)
basic_model.compile(loss = 'binary_crossentropy', optimizer = 'sgd', metrics = ['accuracy'])

In [42]:
basic_model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])
basic_model.fit(X_train, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7ffb234e7580>

In [117]:
# Test, Loss and accuracy
loss_and_metrics = basic_model.evaluate(X_test, y_test)
print('Loss = ',loss_and_metrics[0])
print('Accuracy = ',loss_and_metrics[1])

Loss =  0.6617915630340576
Accuracy =  0.6757082343101501


### Keras 2 without ratings

In [179]:
import keras
from keras.models import Sequential   # importing Sequential model
from keras.layers import Dense        # importing Dense layers
import keras.optimizers
import tensorflow as tf

In [180]:
print(X_train_without_ratings)

[[-0.33100304 -0.22659344 -1.53571186  0.19837533  2.61118194]
 [-0.33100304 -0.19467699  1.53458386  0.20005195 -0.38296834]
 [-0.33100304 -0.19040066 -0.33091227  0.20172858  2.61118194]
 ...
 [-0.33100304 -0.2334943  -1.18593133  0.33250532 -0.38296834]
 [-0.33100304  0.32970536  0.01886825  0.33921182 -0.38296834]
 [ 3.0227923  -0.24401076 -0.13658976 -3.01906779  2.61118194]]


In [181]:
keras2model_without_ratings = keras.Sequential([
    keras.layers.Flatten(input_shape=(5,)),
    keras.layers.Dense(16, activation=tf.nn.relu),
    keras.layers.Dense(32, activation=tf.nn.relu),
    keras.layers.Dense(16, activation=tf.nn.relu),
    keras.layers.Dense(1, activation=tf.nn.sigmoid),
])

In [182]:
keras2model_without_ratings.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

keras2model_without_ratings.fit(X_train_without_ratings, y_train_without_ratings, epochs=10, batch_size=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x206f79d5450>

In [183]:
# Test, Loss and accuracy
loss_and_metrics = keras2model_without_ratings.evaluate(X_test_without_ratings, y_test_without_ratings)
print('Loss without ratings = ',loss_and_metrics[0])
print('Accuracy without ratings = ',loss_and_metrics[1])

Loss without ratings =  0.5709401965141296
Accuracy without ratings =  0.7047738432884216


### Keras 2 with ratings

In [213]:
import keras
from keras.models import Sequential   # importing Sequential model
from keras.layers import Dense        # importing Dense layers
import keras.optimizers
import tensorflow as tf

In [214]:
keras2model_with_ratings = keras.Sequential([
    keras.layers.Flatten(input_shape=(6,)),
    keras.layers.Dense(16, activation=tf.nn.relu),
    keras.layers.Dense(32, activation=tf.nn.relu),
    keras.layers.Dense(16, activation=tf.nn.relu),
    keras.layers.Dense(1, activation=tf.nn.sigmoid),
])

In [215]:
keras2model_with_ratings.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

keras2model_with_ratings.fit(X_train_with_ratings, y_train_with_ratings, epochs=10, batch_size=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x206fbfc1ed0>

In [216]:
# Test, Loss and accuracy
loss_and_metrics = keras2model_with_ratings.evaluate(X_test_with_ratings, y_test_with_ratings)
print('Loss = ',loss_and_metrics[0])
print('Accuracy = ',loss_and_metrics[1])

Loss =  0.2246904969215393
Accuracy =  0.918612003326416


### Random forest classifier

In [13]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
# Instantiate model with 1000 decision trees
rf = RandomForestClassifier(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(X_train, y_train);

In [14]:
# Use the forest's predict method on the test data
predictions = rf.predict(X_test)
# Test, Loss and accuracy


In [15]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

0.8846846846846846

## Predict on test/validation

In [226]:
import duckdb
import pandas as pd
from functools import reduce
import numpy as np
import os
# conn = duckdb.connect('db/db.duckdb', read_only=False)

In [227]:
# tables = conn.execute('PRAGMA show_tables').fetchdf()['name'].tolist()
# frames = []
# for i in tables:
#     if i.startswith('test'):
#         frames.append(conn.execute(f"SELECT * FROM {i}").fetchdf())
# test_merged_df = reduce(lambda  left,right: pd.merge(left,right,on=['tconst'], how='outer'), frames)
# print(f"Any NaN values in the df: {test_merged_df.isnull().values.any()}")
# print(test_merged_df.isnull().sum())

In [228]:
test_merged_df = conn.execute('''SELECT end_year_test.tconst, end_year_test.end_year, num_votes_test.num_votes, original_title_test.original_title, primary_title_test.primary_title, runtime_minutes_test.runtime_minutes, start_year_test.start_year, user_ratings_test.user_ratings  
                         FROM end_year_test 
                         INNER JOIN num_votes_test ON end_year_test.tconst = num_votes_test.tconst
                         INNER JOIN original_title_test ON end_year_test.tconst = original_title_test.tconst
                         INNER JOIN primary_title_test ON end_year_test.tconst = primary_title_test.tconst
                         INNER JOIN runtime_minutes_test ON end_year_test.tconst = runtime_minutes_test.tconst
                         INNER JOIN start_year_test ON end_year_test.tconst = start_year_test.tconst
                         FULL OUTER JOIN user_ratings_test ON end_year_test.tconst = user_ratings_test.tconst
                         FULL OUTER JOIN tmdb_ratings_test ON end_year_test.tconst = tmdb_ratings_test.tconst
                         ''').fetchdf()

tconst_order = conn.execute('SELECT tconst FROM end_year_test').fetchdf()
tconst_order['order'] = range(len(tconst_order))
test_merged_df = test_merged_df.merge(tconst_order, on='tconst').sort_values(by=['order']).drop('order', axis=1)
print(f"Any NaN values in the df: {test_merged_df.isnull().values.any()}")
print(test_merged_df.isnull().sum())

Any NaN values in the df: True
tconst               0
end_year             0
num_votes            0
original_title       0
primary_title        0
runtime_minutes      0
start_year           0
user_ratings       343
dtype: int64


In [229]:
test_merged_df['user_ratings'] = merged_df['tmdb_ratings']
test_merged_df = merged_df.drop('tmdb_ratings', 1)

  test_merged_df = merged_df.drop('tmdb_ratings', 1)


In [49]:
# tables = conn.execute('PRAGMA show_tables').fetchdf()['name'].tolist()
# frames = []
# for i in tables:
#     if i.startswith('validation'):
#         frames.append(conn.execute(f"SELECT * FROM {i}").fetchdf())
# validation_merged_df = reduce(lambda  left,right: pd.merge(left,right,on=['tconst'], how='outer'), frames)
# print(f"Any NaN values in the df: {validation_merged_df.isnull().values.any()}")
# print(validation_merged_df.isnull().sum())

In [230]:
validation_merged_df = conn.execute('''SELECT end_year_validation.tconst, end_year_validation.end_year, num_votes_validation.num_votes, original_title_validation.original_title, primary_title_validation.primary_title, runtime_minutes_validation.runtime_minutes, start_year_validation.start_year, user_ratings_validation.user_ratings, tmdb_ratings_validation.tmdb_ratings   
                         FROM end_year_validation 
                         INNER JOIN num_votes_validation ON end_year_validation.tconst = num_votes_validation.tconst
                         INNER JOIN original_title_validation ON end_year_validation.tconst = original_title_validation.tconst
                         INNER JOIN primary_title_validation ON end_year_validation.tconst = primary_title_validation.tconst
                         INNER JOIN runtime_minutes_validation ON end_year_validation.tconst = runtime_minutes_validation.tconst
                         INNER JOIN start_year_validation ON end_year_validation.tconst = start_year_validation.tconst
                         FULL OUTER JOIN user_ratings_validation ON end_year_validation.tconst = user_ratings_validation.tconst
                         FULL OUTER JOIN tmdb_ratings_validation ON end_year_validation.tconst = tmdb_ratings_validation.tconst
                         ''').fetchdf()

tconst_order = conn.execute('SELECT tconst FROM end_year_validation').fetchdf()
tconst_order['order'] = range(len(tconst_order))
validation_merged_df = validation_merged_df.merge(tconst_order, on='tconst').sort_values(by=['order']).drop('order', axis=1)
print(f"Any NaN values in the df: {validation_merged_df.isnull().values.any()}")
print(validation_merged_df.isnull().sum())

Any NaN values in the df: True
tconst               0
end_year             0
num_votes            0
original_title       0
primary_title        0
runtime_minutes      0
start_year           0
user_ratings       306
tmdb_ratings         7
dtype: int64


In [231]:
validation_merged_df['user_ratings'] = merged_df['tmdb_ratings']
validation_merged_df = merged_df.drop('tmdb_ratings', 1)

  validation_merged_df = merged_df.drop('tmdb_ratings', 1)


In [232]:
print(len(test_merged_df))
bb = pd.read_csv(os.getcwd() + "/imdb/test_hidden.csv")
print(len(bb))

7959
1086


In [233]:
print(len(validation_merged_df))
dd = pd.read_csv(os.getcwd() + "/imdb/validation_hidden.csv")
print(len(dd))

7959
955


In [234]:
renamed = []
for i in range(len(test_merged_df)):
    curr_original = test_merged_df.iloc[i]['original_title']
    curr_primary = test_merged_df.iloc[i]['primary_title']
    if curr_original != "" and curr_primary != curr_original:
        renamed.append(1)
    else:
        renamed.append(0)
    
test_merged_df['renamed'] = renamed

In [235]:
renamed = []
for i in range(len(validation_merged_df)):
    curr_original = validation_merged_df.iloc[i]['original_title']
    curr_primary = validation_merged_df.iloc[i]['primary_title']
    if curr_original != "" and curr_primary != curr_original:
        renamed.append(1)
    else:
        renamed.append(0)
    
validation_merged_df['renamed'] = renamed

In [236]:
test_merged_df = test_merged_df.drop('original_title', 1)
test_merged_df = test_merged_df.drop('primary_title', 1)
test_merged_df = test_merged_df.drop('tconst', 1)

  test_merged_df = test_merged_df.drop('original_title', 1)
  test_merged_df = test_merged_df.drop('primary_title', 1)
  test_merged_df = test_merged_df.drop('tconst', 1)


In [237]:
validation_merged_df = validation_merged_df.drop('original_title', 1)
validation_merged_df = validation_merged_df.drop('primary_title', 1)
validation_merged_df = validation_merged_df.drop('tconst', 1)

  validation_merged_df = validation_merged_df.drop('original_title', 1)
  validation_merged_df = validation_merged_df.drop('primary_title', 1)
  validation_merged_df = validation_merged_df.drop('tconst', 1)


In [238]:
test_with_rating_index = np.where(test_merged_df['user_ratings'].notnull())[0]
test_without_rating_index = np.where(test_merged_df['user_ratings'].isnull())[0]

validation_with_rating_index = np.where(validation_merged_df['user_ratings'].notnull())[0]
validation_without_rating_index = np.where(validation_merged_df['user_ratings'].isnull())[0]

print(f"{len(test_with_rating_index)} + {len(test_without_rating_index)} = {len(test_merged_df)}")
print(f"{len(validation_with_rating_index)} + {len(validation_without_rating_index)} = {len(validation_merged_df)}")


test_with_rating = test_merged_df.iloc[test_with_rating_index]
test_without_rating = test_merged_df.iloc[test_without_rating_index]
test_without_rating = test_without_rating.drop('user_ratings', 1)
print(f"{len(test_with_rating)} + {len(test_without_rating)} = {len(test_merged_df)}")

validation_with_rating = validation_merged_df.iloc[validation_with_rating_index]
validation_without_rating = validation_merged_df.iloc[validation_without_rating_index]
validation_without_rating = validation_without_rating.drop('user_ratings', 1)
print(f"{len(validation_with_rating)} + {len(validation_without_rating)} = {len(validation_merged_df)}")

5540 + 2419 = 7959
5540 + 2419 = 7959
5540 + 2419 = 7959
5540 + 2419 = 7959


  test_without_rating = test_without_rating.drop('user_ratings', 1)
  validation_without_rating = validation_without_rating.drop('user_ratings', 1)


In [239]:
full_test_array_without_ratings = test_without_rating.to_numpy()
full_test_array_with_ratings = test_with_rating.to_numpy()

full_validation_array_without_ratings = validation_without_rating.to_numpy()
full_validation_array_with_ratings = validation_with_rating.to_numpy()

print(len(full_test_array_without_ratings))
print(len(full_test_array_with_ratings))
print(len(full_validation_array_without_ratings))
print(len(full_validation_array_with_ratings))

2419
5540
2419
5540


In [240]:
from sklearn.preprocessing import StandardScaler
standardizer = StandardScaler()
full_test_array_without_ratings = standardizer.fit_transform(full_test_array_without_ratings)
full_test_array_with_ratings = standardizer.fit_transform(full_test_array_with_ratings)
full_validation_array_without_ratings = standardizer.fit_transform(full_validation_array_without_ratings)
full_validation_array_with_ratings = standardizer.fit_transform(full_validation_array_with_ratings)


In [241]:
test_predictions_without_ratings = keras2model_without_ratings.predict(full_test_array_without_ratings)
test_predictions_without_ratings = list(map(lambda x: False if x<0.5 else True, test_predictions_without_ratings))

test_predictions_with_ratings = keras2model_with_ratings.predict(full_test_array_with_ratings)
test_predictions_with_ratings = list(map(lambda x: False if x<0.5 else True, test_predictions_with_ratings))


validation_predictions_without_ratings = keras2model_without_ratings.predict(full_validation_array_without_ratings)
validation_predictions_without_ratings = list(map(lambda x: False if x<0.5 else True, validation_predictions_without_ratings))


validation_predictions_with_ratings = keras2model_with_ratings.predict(full_validation_array_with_ratings)
validation_predictions_with_ratings = list(map(lambda x: False if x<0.5 else True, validation_predictions_with_ratings))


print(len(test_predictions_without_ratings))
print(len(test_without_rating_index))
print(len(test_predictions_with_ratings))
print(len(test_with_rating_index))
print(len(validation_predictions_without_ratings))
print(len(validation_without_rating_index))
print(len(validation_predictions_with_ratings))
print(len(validation_with_rating_index))


ValueError: in user code:

    File "C:\Users\casbe\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1801, in predict_function  *
        return step_function(self, iterator)
    File "C:\Users\casbe\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1790, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\casbe\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1783, in run_step  **
        outputs = model.predict_step(data)
    File "C:\Users\casbe\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1751, in predict_step
        return self(x, training=False)
    File "C:\Users\casbe\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\casbe\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\input_spec.py", line 264, in assert_input_compatibility
        raise ValueError(f'Input {input_index} of layer "{layer_name}" is '

    ValueError: Input 0 of layer "sequential_2" is incompatible with the layer: expected shape=(None, 5), found shape=(None, 6)


In [61]:
test_predictions_without_ratings_df = pd.DataFrame(test_predictions_without_ratings, index=test_without_rating_index)
test_predictions_with_ratings_df = pd.DataFrame(test_predictions_with_ratings, index=test_with_rating_index)

validation_predictions_without_ratings_df = pd.DataFrame(validation_predictions_without_ratings, index=validation_without_rating_index)
validation_predictions_with_ratings_df = pd.DataFrame(validation_predictions_with_ratings, index=validation_with_rating_index)


final_test_predictions = pd.concat([test_predictions_without_ratings_df, test_predictions_with_ratings_df], axis=0).sort_index()[0].tolist()
final_validation_predictions = pd.concat([validation_predictions_without_ratings_df, validation_predictions_with_ratings_df], axis=0).sort_index()[0].tolist()

print(len(final_test_predictions))
print(len(final_validation_predictions))




1086
955


In [62]:

validation_merged_df['label'] = final_validation_predictions
print(validation_merged_df['label'].value_counts())

test_merged_df['label'] = final_test_predictions
print(test_merged_df['label'].value_counts())


True     491
False    464
Name: label, dtype: int64
False    562
True     524
Name: label, dtype: int64


In [63]:
num = 6
with open(f'test_predictions{num}.txt', 'w') as f:
    for item in final_test_predictions:
        f.write("%s\n" % item)
with open(f'validation_predictions{num}.txt', 'w') as f:
    for item in final_validation_predictions:
        f.write("%s\n" % item)