In [2]:
import duckdb
import pandas as pd
from functools import reduce
import numpy as np
import os


## Connect to duckdb and create dataframe

In [4]:
conn = duckdb.connect('db/db.duckdb', read_only=False)

print(conn.execute('PRAGMA show_tables').fetchdf()['name'].tolist())

['end_year', 'labels', 'num_votes', 'original_title', 'primary_title', 'runtime', 'start_year', 'test_end_year', 'test_num_votes', 'test_original_title', 'test_primary_title', 'test_runtime', 'test_start_year', 'test_user_ratings', 'user_ratings', 'validation_end_year', 'validation_num_votes', 'validation_original_title', 'validation_primary_title', 'validation_runtime', 'validation_start_year', 'validation_user_ratings']


In [5]:
tables = conn.execute('PRAGMA show_tables').fetchdf()['name'].tolist()
frames = []
for i in tables:
    if not i.startswith('test') and not i.startswith('validation'):
        frames.append(conn.execute(f"SELECT * FROM {i}").fetchdf())
merged_df = reduce(lambda  left,right: pd.merge(left,right,on=['tconst'], how='outer'), frames)
print(f"Any NaN values in the df: {merged_df.isnull().values.any()}")

Any NaN values in the df: True


In [6]:
print(len(merged_df))
print(f"Any NaN values in the df: {merged_df.isnull().values.any()}")

7959
Any NaN values in the df: True


Add value to indicate wether the title has been changed

In [7]:
renamed = []
for i in range(len(merged_df)):
    curr_original = merged_df.iloc[i]['original_title']
    curr_primary = merged_df.iloc[i]['primary_title']
    if curr_original != "" and curr_primary != curr_original:
        renamed.append(1)
    else:
        renamed.append(0)
    
merged_df['renamed'] = renamed

## Transform dataframe to ML suited data

Transform dataframe to array

In [8]:
print(merged_df.dtypes)
merged_df = merged_df.drop('original_title', 1)
merged_df = merged_df.drop('primary_title', 1)
merged_df = merged_df.drop('tconst', 1)

tconst              object
end_year             int32
labels                bool
num_votes            int32
original_title      object
primary_title       object
runtime_minutes      int32
start_year           int32
rating             float64
renamed              int64
dtype: object


In [9]:
print(merged_df)

      end_year  labels  num_votes  runtime_minutes  start_year  rating  \
0            0    True       1898               66        1919     NaN   
1            0    True       5376              145        1920     7.0   
2            0    True       5842               97        1921     9.0   
3            0    True       9652               59        1924     8.0   
4            0    True      17887               93        1925     7.0   
...        ...     ...        ...              ...         ...     ...   
7954         0   False      12951               87        2019     3.0   
7955         0   False       2464               77        2020     NaN   
7956         0   False       1719              101        2020     6.0   
7957         0    True       4144              111        2020     8.0   
7958         0    True       3242              103        2020     8.0   

      renamed  
0           1  
1           0  
2           1  
3           0  
4           0  
...       ...  

In [10]:
merged_df_without_ratings = merged_df.drop('rating', 1)
merged_df_with_ratings = merged_df.dropna()

In [11]:
labels_without_ratings = merged_df_without_ratings['labels']
labels_without_ratings = np.array(labels_without_ratings.astype('int').tolist())
merged_df_without_ratings = merged_df_without_ratings.drop('labels', 1)

In [12]:
labels_with_ratings = merged_df_with_ratings['labels']
labels_with_ratings = np.array(labels_with_ratings.astype('int').tolist())
merged_df_with_ratings = merged_df_with_ratings.drop('labels', 1)

In [13]:
print(len(merged_df_without_ratings))
print(len(labels_without_ratings))
print(len(merged_df_with_ratings))
print(len(labels_with_ratings))

7959
7959
5540
5540


In [14]:
full_array_without_ratings = merged_df_without_ratings.to_numpy()
full_array_with_ratings = merged_df_with_ratings.to_numpy()

In [15]:
print(len(full_array_without_ratings))
print(len(full_array_with_ratings))


7959
5540


Train / test split

In [16]:
from sklearn.preprocessing import StandardScaler
standardizer = StandardScaler()
full_array_without_ratings = standardizer.fit_transform(full_array_without_ratings)

split = 6367

X_train_without_ratings = full_array_without_ratings[:split]
y_train_without_ratings = labels_without_ratings[:split]

X_test_without_ratings = full_array_without_ratings[split:]
y_test_without_ratings = labels_without_ratings[split:]

print(X_train_without_ratings.shape)
print(y_train_without_ratings.shape)
print(X_test_without_ratings.shape)
print(y_test_without_ratings.shape)



(6367, 5)
(6367,)
(1592, 5)
(1592,)


In [17]:
full_array_with_ratings = standardizer.fit_transform(full_array_with_ratings)

split = 4432

X_train_with_ratings = full_array_with_ratings[:split]
y_train_with_ratings = labels_with_ratings[:split]

X_test_with_ratings = full_array_with_ratings[split:]
y_test_with_ratings = labels_with_ratings[split:]

print(X_train_with_ratings.shape)
print(y_train_with_ratings.shape)
print(X_test_with_ratings.shape)
print(y_test_with_ratings.shape)

(4432, 6)
(4432,)
(1108, 6)
(1108,)


In [18]:
print(X_train_with_ratings)

[[-0.32995896 -0.24666328  1.56608671  0.19767778  0.35489939 -0.37905528]
 [-0.32995896 -0.2430555  -0.37551326  0.19935816  1.48714391  2.63813762]
 [-0.32995896 -0.21355846 -1.91261324  0.20439929  0.92102165 -0.37905528]
 ...
 [-0.32995896 -0.2882843   1.9705867   0.33714923 -1.90958966  2.63813762]
 [-0.32995896 -0.25399496 -0.49686326  0.33714923 -0.21122287 -0.37905528]
 [-0.32995896  2.53056486  0.51438673  0.33882961  0.92102165  2.63813762]]


## Machine Learning

### Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)

predictions = model.predict(X_test)

In [12]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, predictions)

TN, FP, FN, TP = confusion_matrix(y_test, predictions).ravel()

print('True Positive(TP)  = ', TP)
print('False Positive(FP) = ', FP)
print('True Negative(TN)  = ', TN)
print('False Negative(FN) = ', FN)

accuracy =  (TP+TN) /(TP+FP+TN+FN)

print('Accuracy of the binary classification = {:0.3f}'.format(accuracy))

True Positive(TP)  =  483
False Positive(FP) =  82
True Negative(TN)  =  492
False Negative(FN) =  53
Accuracy of the binary classification = 0.878


### Keras

In [36]:
import keras
from keras.models import Sequential   # importing Sequential model
from keras.layers import Dense        # importing Dense layers
import keras.optimizers
import tensorflow as tf

In [40]:
basic_model = Sequential()
# Adding layers to the model
# First layers: 16 neurons/perceptrons that takes the input and uses 'sigmoid' activation function.
basic_model.add(Dense(units = 16 , activation = 'sigmoid', input_shape = (6,))) 
# Second layer: 1 neuron/perceptron that takes the input from the 1st layers and gives output as 0 or 1.Activation used is 'Hard Sigmoid'
basic_model.add(Dense(1, activation = 'hard_sigmoid'))




In [41]:
sgd = tf.keras.optimizers.SGD(learning_rate=0.5, momentum=0.9, nesterov=True)
basic_model.compile(loss = 'binary_crossentropy', optimizer = 'sgd', metrics = ['accuracy'])

In [42]:
basic_model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])
basic_model.fit(X_train, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7ffb234e7580>

In [117]:
# Test, Loss and accuracy
loss_and_metrics = basic_model.evaluate(X_test, y_test)
print('Loss = ',loss_and_metrics[0])
print('Accuracy = ',loss_and_metrics[1])

Loss =  0.6617915630340576
Accuracy =  0.6757082343101501


### Keras 2 without ratings

In [19]:
import keras
from keras.models import Sequential   # importing Sequential model
from keras.layers import Dense        # importing Dense layers
import keras.optimizers
import tensorflow as tf

In [20]:
print(X_train_without_ratings)

[[-0.33100304 -0.22659344 -1.53571186  0.19837533  2.61118194]
 [-0.33100304 -0.19467699  1.53458386  0.20005195 -0.38296834]
 [-0.33100304 -0.19040066 -0.33091227  0.20172858  2.61118194]
 ...
 [-0.33100304 -0.2334943  -1.18593133  0.33250532 -0.38296834]
 [-0.33100304  0.32970536  0.01886825  0.33921182 -0.38296834]
 [ 3.0227923  -0.24401076 -0.13658976 -3.01906779  2.61118194]]


In [21]:
keras2model_without_ratings = keras.Sequential([
    keras.layers.Flatten(input_shape=(5,)),
    keras.layers.Dense(16, activation=tf.nn.relu),
    keras.layers.Dense(32, activation=tf.nn.relu),
    keras.layers.Dense(16, activation=tf.nn.relu),
    keras.layers.Dense(1, activation=tf.nn.sigmoid),
])

In [22]:
keras2model_without_ratings.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

keras2model_without_ratings.fit(X_train_without_ratings, y_train_without_ratings, epochs=10, batch_size=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fea9131fd90>

In [23]:
# Test, Loss and accuracy
loss_and_metrics = keras2model_without_ratings.evaluate(X_test_without_ratings, y_test_without_ratings)
print('Loss = ',loss_and_metrics[0])
print('Accuracy = ',loss_and_metrics[1])

Loss =  0.6089065074920654
Accuracy =  0.697236180305481


### Keras 2 with ratings

In [24]:
import keras
from keras.models import Sequential   # importing Sequential model
from keras.layers import Dense        # importing Dense layers
import keras.optimizers
import tensorflow as tf

In [25]:
keras2model_with_ratings = keras.Sequential([
    keras.layers.Flatten(input_shape=(6,)),
    keras.layers.Dense(16, activation=tf.nn.relu),
    keras.layers.Dense(32, activation=tf.nn.relu),
    keras.layers.Dense(16, activation=tf.nn.relu),
    keras.layers.Dense(1, activation=tf.nn.sigmoid),
])

In [26]:
keras2model_with_ratings.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

keras2model_with_ratings.fit(X_train_with_ratings, y_train_with_ratings, epochs=10, batch_size=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fea913a2d90>

In [27]:
# Test, Loss and accuracy
loss_and_metrics = keras2model_with_ratings.evaluate(X_test_with_ratings, y_test_with_ratings)
print('Loss = ',loss_and_metrics[0])
print('Accuracy = ',loss_and_metrics[1])

Loss =  0.27319660782814026
Accuracy =  0.8880866169929504


### Random forest classifier

In [13]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
# Instantiate model with 1000 decision trees
rf = RandomForestClassifier(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(X_train, y_train);

In [14]:
# Use the forest's predict method on the test data
predictions = rf.predict(X_test)
# Test, Loss and accuracy


In [15]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

0.8846846846846846

## Predict on test/validation

In [30]:
import duckdb
import pandas as pd
from functools import reduce
import numpy as np
import os
conn = duckdb.connect('db/db.duckdb', read_only=False)

In [31]:
tables = conn.execute('PRAGMA show_tables').fetchdf()['name'].tolist()
frames = []
for i in tables:
    if i.startswith('test'):
        frames.append(conn.execute(f"SELECT * FROM {i}").fetchdf())
test_merged_df = reduce(lambda  left,right: pd.merge(left,right,on=['tconst'], how='outer'), frames)
print(f"Any NaN values in the df: {test_merged_df.isnull().values.any()}")
print(test_merged_df.isnull().sum())

Any NaN values in the df: True
tconst               0
end_year             0
num_votes            0
original_title       0
primary_title        0
runtime_minutes      0
start_year           0
rating             343
dtype: int64


In [32]:
tables = conn.execute('PRAGMA show_tables').fetchdf()['name'].tolist()
frames = []
for i in tables:
    if i.startswith('validation'):
        frames.append(conn.execute(f"SELECT * FROM {i}").fetchdf())
validation_merged_df = reduce(lambda  left,right: pd.merge(left,right,on=['tconst'], how='outer'), frames)
print(f"Any NaN values in the df: {validation_merged_df.isnull().values.any()}")
print(validation_merged_df.isnull().sum())

Any NaN values in the df: True
tconst               0
end_year             0
num_votes            0
original_title       0
primary_title        0
runtime_minutes      0
start_year           0
rating             306
dtype: int64


In [33]:
print(len(test_merged_df))
bb = pd.read_csv(os.getcwd() + "/imdb/test_hidden.csv")
print(len(bb))

1086
1086


In [34]:
print(len(validation_merged_df))
dd = pd.read_csv(os.getcwd() + "/imdb/validation_hidden.csv")
print(len(dd))

955
955


In [35]:
renamed = []
for i in range(len(test_merged_df)):
    curr_original = test_merged_df.iloc[i]['original_title']
    curr_primary = test_merged_df.iloc[i]['primary_title']
    if curr_original != "" and curr_primary != curr_original:
        renamed.append(1)
    else:
        renamed.append(0)
    
test_merged_df['renamed'] = renamed

In [36]:
renamed = []
for i in range(len(validation_merged_df)):
    curr_original = validation_merged_df.iloc[i]['original_title']
    curr_primary = validation_merged_df.iloc[i]['primary_title']
    if curr_original != "" and curr_primary != curr_original:
        renamed.append(1)
    else:
        renamed.append(0)
    
validation_merged_df['renamed'] = renamed

In [37]:
test_merged_df = test_merged_df.drop('original_title', 1)
test_merged_df = test_merged_df.drop('primary_title', 1)
test_merged_df = test_merged_df.drop('tconst', 1)

In [38]:
validation_merged_df = validation_merged_df.drop('original_title', 1)
validation_merged_df = validation_merged_df.drop('primary_title', 1)
validation_merged_df = validation_merged_df.drop('tconst', 1)

In [39]:
test_with_rating_index = np.where(test_merged_df['rating'].notnull())[0]
test_without_rating_index = np.where(test_merged_df['rating'].isnull())[0]

validation_with_rating_index = np.where(validation_merged_df['rating'].notnull())[0]
validation_without_rating_index = np.where(validation_merged_df['rating'].isnull())[0]

print(f"{len(test_with_rating_index)} + {len(test_without_rating_index)} = {len(test_merged_df)}")
print(f"{len(validation_with_rating_index)} + {len(validation_without_rating_index)} = {len(validation_merged_df)}")


test_with_rating = test_merged_df.iloc[test_with_rating_index]
test_without_rating = test_merged_df.iloc[test_without_rating_index]
test_without_rating = test_without_rating.drop('rating', 1)
print(f"{len(test_with_rating)} + {len(test_without_rating)} = {len(test_merged_df)}")

validation_with_rating = validation_merged_df.iloc[validation_with_rating_index]
validation_without_rating = validation_merged_df.iloc[validation_without_rating_index]
validation_without_rating = validation_without_rating.drop('rating', 1)
print(f"{len(validation_with_rating)} + {len(validation_without_rating)} = {len(validation_merged_df)}")

743 + 343 = 1086
649 + 306 = 955
743 + 343 = 1086
649 + 306 = 955


In [41]:
full_test_array_without_ratings = test_without_rating.to_numpy()
full_test_array_with_ratings = test_with_rating.to_numpy()

full_validation_array_without_ratings = validation_without_rating.to_numpy()
full_validation_array_with_ratings = validation_with_rating.to_numpy()

print(len(full_test_array_without_ratings))
print(len(full_test_array_with_ratings))
print(len(full_validation_array_without_ratings))
print(len(full_validation_array_with_ratings))

343
743
306
649


In [42]:
from sklearn.preprocessing import StandardScaler
standardizer = StandardScaler()
full_test_array_without_ratings = standardizer.fit_transform(full_test_array_without_ratings)
full_test_array_with_ratings = standardizer.fit_transform(full_test_array_with_ratings)
full_validation_array_without_ratings = standardizer.fit_transform(full_validation_array_without_ratings)
full_validation_array_with_ratings = standardizer.fit_transform(full_validation_array_with_ratings)


In [64]:
test_predictions_without_ratings = keras2model_without_ratings.predict(full_test_array_without_ratings)
test_predictions_without_ratings = list(map(lambda x: False if x<0.5 else True, test_predictions_without_ratings))

test_predictions_with_ratings = keras2model_with_ratings.predict(full_test_array_with_ratings)
test_predictions_with_ratings = list(map(lambda x: False if x<0.5 else True, test_predictions_with_ratings))


validation_predictions_without_ratings = keras2model_without_ratings.predict(full_validation_array_without_ratings)
validation_predictions_without_ratings = list(map(lambda x: False if x<0.5 else True, validation_predictions_without_ratings))


validation_predictions_with_ratings = keras2model_with_ratings.predict(full_validation_array_with_ratings)
validation_predictions_with_ratings = list(map(lambda x: False if x<0.5 else True, validation_predictions_with_ratings))


print(len(test_predictions_without_ratings))
print(len(test_without_rating_index))
print(len(test_predictions_with_ratings))
print(len(test_with_rating_index))
print(len(validation_predictions_without_ratings))
print(len(validation_without_rating_index))
print(len(validation_predictions_with_ratings))
print(len(validation_with_rating_index))


343
343
743
743
306
306
649
649


In [65]:
test_predictions_without_ratings_df = pd.DataFrame(test_predictions_without_ratings, index=test_without_rating_index)
test_predictions_with_ratings_df = pd.DataFrame(test_predictions_with_ratings, index=test_with_rating_index)

validation_predictions_without_ratings_df = pd.DataFrame(validation_predictions_without_ratings, index=validation_without_rating_index)
validation_predictions_with_ratings_df = pd.DataFrame(validation_predictions_with_ratings, index=validation_with_rating_index)


final_test_predictions = pd.concat([test_predictions_without_ratings_df, test_predictions_with_ratings_df], axis=0).sort_index()[0].tolist()
final_validation_predictions = pd.concat([validation_predictions_without_ratings_df, validation_predictions_with_ratings_df], axis=0).sort_index()[0].tolist()

print(len(final_test_predictions))
print(len(final_validation_predictions))




1086
955


In [69]:

validation_merged_df['label'] = final_validation_predictions
print(validation_merged_df['label'].value_counts())

test_merged_df['label'] = final_test_predictions
print(test_merged_df['label'].value_counts())


True     498
False    457
Name: label, dtype: int64
True     549
False    537
Name: label, dtype: int64


In [68]:
with open('test_predictions4.txt', 'w') as f:
    for item in final_test_predictions:
        f.write("%s\n" % item)

In [67]:
with open('validation_predictions4.txt', 'w') as f:
    for item in final_validation_predictions:
        f.write("%s\n" % item)