In [37]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[K     |████████████████████████████████| 981 kB 5.9 MB/s eta 0:00:01
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25ldone
[?25h  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993242 sha256=d755e201b825a2b91d5f18e285a402c9e996a669d3d52bb40ea4592fb2443986
  Stored in directory: /Users/casbertrams/Library/Caches/pip/wheels/13/c7/b0/79f66658626032e78fc1a83103690ef6797d551cb22e56e734
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [91]:
import duckdb
import pandas as pd
from functools import reduce
import numpy as np


In [2]:
conn = duckdb.connect('db/db.duckdb', read_only=False)
print(conn.execute('PRAGMA show_tables').fetchdf()['name'].tolist())


['end_year', 'labels', 'num_votes', 'original_title', 'primary_title', 'runtime', 'start_year']


In [7]:
print(conn.execute('SELECT * FROM original_title').fetchdf())

         tconst                  original_title
0     tt0010600                       Die Puppe
1     tt0011841                   Way Down East
2     tt0012494                    Der mude Tod
3     tt0015163                   The Navigator
4     tt0016220        The Phantom of the Opera
...         ...                             ...
7954  tt9625664                                
7955  tt9741310                           Slaxx
7956  tt9742392                         Kindred
7957  tt9850386                                
7958  tt9911196  De beentjes van Sint-Hildegard

[7959 rows x 2 columns]


In [92]:
tables = conn.execute('PRAGMA show_tables').fetchdf()['name'].tolist()
frames = []
for i in tables:
    frames.append(conn.execute(f"SELECT * FROM {i}").fetchdf())
merged_df = reduce(lambda  left,right: pd.merge(left,right,on=['tconst'], how='outer'), frames)
print(f"Any NaN values in the df: {merged_df.isnull().values.any()}")

labels = merged_df['labels']
labels = np.array(labels.astype('int').tolist())
merged_df = merged_df.drop('labels', 1)

Any NaN values in the df: False


Add value to indicate wether the title has been changed

In [93]:
renamed = []
for i in range(len(merged_df)):
    curr_original = merged_df.iloc[i]['original_title']
    curr_primary = merged_df.iloc[i]['primary_title']
    if curr_original != "" and curr_primary != curr_original:
        renamed.append(1)
    else:
        renamed.append(0)
    
merged_df['renamed'] = renamed

Transform dataframe to array

In [94]:
print(merged_df.dtypes)
merged_df = merged_df.drop('original_title', 1)
merged_df = merged_df.drop('primary_title', 1)
merged_df = merged_df.drop('tconst', 1)
full_array = merged_df.to_numpy()

tconst             object
end_year            int32
num_votes           int32
original_title     object
primary_title      object
runtime_minutes     int32
start_year          int32
renamed             int64
dtype: object


In [95]:
print(full_array)

[[   0 1898   66 1919    1]
 [   0 5376  145 1920    0]
 [   0 5842   97 1921    1]
 ...
 [   0 1719  101 2020    0]
 [   0 4144  111 2020    0]
 [   0 3242  103 2020    1]]


Train / test split

In [102]:
from sklearn.preprocessing import StandardScaler
standardizer = StandardScaler()
full_array = standardizer.fit_transform(full_array)

X_train = full_array[:6300]
y_train = labels[:6300]

X_test = full_array[6300:]
y_test = labels[6300:]

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)



(6300, 5)
(6300,)
(1659, 5)
(1659,)


### Logistic Regression

In [104]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)

predictions = model.predict(X_test)

LogisticRegression()

In [106]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, predictions)

TN, FP, FN, TP = confusion_matrix(y_test, predictions).ravel()

print('True Positive(TP)  = ', TP)
print('False Positive(FP) = ', FP)
print('True Negative(TN)  = ', TN)
print('False Negative(FN) = ', FN)

accuracy =  (TP+TN) /(TP+FP+TN+FN)

print('Accuracy of the binary classification = {:0.3f}'.format(accuracy))

True Positive(TP)  =  465
False Positive(FP) =  200
True Negative(TN)  =  696
False Negative(FN) =  298
Accuracy of the binary classification = 0.700


### Keras

In [108]:
import keras
from keras.models import Sequential   # importing Sequential model
from keras.layers import Dense        # importing Dense layers
import keras.optimizers
import tensorflow as tf

In [113]:
basic_model = Sequential()
# Adding layers to the model
# First layers: 16 neurons/perceptrons that takes the input and uses 'sigmoid' activation function.
basic_model.add(Dense(units = 16 , activation = 'sigmoid', input_shape = (5,))) 
# Second layer: 1 neuron/perceptron that takes the input from the 1st layers and gives output as 0 or 1.Activation used is 'Hard Sigmoid'
basic_model.add(Dense(1, activation = 'hard_sigmoid'))




In [114]:
sgd = tf.keras.optimizers.SGD(learning_rate=0.5, momentum=0.9, nesterov=True)
basic_model.compile(loss = 'binary_crossentropy', optimizer = 'sgd', metrics = ['accuracy'])

In [116]:
basic_model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])
basic_model.fit(X_train, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x7f8f81498e20>

In [117]:
# Test, Loss and accuracy
loss_and_metrics = basic_model.evaluate(X_test, y_test)
print('Loss = ',loss_and_metrics[0])
print('Accuracy = ',loss_and_metrics[1])

Loss =  0.6617915630340576
Accuracy =  0.6757082343101501


### Keras 2

In [119]:
import keras
from keras.models import Sequential   # importing Sequential model
from keras.layers import Dense        # importing Dense layers
import keras.optimizers
import tensorflow as tf

In [124]:
model = keras.Sequential([
    keras.layers.Flatten(input_shape=(5,)),
    keras.layers.Dense(16, activation=tf.nn.relu),
    keras.layers.Dense(32, activation=tf.nn.relu),
    keras.layers.Dense(16, activation=tf.nn.relu),
    keras.layers.Dense(1, activation=tf.nn.sigmoid),
])

In [125]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(X_train, y_train, epochs=10, batch_size=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f8fa188ee80>

In [131]:
# Test, Loss and accuracy
loss_and_metrics = model.evaluate(X_test, y_test)
print('Loss = ',loss_and_metrics[0])
print('Accuracy = ',loss_and_metrics[1])

Loss =  0.5748496055603027
Accuracy =  0.705846905708313


In [133]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
# Instantiate model with 1000 decision trees
rf = RandomForestClassifier(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(X_train, y_train);

In [134]:
# Use the forest's predict method on the test data
predictions = rf.predict(X_test)
# Test, Loss and accuracy
print(predictions)



[1 0 0 ... 0 0 0]
