In [None]:
!pip install PyHive
!pip install thrift

In [None]:
import os
import time
from getpass import getpass

import matplotlib.pyplot as plt
import pandas as pd
import requests
from google.colab import files
from pyhive import hive
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sqlalchemy import create_engine
from TCLIService.ttypes import TOperationState
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# --- Timbr connection settings (SQLAlchemy over the Hive transport) ---

# Connection protocol: 'http' or 'https'.
protocol = 'https'

# Use 'token' as the user name when connecting with a Timbr token,
# otherwise the regular user name.
user_name = 'token'

# The token value when user_name is 'token', otherwise the user's password.
# Read from the TIMBR_TOKEN environment variable, falling back to an interactive
# prompt, so the secret is never stored in the notebook.
# NOTE(review): a token used to be hard-coded in this cell; it should be rotated.
user_pass = os.environ.get('TIMBR_TOKEN') or getpass('Timbr token: ')

# IP / hostname of the Timbr server (not necessarily the hostname of the Timbr platform).
hostname = 'azure-env2.timbr.ai'

# Server port. Timbr's default port is 11000; this environment is reached on 443.
port = '443'

# Ontology / knowledge graph to connect to.
ontology = 'csv_tables'

engine = create_engine(f"hive+{protocol}://{user_name}@{ontology}:{user_pass}@{hostname}:{port}")
conn = engine.connect()

# Alternative metadata query: "SHOW CONCEPTS"
query = """

SELECT

       `review`, `person_id_person_film[person_film].rating` AS "rating"

FROM `dtimbr`.`viewing_with_ids`

WHERE `review` IS NOT NULL AND `person_id_person_film[person_film].rating` IS NOT NULL

AND `person_id_person_film[person_film].films_id` = `films_id`

LIMIT 100

"""

dbapi_conn = engine.raw_connection()
cursor = dbapi_conn.cursor()
try:
    # Submit the query asynchronously, then poll until it leaves the
    # initialized/running states. Sleep between polls so the loop does not
    # hammer the server (and the local CPU) while waiting.
    cursor.execute(query, async_=True)
    status = cursor.poll().operationState
    while status in (TOperationState.INITIALIZED_STATE, TOperationState.RUNNING_STATE):
        time.sleep(0.5)
        status = cursor.poll().operationState
    print(status)

    # Fetch size, set through the public DB-API attribute instead of the private field.
    cursor.arraysize = 100
    results = cursor.fetchall()

    # Column names come from the DB-API cursor description (first item of each entry).
    column_names = [col[0] for col in (cursor.description or [])]
finally:
    # Release the server-side operation and the raw connection even if the query fails.
    cursor.close()
    dbapi_conn.close()

df = pd.DataFrame(results, columns=column_names or None)
df.head()

In [None]:
def executeQuery(url, ontology, token, query, verify=True, timeout=60):
    """Run a SQL query through the Timbr REST API and return the rows as a DataFrame.

    Args:
        url: Base URL of the Timbr environment; a trailing "/" is appended if missing.
        ontology: Name of the ontology (knowledge graph) to query.
        token: Timbr API token, sent in the ``x-api-key`` header.
        query: SQL text to execute.
        verify: Forwarded to ``requests.post`` as the TLS verification setting.
            Defaults to True so the token is not sent over an unverified
            connection; pass False (or a CA bundle path) only for environments
            that use self-signed certificates.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        pandas.DataFrame built from the ``data`` field of the JSON response.

    Raises:
        Exception: if the response's ``status`` field is not ``'success'``.
        requests.HTTPError: if the body is not JSON and the HTTP status is an error.
        ValueError: if the body is not JSON but the HTTP status is not an error.
    """
    if not url.endswith("/"):
        url += "/"

    post_data = {'ontology_name': ontology, 'query': query}
    headers = {'Content-Type': 'application/json', 'x-api-key': token}

    response = requests.post(
        url + "timbr/api/query/",
        headers=headers,
        json=post_data,
        verify=verify,
        timeout=timeout,
    )

    try:
        response_data = response.json()
    except ValueError:
        # Body is not JSON: surface the HTTP error if there is one, otherwise
        # re-raise the decoding error.
        response.raise_for_status()
        raise

    if response_data.get('status') == 'success':
        return pd.DataFrame(response_data['data'])

    # `data` is not guaranteed to be a string here, so convert it explicitly
    # instead of concatenating it directly.
    raise Exception("Error in request: " + str(response_data.get('data')))

In [None]:
# Base URL of the Timbr environment: http://<hostname>:<port> or https://<hostname>:<port>,
# for example https://azure-env2.timbr.ai/
url = "https://azure-env2.timbr.ai"

# Ontology name, for example: timbr_imdb
ontology = "csv_tables"

# User token (shown in the user profile box on the Timbr homepage, or returned by
# the query "show token" in the SQL Editor).
# Read from the TIMBR_TOKEN environment variable, falling back to an interactive
# prompt, so the secret is never stored in the notebook.
# NOTE(review): a token used to be hard-coded in this cell; it should be rotated.
token = os.environ.get("TIMBR_TOKEN") or getpass("Timbr token: ")

# The SQL query to run
query = "SHOW CONCEPT_RELATIONSHIPS"

response = executeQuery(url, ontology, token, query)
response

In [None]:
# Fetch up to 100,000 (review, rating) pairs through executeQuery; the WHERE
# clause excludes rows whose review or rating is NULL.
query = """SELECT
       `review`, `person_id_person_film[person_film].rating` AS "rating"
FROM `dtimbr`.`viewing_with_ids`
WHERE `review` IS NOT NULL AND `person_id_person_film[person_film].rating` IS NOT NULL
AND `person_id_person_film[person_film].films_id` = `films_id`
LIMIT 100000


"""
# Result is a DataFrame built from the API response.
data_nlp = executeQuery(url, ontology, token, query)

data_nlp

In [None]:
# Prepare the data.
# Take an explicit copy: later cells add columns to review_data, and writing to a
# slice of data_nlp triggers pandas' SettingWithCopyWarning.
review_data = data_nlp[['review']].copy()

# Convert the review text into sequences of integer word ids
tokenizer = Tokenizer()
tokenizer.fit_on_texts(review_data["review"])
X = tokenizer.texts_to_sequences(review_data["review"])

# Pad every sequence to the length of the longest one so they all share a shape
max_len = max(len(seq) for seq in X)
X = pad_sequences(X, maxlen=max_len)

# Target: each distinct rating is mapped to an integer class id (0, 1, 2, ...)
# by LabelEncoder, then divided by 10 to give a fractional score.
le = LabelEncoder()
data_nlp["sentiment"] = le.fit_transform(data_nlp["rating"])
y = data_nlp["sentiment"] / 10.0

# The model below ends in a sigmoid trained with binary cross-entropy, which needs
# targets inside [0, 1]; that only holds with at most 11 distinct rating values.
assert y.between(0, 1).all(), "scaled target left [0, 1]: more than 11 distinct ratings"



In [None]:
# Bucket the first row of prediction.T into 10 bins labelled 1..10 and store the
# label per review.
# NOTE(review): `prediction` is not assigned anywhere in the visible notebook (later
# cells assign `predictions`), and this cell sits before the cell that builds and
# trains `model`. It presumably relies on leftover kernel state and would fail
# after Restart & Run All. Confirm the intended source (possibly model.predict(X))
# and move this cell after training.
review_data['prediction'] = pd.cut(pd.Series(prediction.T[0]),10, labels=[1,2,3,4,5,6,7,8,9,10])

In [None]:
# Copy the original rating next to each review (assignment aligns on the row index)
# so it can be compared with the binned prediction.
review_data['ratings'] = data_nlp['rating']

In [None]:
# Summary statistics of the actual ratings within each predicted bin.
review_data['ratings'].groupby(review_data['prediction']).describe()

In [None]:
# Display the review frame with the columns added by the cells above.
review_data


In [None]:
# Split the data into training and testing sets (80/20, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the RNN model: embedding -> LSTM -> one sigmoid unit, so the output lies
# in [0, 1] like the scaled rating target.
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=32, input_length=max_len))
model.add(LSTM(units=32))
model.add(Dense(units=1, activation='sigmoid'))

# Compile the model.
# The target is a fractional score (class id / 10), not a 0/1 label, so Keras'
# 'accuracy' (exact match against a thresholded prediction) says nothing useful
# here. Track the mean absolute error instead; the loss is unchanged.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['mae'])

# Fit the model to the training data
model.fit(X_train, y_train, epochs=20, batch_size=16)

# Evaluate the performance of the model on the testing data
loss, mae = model.evaluate(X_test, y_test, verbose=0)
print(f"Test loss: {loss:.3f}")
print(f"Test MAE: {mae:.3f}")

In [None]:
# Persist the trained model under /content; the file name records the run
# settings used above (epochs=20, batch_size=16).
model.save('/content/letterboxd_rnn_split_e20_b16.h5')

In [None]:
# Download the saved model through the browser. Use the same absolute path the
# model was saved to, so this does not depend on the current working directory.
files.download('/content/letterboxd_rnn_split_e20_b16.h5')

In [None]:
# Reload the model from the file written by model.save above. The path must
# include the '.h5' extension; without it the path does not point at the saved file.
loaded_model = load_model('/content/letterboxd_rnn_split_e20_b16.h5')

In [None]:
# Show the reloaded model object to confirm that loading succeeded.
loaded_model

In [None]:
# Raw model outputs for the held-out test split (displayed only, not stored).
model.predict(X_test)

In [None]:
# Model outputs for the training split; used below for the quantiles and the
# training scatter plot.
predictions = model.predict(X_train)


In [None]:
# Deciles (10% .. 90%) of the training-set predictions.
decile_levels = [step / 10 for step in range(1, 10)]
qs = pd.Series(predictions[:, 0]).quantile(decile_levels)

In [None]:
# Display the prediction quantiles computed above.
qs

In [None]:
# Rows of review_data that belong to the test split, selected by the index
# labels of y_test.
review_data.loc[y_test.index]

In [None]:
# Test-split predictions flattened to 1-D (the model has a single output unit).
test_p = model.predict(X_test)[:, 0]

In [None]:
# Training split: target (y) against model prediction (p).
pd.DataFrame({'y': y_train, 'p': predictions.T[0]}).plot(kind='scatter', x='p', y='y')

In [None]:
# Test split: target (y) against model prediction (p).
pd.DataFrame({'y': y_test, 'p': test_p}).plot(kind='scatter', x='p', y='y')