In [None]:
!pip install scikit-learn seaborn
!pip install googletrans==4.0.0-rc1



### Library Imports

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import csv
from tensorflow.keras.models import save_model
from googletrans import Translator
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Upload Dataset

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')
# path_to_course = '/content/drive/My Drive/Bangkit/courses.csv'

In [None]:
import json
import requests

# Root of the ML-facing REST API; every dataset is exposed under its own path.
base_url = 'https://cloud-computing-chi.vercel.app/ml/'

# Course endpoint (used in place of a local/Drive file path)
url_to_course = f"{base_url}courses"
# url_to_user = base_url + 'users'
# url_to_rating = base_url + 'user-rating'

## Reading JSON from URL and Converting to DataFrame with Pandas

In [None]:
# Fetch the course catalogue. The timeout keeps the cell from hanging forever
# on an unresponsive server, and raise_for_status() surfaces HTTP errors here
# instead of letting an error body be parsed as course data.
response = requests.get(url_to_course, timeout=30)
response.raise_for_status()
json_data = response.json()

# Flatten the records stored under the top-level 'data' key into a DataFrame
courses = pd.json_normalize(json_data, 'data')
courses.head()

Unnamed: 0,id,created_at,title,link,organizer,category,instructor,level,rating,fee,description,image
0,1,2023-12-16T16:47:23.175286+00:00,"¡Luces, celular y acción!Crea contenidosaudiov...",https://www.edx.org/learn/marketing/pontificia...,Pontificia Universidad Javeriana,Communication,,Introductory,,,"Smartphone Operation, Value Propositions, Mark...",https://prod-discovery.edx-cdn.org/media/cours...
1,2,2023-12-16T16:47:23.175286+00:00,3D CAD Fundamental,https://www.coursera.org/learn/3d-cad-fundamental,Coursera,Graphic Design,康仕仲,,4.7,Gratis,There have many three-dimensional shape of the...,https://d3njjcbhbojbot.cloudfront.net/api/util...
2,3,2023-12-16T16:47:23.175286+00:00,A Fun Collage - with 30+ Examples,https://www.udemy.com/course/5-minute-collage/,Udemy,Graphic Design,Mike Merkur,,4.0,Gratis,How to Put Together a Photo CollageHow to use ...,https://img-b.udemycdn.com/course/240x135/3569...
3,4,2023-12-16T16:47:23.175286+00:00,A travel by SpanishAmerica: Spanish forbeginners,https://www.edx.org/learn/spanish/universidad-...,Universidad del Rosario,Language,,Introductory,,,Spanish Language,https://prod-discovery.edx-cdn.org/media/cours...
4,5,2023-12-16T16:47:23.175286+00:00,Academic and BusinessWriting,https://www.edx.org/learn/writing/university-o...,"University of California, Berkeley",Communication,,Introductory,,,"Diction, Business Writing, Writing, Vocabulary...",https://prod-discovery.edx-cdn.org/media/cours...


## Load Universal Sentence Encoder and Obtain Embeddings

In [None]:
# TF Hub handle of the Universal Sentence Encoder (version 4)
universal_sentence_encoder_url = "https://tfhub.dev/google/universal-sentence-encoder/4"

# Load the encoder once; it is reused by every embedding call below
use_model = hub.load(universal_sentence_encoder_url)
print("Universal Sentence Encoder loaded from %s" % universal_sentence_encoder_url)


def get_embeddings(input_text):
    """Return whatever the loaded encoder produces for ``input_text``.

    Thin wrapper that forwards ``input_text`` unchanged to ``use_model``.
    """
    embeddings = use_model(input_text)
    return embeddings


Universal Sentence Encoder loaded from https://tfhub.dev/google/universal-sentence-encoder/4


## Fetching and Displaying Course Titles from a JSON API

In [None]:
url_to_course = f"{base_url}courses"

# Request the course catalogue as JSON
response = requests.get(url_to_course)

# Anything other than HTTP 200 is reported and the titles are left untouched
if response.status_code != 200:
    print(f"Failed to fetch the content. Status code: {response.status_code}")
else:
    # Flatten the records under the 'data' key into a DataFrame
    courses = pd.json_normalize(response.json(), 'data')

    # The course names live in the 'title' column
    course_title = courses['title'].tolist()

    # Show the extracted titles
    print(course_title)


['¡Luces, celular y acción!Crea contenidosaudiovisuales de impacto', '3D CAD Fundamental', 'A Fun Collage - with 30+ Examples', 'A travel by SpanishAmerica: Spanish forbeginners', 'Academic and BusinessWriting', 'Academic English: How toWrite an Essay', 'Administración pública yfiscal: cómo se gestionaun gobierno', 'Adobe Illustrator-CC Logo & Graphic Design Crash Course', 'Adobe Photoshop Introduction', 'Adobe Photoshop: Brush Tools', 'Adobe Photoshop: Color Gradient', 'Adobe Photoshop: Coloring', 'Adobe Photoshop: Designing Instastories', 'Adobe Photoshop: Digital Imaging Photo Manipulation', 'Adobe Photoshop: Digital Imaging Photo Retouching', 'Adobe Photoshop: Digital Imaging Poster Ads', 'Adobe Photoshop: Digital Imaging Product Showcase', 'Adobe Photoshop: Duotone Effect', 'Adobe Photoshop: Layer Mask & Layer Style', 'Adobe Photoshop: Layer System', 'Adobe Photoshop: Product Photo Retouching', 'Adobe Photoshop: Selection', 'Adobe Photoshop: Type Tools', 'AI & ChatGPT Prompt for  

In [None]:
# One client shared by every call: constructing a new Translator for each
# title repeats the client setup once per course for no benefit.
_translator = Translator()


def translate_to_english(text):
    """Translate ``text`` to English and return the translated string.

    Args:
        text: The string to translate. No source language is passed to the
            translator; only the destination ('en') is specified.

    Returns:
        The ``text`` attribute of the translation result.
    """
    return _translator.translate(text, dest='en').text


# English version of every course title, in the same order as course_title
tittle_eng = [translate_to_english(judul) for judul in course_title]

# Display the translated titles
print(tittle_eng)

['Lights, cell phone and action! Create audiovisual impact content', '3D CAD Fundamental', 'A Fun Collage - with 30+ Examples', 'A travel by Spanish American: Spanish for beginners', 'Academic and Business Writing', 'Academic English: How to Write an Essay', 'Public and Fiscal Administration: How to manage a government', 'Adobe Illustrator-CC Logo & Graphic Design Crash Course', 'Adobe Photoshop Introduction', 'Adobe Photoshop: Brush Tools', 'Adobe Photoshop: Color Gradient', 'Adobe Photoshop: Coloring', 'Adobe Photoshop: Designing Instastories', 'Adobe Photoshop: Digital Imaging Photo Manipulation', 'Adobe Photoshop: Digital Imaging Photo Retouching', 'Adobe Photoshop: Digital Imaging Poster Ads', 'Adobe Photoshop: Digital Imaging Product Showcase', 'Adobe Photoshop: Duotone Effect', 'Adobe Photoshop: Layer Mask and Layer Style', 'Adobe Photoshop: Layer System', 'Adobe Photoshop: Product Photo Retouching', 'Adobe Photoshop: Selection', 'Adobe Photoshop: Type Tools', 'AI & ChatGPT Prom

In [None]:
# Pair each original title with its English translation, row by row
data = list(zip(course_title, tittle_eng))

# Destination file for the title pairs
csv_file_path = 'course_title.csv'

# Emit the header followed by one row per course in a single writerows() call
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerows([['judul kursus', 'judul kursus eng'], *data])

# Confirm that the file was written
print(f"CSV file '{csv_file_path}' has been created.")


CSV file 'course_title.csv' has been created.


In [None]:
# Embed the translated titles with the sentence encoder and convert the
# encoder's output to a NumPy array in one step
judul_embeddings = np.array(use_model(tittle_eng))

## Calculate Top Similarities for Translated Title Embeddings

In [None]:
# For every title: indices of the (up to) five OTHER titles whose embeddings
# have the highest cosine similarity to it, best match first.
top_similarities_indices = []

# Iterate through each title embedding
for i, embedding in enumerate(judul_embeddings):
    # Cosine similarity between this title and every title (itself included)
    similarities = cosine_similarity(embedding.reshape(1, -1), judul_embeddings).flatten()

    # Rank all titles from most to least similar, then remove this title by
    # its index. Slicing off the last sorted element instead assumes the title
    # always ranks first, which breaks for duplicate titles and ties: the
    # title could then be listed as similar to itself.
    ranked = np.argsort(similarities)[::-1]
    top_indices = ranked[ranked != i][:5]

    # Append the top indices to the list
    top_similarities_indices.append(top_indices)

In [None]:
# Show every title followed by the titles ranked most similar to it
for n, similar in enumerate(top_similarities_indices):
    # Map the stored indices back to the original (untranslated) titles
    judul_similar = [course_title[index] for index in similar]

    print(f"{course_title[n]}:\n {judul_similar}")
    print()


¡Luces, celular y acción!Crea contenidosaudiovisuales de impacto:
 ['Merancang dan Mengkreasikan Iklan Dengan Semiotika Visual', 'Graphic Elements of Design: Color Theory and Image Formats', 'Foundations of Digital Marketing and E-commerce', 'Introduction and Installation', 'Comunicación yRedacción Digital']

3D CAD Fundamental:
 ['Fundamentals of Graphic Design', 'Data Analysis Fundamental', 'Foundations of Finance', 'Interactive Computer Graphics', 'Basic SQL']

A Fun Collage - with 30+ Examples:
 ['Learn Canva & Canva Pro - Easiest Graphic Design Course 2023', 'Amazing Graphic Design for Beginners in 3 EASY STEPS', 'Inspiration: Success & Award Winning Campaigns', 'Leading High-PerformingTeams', 'The Ultimate Canva Course For Beginners: Graphic Design']

A travel by SpanishAmerica: Spanish forbeginners:
 ['Basic Spanish 1: GettingStarted', 'Basic Spanish 2: One StepFurther', 'Basic Spanish 3: Gettingthere', 'Francés Introductorio', 'Pengantar Analisis Merek Digital untuk Pemula']

A

## Export Translated Title Embeddings to CSV

In [None]:
def _format_embedding(vector):
    """Render one embedding as '[v0 v1 ...]' on a single line.

    Every value is written with ``repr(float(...))`` so that parsing the text
    back yields exactly the same number. Relying on ``str(ndarray)`` instead
    rounds values to NumPy's current print precision, wraps the text over
    several lines and changes with the global print options.
    """
    return "[" + " ".join(repr(float(value)) for value in vector) + "]"


# Combine data into a list of tuples
data = list(zip(course_title, tittle_eng, judul_embeddings))

# Specify the CSV file path
csv_file_path = 'course_title_embedding.csv'

# Write data to the CSV file
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # Write the header row
    writer.writerow(['judul kursus', 'judul kursus eng', 'embedding'])

    # Write the data rows; the embedding column keeps the bracketed,
    # space-separated layout that the reading cell expects
    writer.writerows(
        (title, title_eng, _format_embedding(embedding))
        for title, title_eng, embedding in data
    )

# Display a message indicating the CSV file creation
print(f"CSV file '{csv_file_path}' has been created.")


CSV file 'course_title_embedding.csv' has been created.


## Reading and Processing Embeddings from CSV File for Course Titles

In [None]:
# Load all three columns in a single pass over the file.
# encoding='utf-8' mirrors how the file was written (the titles contain
# non-ASCII characters, so the platform default encoding may corrupt them or
# fail), and newline='' lets the csv module handle line breaks that occur
# inside quoted fields.
with open('course_title_embedding.csv', 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')

    # Skip header row
    next(reader)

    rows = list(reader)

# Extract data columns
course_title = [row[0] for row in rows]
tittle_eng = [row[1] for row in rows]
judul_embedding = [row[2] for row in rows]

# Each embedding is stored as text of the form "[v0 v1 ...]": drop the
# surrounding brackets and parse the whitespace-separated numbers
judul_embedding = [np.fromstring(embedding[1:-1], sep=' ') for embedding in judul_embedding]

# Stack the per-title arrays into one 2-D NumPy array
judul_embedding_numpy = np.array(judul_embedding)

## Text Tokenization and Embedding for Course Titles using TensorFlow

In [None]:
# Tokenizer / padding configuration
vocab_size = 1000
embedding_dim = 16  # NOTE(review): not referenced anywhere in this cell; the layers below hard-code 512
max_length = 15
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'

# Convert the text into sequences of tokens
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok) # keep at most vocab_size distinct words (including the OOV token)
tokenizer.fit_on_texts(course_title) # fit the tokenizer on the given texts (course_title)
sequences = tokenizer.texts_to_sequences(course_title)
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type, padding=padding_type)
padded = np.array(padded)

# Model Definition
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=512, input_length=max_length), # turns the token sequence into embedding vectors
    tf.keras.layers.GlobalAveragePooling1D(), # averages the embedding vectors into a single, simpler representation vector
    tf.keras.layers.Dense(512, activation='relu'), # relu activation introduces non-linearity into the model
    tf.keras.layers.Dense(512)
])

# Model Compilation
model.compile(optimizer='adam', loss='mean_squared_error')

# Model Training: regress from the padded token ids of the original titles
# onto the embeddings loaded from the CSV file
model.fit(padded, judul_embedding_numpy, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7934600b63e0>

In [None]:
# Run the trained model on the same padded title sequences it was fitted on;
# the predictions are compared with each other in the similarity cell below
judul_embedding_predict = model.predict(padded)



## Save Model

In [None]:
# Persist the trained model to the working directory as "model.h5".
# NOTE(review): the truncated output under this cell looks like a Keras
# warning about this save format - confirm before changing the file name,
# since other code may load "model.h5".
model.save("model.h5")

  saving_api.save_model(


## Finding and Displaying the Top Similar Titles using Cosine Similarity

In [None]:
# Initialize a list to store top similar indices for each title
top_similarities_indices = []

# Iterate over the embeddings of predicted titles
for i, embedding in enumerate(judul_embedding_predict):
    # Calculate cosine similarities between the current title and all other titles
    similarities = cosine_similarity(embedding.reshape(1, -1), judul_embedding_predict).flatten()

    # Get the indices of the top 5 most similar titles (excluding itself)
    top_indices = np.argsort(similarities)[-6:-1][::-1]

    # Append the top indices to the list
    top_similarities_indices.append(top_indices)

# Iterate over the top similar indices for each title
for n, similar in enumerate(top_similarities_indices):
    # Create a list of similar titles based on the top indices
    judul_similar = [course_title[index] for index in similar]

    # Display the original title and its top similar titles
    print(f"{course_title[n]}\nTop Similar Titles: {judul_similar}")
    print()

¡Luces, celular y acción!Crea contenidosaudiovisuales de impacto
Top Similar Titles: ['Merancang dan Mengkreasikan Iklan Dengan Semiotika Visual', 'Graphic Elements of Design: Color Theory and Image Formats', 'Foundations of Digital Marketing and E-commerce', 'Introduction and Installation', 'Comunicación yRedacción Digital']

3D CAD Fundamental
Top Similar Titles: ['Fundamentals of Graphic Design', 'Data Analysis Fundamental', 'Interactive Computer Graphics', 'Foundations of Finance', 'Media Fundamentals']

A Fun Collage - with 30+ Examples
Top Similar Titles: ['Learn Canva & Canva Pro - Easiest Graphic Design Course 2023', 'Inspiration: Success & Award Winning Campaigns', 'Leading High-PerformingTeams', 'Amazing Graphic Design for Beginners in 3 EASY STEPS', 'Innovation Through Design: Think, Make, Break, Repeat']

A travel by SpanishAmerica: Spanish forbeginners
Top Similar Titles: ['Basic Spanish 1: GettingStarted', 'Basic Spanish 2: One StepFurther', 'Basic Spanish 3: Gettingthere