# Import Required Libraries
Import libraries such as pandas, numpy, matplotlib, seaborn, tensorflow/keras, and json.

In [1]:
# Import necessary libraries for data processing, visualization, and deep learning
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical computations
import matplotlib.pyplot as plt  # For data visualization
import seaborn as sns  # For enhanced data visualization
import tensorflow as tf  # For building and training deep learning models
from tensorflow import keras  # High-level API for TensorFlow
import json  # For working with JSON files

2025-04-07 12:51:34.453713: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744030294.499436  885313 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744030294.511921  885313 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744030294.590900  885313 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744030294.590937  885313 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744030294.590939  885313 computation_placer.cc:177] computation placer alr

# Load and Explore JSON Data
Load all `.json` files from the folder, combine them into a single DataFrame, and explore the structure and content of the data.

In [2]:
import os  # For interacting with the file system

# Define the folder path containing the JSON files
folder_path = '/home/nhatlinh/tiktok-comment-scrapper'

# List all JSON files in the folder
json_files = [f for f in os.listdir(folder_path) if f.endswith('.json')]

# Initialize an empty list to store data from all JSON files
data_list = []

# Load each JSON file and append its content to the data list
for file in json_files:
    file_path = os.path.join(folder_path, file)
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
        data_list.append(data)

# Combine all data into a single DataFrame
df = pd.DataFrame(data_list)

# Display the first few rows of the DataFrame to explore its structure
print("First few rows of the DataFrame:")
print(df.head())

# Display basic information about the DataFrame
print("\nDataFrame Info:")
print(df.info())

# Display summary statistics of the DataFrame
print("\nSummary Statistics:")
print(df.describe(include='all'))

First few rows of the DataFrame:
                                             caption  \
0  PHÁO ĐÃ LÊN NÒNG, vào livestream đối chất 1:1 ...   
1  tư duy ngược - một phương pháp học rất quen th...   
2  Chúc các bạn 2k6 thật nhiều may mắn, bình tâm ...   
3  Nhật ký ôn thi THPTQG| Chúng ta sẽ không có cơ...   
4  [D51] “Tôi không có thiên phú, nhưng tôi muốn ...   

                                           video_url  \
0  https://t.tiktok.com/i18n/share/video/74868857...   
1  https://t.tiktok.com/i18n/share/video/73402265...   
2  https://t.tiktok.com/i18n/share/video/73766570...   
3  https://t.tiktok.com/i18n/share/video/71985216...   
4  https://t.tiktok.com/i18n/share/video/73658524...   

                                            comments  has_more  
0  [{'comment_id': '7486886369214874369', 'userna...         1  
1  [{'comment_id': '7341636945293034247', 'userna...         1  
2  [{'comment_id': '7486681879690117895', 'userna...         1  
3  [{'comment_id': '7198741785848

# Preprocess Data
Clean the text data by removing special characters, converting to lowercase, and handling missing values.

In [7]:
import pandas as pd
import re
import json

# Load data from the JSON file
json_file_path = "/home/nhatlinh/tiktok-comment-scrapper/neg7483896983124462856.json"
with open(json_file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Convert JSON data to a DataFrame
df = pd.DataFrame(data)

# Check for missing values in the DataFrame
print("\nMissing Values Before Cleaning:")
print(df.isnull().sum())

# Drop rows with missing values in the 'comments' column
if 'comments' in df.columns:
    df = df.dropna(subset=['comments'])
else:
    raise KeyError("The 'comments' column is missing in the DataFrame.")

# Reset the index after dropping rows
df = df.reset_index(drop=True)

import re

def clean_comment_text(data):
    """
    Trích xuất và làm sạch phần 'comment' từ dữ liệu TikTok (bao gồm cả reply).
    - Chỉ giữ lại comment (không quan tâm username, avatar, v.v.)
    - Loại bỏ emoji, ký tự đặc biệt, số
    - Giữ lại chữ cái tiếng Việt (kể cả có dấu)
    - Chuyển về chữ thường
    - Xóa khoảng trắng thừa
    """
    def clean_text(text):
        text = re.sub(r"[^a-zA-ZÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚĂĐĨŨƠàáâãèéêìíòóôõùúăđĩũơƯĂẠẢẤẦẨẪẬẮẰẲẴẶẸẺẼỀỀỂưăạảấầẩẫậắằẳẵặẹẻẽềềểếỄỆỈỊỌỎỐỒỔỖỘỚỜỞỠỢỤỦỨỪễệỉịọỏốồổỗộớờởỡợụủứừỬỮỰỳỵỷỹÝýỴỶỸ\s]", '', text)
        text = text.lower()
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    result = []
    # Làm sạch comment chính
    if 'comment' in data:
        result.append(clean_text(data['comment']))
    # Làm sạch comment trong replies (nếu có)
    for reply in data.get('replies', []):
        if 'comment' in reply:
            result.append(clean_text(reply['comment']))

    return result



# Apply the cleaning function to the 'comments' column
df['cleaned_text'] = df['comments'].apply(clean_comment_text)

# Display the first few rows of the cleaned DataFrame
print("\nFirst few rows after cleaning:")
print(df[['comments', 'cleaned_text']].head())

# Check for missing values after cleaning
print("\nMissing Values After Cleaning:")
print(df.isnull().sum())



Missing Values Before Cleaning:
caption      0
video_url    0
comments     0
has_more     0
dtype: int64

First few rows after cleaning:
                                            comments  \
0  {'comment_id': '7483899822577320711', 'usernam...   
1  {'comment_id': '7483912469787280149', 'usernam...   
2  {'comment_id': '7484401968984064786', 'usernam...   
3  {'comment_id': '7483964387545760520', 'usernam...   
4  {'comment_id': '7483914010791822098', 'usernam...   

                                        cleaned_text  
0  [đâu phải t nhiên tên là viruss, đúng là cái t...  
1  [ổng nói gì mẹ ngọc kem v ạ, tui cx tò mò nx, ...  
2  [mà này vr đã nói gì về mẹ của nk vậy mn tại t...  
3  [giờ viruss lên bài đính chính tôi là bot may ...  
4  [rep may mắn cả đời, rep, , rep, , rep, , , , ...  

Missing Values After Cleaning:
caption         0
video_url       0
comments        0
has_more        0
cleaned_text    0
dtype: int64


In [34]:
df['cleaned_text'][29]

['nhìn chị như muốn khóc vậy á',
 'khóc mấy ngày nay rồi á tr thương bả lắm luônn',
 'thương thật']

# Label Data for Sentiment Classification
Assign sentiment labels (neg, neutral, pos) based on the file names or content of the data.

In [23]:
# Assign sentiment labels based on file names
def assign_label(file_name):
    if file_name.startswith('neg'):
        return 'neg'
    elif file_name.startswith('pos'):
        return 'pos'
    else:
        return 'neutral'

# Create a new column 'label' in the DataFrame by mapping file names to labels
df['label'] = [assign_label(file) for file in json_files]

# Display the distribution of sentiment labels
print("\nSentiment Label Distribution:")
print(df['label'].value_counts())

# Visualize the distribution of sentiment labels
plt.figure(figsize=(8, 6))
sns.countplot(x='label', data=df, palette='viridis')
plt.title('Distribution of Sentiment Labels')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.show()

ValueError: Length of values (25) does not match length of index (1549)

# Visualize Data Insights
Use matplotlib and seaborn to create visualizations such as bar plots for sentiment distribution and word clouds for common words in each sentiment category.

In [None]:
# Import necessary libraries for visualization
from wordcloud import WordCloud  # For generating word clouds

# Visualize the distribution of sentiment labels
plt.figure(figsize=(8, 6))
sns.countplot(x='label', data=df, palette='viridis')
plt.title('Distribution of Sentiment Labels')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.show()

# Generate word clouds for each sentiment category
sentiments = ['neg', 'neutral', 'pos']
for sentiment in sentiments:
    # Filter the DataFrame for the current sentiment
    sentiment_text = df[df['label'] == sentiment]['cleaned_text']
    
    # Combine all text for the current sentiment into a single string
    combined_text = ' '.join(sentiment_text)
    
    # Generate a word cloud
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(combined_text)
    
    # Display the word cloud
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Word Cloud for {sentiment.capitalize()} Sentiment')
    plt.axis('off')
    plt.show()

# Prepare Data for Model Training
Tokenize and pad the text data, split it into training and validation sets, and encode the sentiment labels.

In [None]:
# Tokenize and pad the text data
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Initialize the tokenizer with a maximum number of words
max_words = 10000  # Maximum number of words to keep in the vocabulary
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')

# Fit the tokenizer on the cleaned text data
tokenizer.fit_on_texts(df['cleaned_text'])

# Convert the text data to sequences of integers
sequences = tokenizer.texts_to_sequences(df['cleaned_text'])

# Pad the sequences to ensure uniform length
max_sequence_length = 100  # Maximum length of sequences
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post', truncating='post')

# Encode the sentiment labels into integers
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['label'])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    padded_sequences, encoded_labels, test_size=0.2, random_state=42, stratify=encoded_labels
)

# Display the shapes of the training and validation sets
print("Training data shape:", X_train.shape)
print("Validation data shape:", X_val.shape)
print("Training labels shape:", y_train.shape)
print("Validation labels shape:", y_val.shape)

# Build and Train Deep Learning Model
Define a deep learning model using TensorFlow/Keras, compile it, and train it on the preprocessed data.

In [None]:
# Build the deep learning model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Define the model architecture
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_sequence_length),  # Embedding layer
    LSTM(128, return_sequences=True),  # LSTM layer with 128 units
    Dropout(0.2),  # Dropout to prevent overfitting
    LSTM(64),  # Another LSTM layer with 64 units
    Dropout(0.2),  # Dropout to prevent overfitting
    Dense(64, activation='relu'),  # Fully connected layer with ReLU activation
    Dense(3, activation='softmax')  # Output layer with softmax activation for 3 sentiment classes
])

# Compile the model
model.compile(
    optimizer='adam',  # Adam optimizer
    loss='sparse_categorical_crossentropy',  # Loss function for multi-class classification
    metrics=['accuracy']  # Metric to monitor
)

# Display the model summary
print("Model Summary:")
model.summary()

# Train the model
batch_size = 32  # Number of samples per batch
epochs = 10  # Number of epochs to train

history = model.fit(
    X_train, y_train,  # Training data and labels
    validation_data=(X_val, y_val),  # Validation data and labels
    batch_size=batch_size,
    epochs=epochs,
    verbose=1  # Display training progress
)

# Plot training and validation accuracy and loss
plt.figure(figsize=(12, 6))

# Plot accuracy
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

# Plot loss
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()

# Evaluate Model Performance
Evaluate the model on validation data and visualize metrics such as accuracy and loss.

In [None]:
# Evaluate the model on the validation data
val_loss, val_accuracy = model.evaluate(X_val, y_val, verbose=1)

# Print the evaluation metrics
print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_accuracy}")

# Generate predictions on the validation data
y_pred = model.predict(X_val)
y_pred_classes = np.argmax(y_pred, axis=1)  # Convert probabilities to class labels

# Import classification report and confusion matrix
from sklearn.metrics import classification_report, confusion_matrix

# Generate a classification report
print("\nClassification Report:")
print(classification_report(y_val, y_pred_classes, target_names=label_encoder.classes_))

# Generate a confusion matrix
conf_matrix = confusion_matrix(y_val, y_pred_classes)

# Visualize the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()