In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
df = pd.read_csv('results/filtered_netflix_reviews_language.csv')

In [3]:
# Download required NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')  # For token normalization if needed

# Define stop words
stop_words = set(stopwords.words('english'))

# Emoji pattern
def remove_emojis(text):
    emoji_pattern = re.compile(
        "[\U0001F600-\U0001F64F]|"  # emoticons
        "[\U0001F300-\U0001F5FF]|"  # symbols & pictographs
        "[\U0001F680-\U0001F6FF]|"  # transport & map symbols
        "[\U0001F1E0-\U0001F1FF]"    # flags (iOS)
        "[\u2600-\u26FF]|"          # Miscellaneous symbols
        "[\u2700-\u27BF]",           # Dingbats
        flags=re.UNICODE
    )
    return emoji_pattern.sub(r"", text)

# Text preprocessing function
def preprocess_text(text):
    try:
        # Ensure text is a string
        text = str(text)

        # Handle encoding issues by replacing surrogate characters
        text = text.encode('utf-8', 'replace').decode('utf-8')

        # Convert to lowercase
        text = text.lower()
        
        # Remove URLs
        text = re.sub(r'http\S+|www\.\S+', '', text)

        # Remove Twitter handles
        text = re.sub(r'@\w+', '', text)

        # Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))

        # Remove emojis
        text = remove_emojis(text)

        # Tokenize text
        tokens = word_tokenize(text)

        # Remove stop words
        tokens = [word for word in tokens if word not in stop_words]

        # Join the tokens back into a string
        processed_text = ' '.join(tokens)

        return processed_text

    except Exception as e:
        print(f"Error processing text: {e}")
        return ''  # Return an empty string in case of errors

# Apply preprocessing to a DataFrame column
def preprocess_dataframe(df, column_name):
    df['preprocess_content'] = df[column_name].apply(preprocess_text)
    return df


# Preprocess the 'content' column of the DataFrame
processed_df = preprocess_dataframe(df, 'content')

[nltk_data] Downloading package stopwords to C:\Users\Babli
[nltk_data]     Dey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Babli
[nltk_data]     Dey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Babli
[nltk_data]     Dey\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:
processed_df

Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,date,year,month,day,language,preprocess_content
0,6fcd7344-d13e-46f5-ac98-d0ebec6ecb3b,ayuni ayuni,Can you prolong Big bang theory series in Netf...,5,0,2024-12-31 15:43:14,2024,12,31,en,prolong big bang theory series netflix
1,de92d64e-999b-4cbd-8487-518126fe0629,goat and birds channel,NICE APP,5,0,2024-12-31 15:42:23,2024,12,31,it,nice app
2,754c9b1e-9bfa-4c23-b658-8a8804033265,Prashanth Annam,West app,1,0,2024-12-31 15:38:05,2024,12,31,en,west app
3,258242e4-ba13-4233-88a0-147c721fd65f,Shubham Kadam,Primium mat maro जो movie चाहिय ओ नही मिलती मे...,1,0,2024-12-31 15:35:33,2024,12,31,hi,primium mat maro जो movie चाहिय ओ नही मिलती मे...
4,b113fac2-503f-4acf-8480-4f099e7706be,Jahnavi Reddy,This APP IS VERY GOOD INFACT ULTIMATE I even d...,5,0,2024-12-31 15:34:33,2024,12,31,en,app good infact ultimate even downloaded tv al...
...,...,...,...,...,...,...,...,...,...,...,...
123508,a760ead9-e7aa-4ed1-a651-5c37c3600dac,A Google user,i really like it! there are so many movies and...,5,0,2019-08-03 15:06:03,2019,8,3,en,really like many movies kdramas watch
123509,4957f9e7-d7f4-4a52-9764-031cebcac83f,Captain Jeoy,I love Netflix. I always enjoy my time using it.,5,0,2022-08-15 16:16:30,2022,8,15,en,love netflix always enjoy time using
123510,9acf7586-7abf-4b50-8c50-3ede3b2a42c4,Suryansh,Sound quality is very slow of movies,1,0,2020-08-17 07:26:58,2020,8,17,en,sound quality slow movies
123511,32870f7f-c461-4256-b602-75244ca60248,A Google user,Rate is very expensive.. bcos we see netflix s...,1,0,2019-07-21 09:41:42,2019,7,21,en,rate expensive bcos see netflix sunday charged...


In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from langdetect import detect

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-mul-en")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-mul-en")

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Function to detect language
def detect_language(text):
    try:
        return detect(text)
    except:
        return "unknown"

# Function to translate text to English
def translate_to_english(text, detected_lang):
    # If language is English, return the text as is
    if detected_lang == "en":
        return text
    try:
        # Tokenize the input text
        inputs = tokenizer.encode(text, return_tensors="pt", truncation=True).to(device)
        
        # Generate translation
        outputs = model.generate(inputs, max_length=512, num_beams=5, early_stopping=True)
        
        # Decode the output
        translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return translated_text
    except Exception as e:
        return f"Translation failed: {e}"

# Add detected language and translation columns
processed_df["language_hel"] = processed_df["preprocess_content"].apply(detect_language)
processed_df["translated"] = processed_df.apply(
    lambda row: translate_to_english(row["preprocess_content"], row["language_hel"]), axis=1
)

  torch.utils._pytree._register_pytree_node(


In [7]:
processed_df[processed_df["language_hel"] != "en"]

Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,date,year,month,day,language,preprocess_content,language_hel,translated
1,de92d64e-999b-4cbd-8487-518126fe0629,goat and birds channel,NICE APP,5,0,2024-12-31 15:42:23,2024,12,31,it,nice app,it,Nice app
3,258242e4-ba13-4233-88a0-147c721fd65f,Shubham Kadam,Primium mat maro जो movie चाहिय ओ नही मिलती मे...,1,0,2024-12-31 15:35:33,2024,12,31,hi,primium mat maro जो movie चाहिय ओ नही मिलती मे...,hi,Premium mat maro which should be filmed if it ...
5,43794861-0f4a-4e5e-95e5-197c1ba643cb,Basudev Hansda,🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀...,5,0,2024-12-31 15:29:42,2024,12,31,ur,🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀...,unknown,I'm sorry.
7,d40c0dbd-b85b-4734-bc96-74725a144f62,Marc Human,Best,5,0,2024-12-31 15:09:09,2024,12,31,en,best,af,best
8,1d42e2e0-34ae-4d27-8e14-64f1321e4c33,Anshika Bansal,Amazing customer support and good application.,5,0,2024-12-31 14:56:34,2024,12,31,en,amazing customer support good application,it,amazing customer support good application
...,...,...,...,...,...,...,...,...,...,...,...,...,...
123495,200037e7-5836-43aa-a85e-18a9ca2e1927,Nishtha Bisht,Netflix app isn't opening up at all,2,0,2021-11-14 14:16:04,2021,11,14,en,netflix app isnt opening,no,netflix app is open
123497,4293dedc-e5a3-47a4-8350-8326a90abe33,Dave jc,Does not have kannada movies,1,0,2024-03-17 19:13:46,2024,3,17,en,kannada movies,sk,Can not open message
123502,3759168b-2cfd-4719-824f-61bcf0082965,Manjunath Manju,"Their is no position for kannada language , do...",1,0,2024-03-17 15:19:57,2024,3,17,en,position kannada language consider,es,candid position
123504,5f68b545-1079-4a57-9024-fc6e27d7db96,AD Basson,The. Best App. Ever,2,0,2024-04-01 10:21:15,2024,4,1,en,best app ever,no,best app ever


In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")

In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from langdetect import detect

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Function to detect language
def detect_language(text):
    try:
        return detect(text)
    except:
        return "unknown"

# Function to translate text to English
def translate_to_english(text, detected_lang):
    # If language is English, return the text as is
    if detected_lang == "en":
        return text
    try:
        # T5 expects a specific task prefix for translation
        task_prefix = f"translate {detected_lang} to English: "
        input_text = task_prefix + text

        # Tokenize the input text
        inputs = tokenizer.encode(input_text, return_tensors="pt", truncation=True).to(device)
        
        # Generate translation
        outputs = model.generate(inputs, max_length=512, num_beams=5, early_stopping=True)
        
        # Decode the output
        translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return translated_text
    except Exception as e:
        return f"Translation failed: {e}"

# Add detected language and translation columns
processed_df["language_t5"] = processed_df["preprocess_content"].apply(detect_language)
processed_df["translated_t5"] = processed_df.apply(
    lambda row: translate_to_english(row["preprocess_content"], row["language_t5"]), axis=1
)


Downloading config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [11]:
processed_df[processed_df["language_t5"] != "en"]

Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,date,year,month,day,language,preprocess_content,language_hel,translated,language_t5,translated_t5
1,de92d64e-999b-4cbd-8487-518126fe0629,goat and birds channel,NICE APP,5,0,2024-12-31 15:42:23,2024,12,31,it,nice app,it,Nice app,it,nice app
3,258242e4-ba13-4233-88a0-147c721fd65f,Shubham Kadam,Primium mat maro जो movie चाहिय ओ नही मिलती मे...,1,0,2024-12-31 15:35:33,2024,12,31,hi,primium mat maro जो movie चाहिय ओ नही मिलती मे...,hi,Premium mat maro which should be filmed if it ...,hi,...
5,43794861-0f4a-4e5e-95e5-197c1ba643cb,Basudev Hansda,🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀...,5,0,2024-12-31 15:29:42,2024,12,31,ur,🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀🫀...,unknown,I'm sorry.,unknown,
6,39923caa-1af7-4123-859f-76087131ce1c,Joy Sadang,The app Is very good i just don't like about I...,4,0,2024-12-31 15:28:40,2024,12,31,en,app good dont like dont enough season watch be...,en,app good dont like dont enough season watch be...,af,app good dont like dont enough season watch be...
7,d40c0dbd-b85b-4734-bc96-74725a144f62,Marc Human,Best,5,0,2024-12-31 15:09:09,2024,12,31,en,best,af,best,af,best
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123491,be0b7765-422b-42e8-bc6c-1d93200963ec,Joshua Jameson,I'm enjoying 😎🎧,5,0,2024-05-02 07:53:48,2024,5,2,en,im enjoying,hr,im enjoying,hr,hr im enjoying
123497,4293dedc-e5a3-47a4-8350-8326a90abe33,Dave jc,Does not have kannada movies,1,0,2024-03-17 19:13:46,2024,3,17,en,kannada movies,sk,Can not open message,sk,sk kannada movies
123502,3759168b-2cfd-4719-824f-61bcf0082965,Manjunath Manju,"Their is no position for kannada language , do...",1,0,2024-03-17 15:19:57,2024,3,17,en,position kannada language consider,es,candid position,id,position kannada language consider
123504,5f68b545-1079-4a57-9024-fc6e27d7db96,AD Basson,The. Best App. Ever,2,0,2024-04-01 10:21:15,2024,4,1,en,best app ever,no,best app ever,no,best app ever
