In [2]:
import pandas as pd
import numpy as np
import pickle
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


In [3]:
# Load the training CSV, keep only the comment text and the binary `toxic`
# label (the only target classified here), and drop rows missing either.
df = (
    pd.read_csv("trainNLP.csv")
    .loc[:, ['comment_text', 'toxic']]
    .dropna()
)

# Features (raw text) and target (toxic flag) used by the cells below.
X, y = df['comment_text'], df['toxic']


In [4]:
# Vocabulary cap for the tokenizer and fixed length of every padded sequence.
max_words, max_len = 10_000, 150

# NOTE(review): the tokenizer is fit on all of X, i.e. before the train/test
# split performed in a later cell, so the vocabulary also sees held-out text.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)

# Text -> lists of word indices -> 2-D integer array padded/truncated to max_len.
X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen=max_len)


In [5]:
# Hold out 20% of the padded sequences for evaluation; fixed seed for a
# repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X_pad,
    y,
    test_size=0.2,
    random_state=42,
)


In [6]:
# Embedding -> spatial dropout -> single LSTM -> sigmoid unit producing one
# probability per comment.
model = Sequential([
    Embedding(max_words, 128, input_length=max_len),
    SpatialDropout1D(0.2),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# The held-out split doubles as the validation set during training.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=2,
    batch_size=64,
)


Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x165274abfd0>

In [7]:
# Persist the trained network (HDF5 file) and the fitted tokenizer (pickle)
# side by side so inference can reload both later.
model.save("model/toxic_model.h5")

with open("model/tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)


  saving_api.save_model(


In [8]:
!pip install google-api-python-client google-auth streamlit


Collecting google-api-python-client
  Downloading google_api_python_client-2.166.0-py2.py3-none-any.whl (13.2 MB)
     ---------------------------------------- 13.2/13.2 MB 1.3 MB/s eta 0:00:00
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0,>=1.31.5
  Downloading google_api_core-2.24.2-py3-none-any.whl (160 kB)
     -------------------------------------- 160.1/160.1 kB 1.2 MB/s eta 0:00:00
Collecting httplib2<1.0.0,>=0.19.0
  Downloading httplib2-0.22.0-py3-none-any.whl (96 kB)
     -------------------------------------- 96.9/96.9 kB 794.6 kB/s eta 0:00:00
Collecting uritemplate<5,>=3.0.1
  Downloading uritemplate-4.1.1-py2.py3-none-any.whl (10 kB)
Collecting google-auth-httplib2<1.0.0,>=0.2.0
  Downloading google_auth_httplib2-0.2.0-py2.py3-none-any.whl (9.3 kB)
Collecting proto-plus<2.0.0,>=1.22.3
  Downloading proto_plus-1.26.1-py3-none-any.whl (50 kB)
     -------------------------------------- 50.2/50.2 kB 849.4 kB/s eta 0:00:00
Collecting googleapis-common-proto

In [None]:
# A raw API key was pasted here as a bare cell. It has been removed because the
# text is evaluated as Python (a subtraction of two undefined names) and raises
# NameError on "Run All", and because credentials must not live in a notebook.
# Revoke the exposed key and provide a new one from outside the notebook
# (for example through an environment variable).

In [9]:
!pip install google-api-python-client
!pip install --upgrade google-auth google-auth-oauthlib google-auth-httplib2


Collecting google-auth-oauthlib
  Downloading google_auth_oauthlib-1.2.1-py2.py3-none-any.whl (24 kB)
Installing collected packages: google-auth-oauthlib
  Attempting uninstall: google-auth-oauthlib
    Found existing installation: google-auth-oauthlib 1.0.0
    Uninstalling google-auth-oauthlib-1.0.0:
      Successfully uninstalled google-auth-oauthlib-1.0.0
Successfully installed google-auth-oauthlib-1.2.1


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorboard 2.13.0 requires google-auth-oauthlib<1.1,>=0.5, but you have google-auth-oauthlib 1.2.1 which is incompatible.


In [11]:
import os

from googleapiclient.discovery import build
import pandas as pd

# Read the YouTube Data API key from the environment instead of embedding it in
# the notebook, where it would be committed and shared together with the code.
API_KEY = os.environ.get("YOUTUBE_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "Set the YOUTUBE_API_KEY environment variable to a YouTube Data API v3 "
        "key before running this cell."
    )

# Client used by the comment-fetching cells below.
youtube = build('youtube', 'v3', developerKey=API_KEY)


In [12]:
def get_video_comments(video_id, max_comments=20):
    """Fetch up to ``max_comments`` top-level comments of a YouTube video.

    Uses the module-level ``youtube`` API client and follows ``nextPageToken``
    pagination of ``commentThreads().list`` until enough comments have been
    collected or there are no further pages.

    Args:
        video_id: ID of the video whose comment threads are listed.
        max_comments: Upper bound on the number of comments returned. Values
            less than or equal to zero return an empty list without calling
            the API.

    Returns:
        A list of plain-text comment strings (the ``textDisplay`` field of each
        thread's top-level comment), in the order the API returned them.
    """
    comments = []
    page_token = None

    while len(comments) < max_comments:
        # Ask only for what is still missing; the API caps a page at 100.
        params = {
            "part": "snippet",
            "videoId": video_id,
            "maxResults": min(100, max_comments - len(comments)),
            "textFormat": "plainText",
        }
        if page_token:
            params["pageToken"] = page_token

        response = youtube.commentThreads().list(**params).execute()
        if not response:
            break

        # Tolerate a response without an "items" key instead of raising KeyError.
        for item in response.get("items", []):
            comments.append(
                item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
            )
            if len(comments) >= max_comments:
                break

        # The limit is re-checked by the loop condition *before* the next page
        # is requested, so no request is spent once enough comments are held.
        page_token = response.get("nextPageToken")
        if not page_token:
            break

    return comments


In [24]:
# Video whose comments are scored in the cells below.
video_id = "orJ_CQ3VU28"

# Pull the first 20 top-level comments and wrap them in a one-column frame.
comments = get_video_comments(video_id, max_comments=20)
df = pd.DataFrame(data=comments, columns=["Comment"])

df.head()


Unnamed: 0,Comment
0,Yi trailer chustunte agent movie gurtostundhi ...
1,This is real wild saala
2,lie????????????????//
3,trailer is more than Good. We can see all the ...
4,All the best to the entire team 🎉🎉🎉🎉🎉🎉


In [25]:
import tensorflow as tf
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Restore the classifier saved earlier in this notebook.
model = tf.keras.models.load_model("model/toxic_model.h5")

# Restore the fitted tokenizer. Unpickling can execute arbitrary code, so only
# load a tokenizer.pkl that this notebook itself wrote.
with open("model/tokenizer.pkl", "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)


In [26]:
def predict_toxicity(comments, max_len=150, threshold=0.5):
    """Label each comment as toxic or not using the loaded model.

    Relies on the module-level ``tokenizer``, ``pad_sequences`` and ``model``.

    Args:
        comments: Iterable of comment strings.
        max_len: Length every token sequence is padded/truncated to. Defaults
            to 150, the value previously hardcoded here; presumably it must
            match the length the loaded model was trained with.
        threshold: Scores at or above this value are labelled "Toxic".

    Returns:
        A ``(labels, confidence)`` tuple of two equally long lists:
        ``labels`` holds "Toxic" / "Not Toxic" strings and ``confidence``
        holds the raw model score for each comment as a Python float. Note the
        score is the model output itself (high means toxic), not the certainty
        of the chosen label.
    """
    comments = list(comments)
    if not comments:
        # Nothing to score; avoid handing an empty batch to model.predict.
        return [], []

    sequences = tokenizer.texts_to_sequences(comments)
    padded = pad_sequences(sequences, maxlen=max_len)

    # model.predict yields one row per comment; flatten to plain Python floats
    # so no length-1 array is implicitly converted to a scalar (deprecated in
    # recent NumPy releases).
    confidence = model.predict(padded).ravel().tolist()
    labels = ["Toxic" if score >= threshold else "Not Toxic" for score in confidence]

    return labels, confidence


In [27]:
# Score every fetched comment and attach the label and raw model score as new
# columns next to the comment text.
labels, confidence = predict_toxicity(df["Comment"].tolist())
df = df.assign(Prediction=labels, Confidence=confidence)

df.head(10)




Unnamed: 0,Comment,Prediction,Confidence
0,Yi trailer chustunte agent movie gurtostundhi ...,Not Toxic,0.000315
1,This is real wild saala,Not Toxic,0.041461
2,lie????????????????//,Not Toxic,0.297327
3,trailer is more than Good. We can see all the ...,Not Toxic,0.002376
4,All the best to the entire team 🎉🎉🎉🎉🎉🎉,Not Toxic,0.015818
5,worst part is herione,Toxic,0.558032
6,tollywood lo inko trod loading,Not Toxic,0.011665
7,D J tillu😂😂,Not Toxic,0.058344
8,😮😮😮😮😮😮,Not Toxic,0.099546
9,😂😂😂😮😮❤❤❤😊,Not Toxic,0.099546


In [28]:
!pip install googletrans==4.0.0-rc1 transformers torch


Collecting googletrans==4.0.0-rc1

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spyder 5.2.2 requires pyqt5<5.13, which is not installed.
spyder 5.2.2 requires pyqtwebengine<5.13, which is not installed.
anaconda-project 0.11.1 requires ruamel-yaml, which is not installed.
spyder 5.2.2 requires ipython<8.0.0,>=7.6.0, but you have ipython 8.7.0 which is incompatible.
conda-repo-cli 1.0.20 requires clyent==1.2.1, but you have clyent 1.2.2 which is incompatible.
conda-repo-cli 1.0.20 requires nbformat==5.4.0, but you have nbformat 5.5.0 which is incompatible.



  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting torch
  Downloading torch-2.6.0-cp39-cp39-win_amd64.whl (204.1 MB)
     -------------------------------------- 204.1/204.1 MB 1.2 MB/s eta 0:00:00
Collecting httpx==0.13.3
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
     -------------------------------------- 55.1/55.1 kB 954.5 kB/s eta 0:00:00
Collecting rfc3986<2,>=1.3
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl (31 kB)
Collecting hstspreload
  Downloading hstspreload-2025.1.1-py3-none-any.whl (1.3 MB)
     ---------------------------------------- 1.3/1.3 MB 1.3 MB/s eta 0:00:00
Collecting httpcore==0.9.*
  Downloading httpcore-0.9.1-py3-none-any.whl (42 kB)
     ---------------------------------------- 42.6/42.6 kB 2.0 MB/s eta 0:00:00
Collecting chardet==3.*
  Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
     ------------------------------------ 1

In [2]:
import os

# Resolve the kernel's working directory and take a snapshot of its entries.
current_path = os.getcwd()
files_and_folders = os.listdir(current_path)

# Report both so relative paths used by other cells can be sanity-checked.
print(f"Current Directory: {current_path}")
print(f"Files and Folders in the current directory:\n{files_and_folders}")

Current Directory: C:\Users\91901\NLP_PROJECT
Files and Folders in the current directory:
['.ipynb_checkpoints', 'app.py', 'hello.ipynb', 'model', 'testNLP.csv', 'trainNLP.csv']


In [3]:
pip install langdetect googletrans==4.0.0-rc1


Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     -------------------------------------- 981.5/981.5 kB 1.3 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py): started
  Building wheel for langdetect (setup.py): finished with status 'done'
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993225 sha256=78cac3749ec67358e803da15d5fc59bd0a84340099c0a7aecd532d71316a2845
  Stored in directory: c:\users\91901\appdata\local\pip\cache\wheels\d1\c1\d9\7e068de779d863bc8f8fc9467d85e25cfe47fa5051fff1a1bb
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9
Note: you may need to restart the kernel to use updated packages.
