In [1]:
!pip install google-api-python-client pandas




In [2]:
from googleapiclient.discovery import build
import pandas as pd

# Replace with your YouTube Data API key
API_KEY = "AIzaSyDaFmq3F7SBFQvL44zwSmPCSBAVxkwZuUA"

# Function to fetch comments from a video
def fetch_comments(video_id):
    youtube = build("youtube", "v3", developerKey=API_KEY)
    comments = []
    request = youtube.commentThreads().list(
        part="snippet",
        videoId=video_id,
        maxResults=100  # You can change this as needed
    )

    while request:
        response = request.execute()
        for item in response["items"]:
            comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
            comments.append(comment)

        # Check for next page
        if "nextPageToken" in response:
            request = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                maxResults=100,
                pageToken=response["nextPageToken"],
            )
        else:
            break

    return comments

# Main function
def main():
    # Replace with your YouTube video ID
    video_id = "nWc3c1Lvp1Q"
    comments = fetch_comments(video_id)

    # Save to a CSV file
    df = pd.DataFrame(comments, columns=["Comment"])
    df.to_csv("youtube_comments.csv", index=False, encoding="utf-8")
    print(f"Saved {len(comments)} comments to youtube_comments.csv")

if __name__ == "__main__":
    main()


Saved 15536 comments to youtube_comments.csv


In [None]:
from google.colab import files
files.download('youtube_comments.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [3]:
import pandas as pd
import re

# Load the data
df = pd.read_csv("youtube_comments.csv")

# Cleaning function
def clean_text(text):
    text = re.sub(r"http\S+|www\S+", "", text)  # Remove URLs
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove special characters
    text = text.lower().strip()  # Convert to lowercase and strip whitespace
    return text

# Apply cleaning
df["cleaned_comments"] = df["Comment"].apply(clean_text)

# Save cleaned data
df.to_csv("cleaned_youtube_comments.csv", index=False)
print("Data cleaned and saved as cleaned_youtube_comments.csv")


Data cleaned and saved as cleaned_youtube_comments.csv


In [4]:
print(f"Number of cleaned comments: {len(df['cleaned_comments'])}")

Number of cleaned comments: 15536


In [5]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

nltk.download('stopwords')

df = pd.read_csv("cleaned_youtube_comments.csv")
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    tokens = text.split()
    processed_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return " ".join(processed_tokens)

df['cleaned_comments'] = df['cleaned_comments'].fillna('')  # Handle NaN values
df["processed_comments"] = df["cleaned_comments"].apply(preprocess_text)


# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Use top 1000 features
tfidf_matrix = tfidf_vectorizer.fit_transform(df["processed_comments"])

# Convert TF-IDF matrix to a DataFrame for easy viewing
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Save the TF-IDF DataFrame
tfidf_df.to_csv("tfidf_vectorized_comments.csv", index=False)
print("Data preparation complete. Saved as tfidf_vectorized_comments.csv.")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Data preparation complete. Saved as tfidf_vectorized_comments.csv.


In [None]:
from google.colab import files
files.download('tfidf_vectorized_comments.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [6]:
import pandas as pd

# Load your YouTube comments (after cleaning)
df = pd.read_csv("cleaned_youtube_comments.csv")

# Initialize a list to store labels
labels = []

# Function to manually label the data
def label_data(comment):
    # Ensure the comment is a string
    if not isinstance(comment, str):
        return "neutral"  # In case comment is NaN or not a string

    # Here you can define conditions based on keywords or sentiment rules
    if "good" in comment or "love" in comment:
        return "positive"
    elif "bad" in comment or "hate" in comment:
        return "negative"
    else:
        return "neutral"

# Apply the label_data function to each comment
df["sentiment"] = df["cleaned_comments"].apply(label_data)

# Optionally, print a few rows to check
print(df.head())

# Save the labeled data
df.to_csv("labeled_youtube_comments.csv", index=False)
print("Data labeled successfully and saved as 'labeled_youtube_comments.csv'.")


                                             Comment  \
0  For daily updates follow us on Instagram: niti...   
1           Tell us about Balbeer SingüéâüéâüéâüéâüéâüéâüéâüéâüéâüéâüéâüòÖüòÖüòÖ   
2                jai shree ram ji<br>jai bajrangbali   
3                                   Jai Shree Ram ‚ù§‚ù§   
4               Jai shree Ram ji <br>Jai bajrangbali   

                                    cleaned_comments sentiment  
0  for daily updates follow us on instagram nitis...   neutral  
1                         tell us about balbeer sing   neutral  
2                  jai shree ram jibrjai bajrangbali   neutral  
3                                      jai shree ram   neutral  
4                 jai shree ram ji brjai bajrangbali   neutral  
Data labeled successfully and saved as 'labeled_youtube_comments.csv'.


In [7]:
import pandas as pd

# Load the labeled data (with sentiment labels)
df = pd.read_csv("labeled_youtube_comments.csv")

# Count the number of comments in each sentiment category
sentiment_counts = df["sentiment"].value_counts()

# Print the counts for each category
print("Sentiment Counts:")
print(sentiment_counts)

# Optionally, you can also access the individual counts like this:
positive_count = sentiment_counts.get("positive", 0)
negative_count = sentiment_counts.get("negative", 0)
neutral_count = sentiment_counts.get("neutral", 0)

print(f"\nPositive Comments: {positive_count}")
print(f"Negative Comments: {negative_count}")
print(f"Neutral Comments: {neutral_count}")


Sentiment Counts:
sentiment
neutral     14490
negative      527
positive      519
Name: count, dtype: int64

Positive Comments: 519
Negative Comments: 527
Neutral Comments: 14490


In [8]:
sentiment_counts = df["sentiment"].value_counts()
print(sentiment_counts)


sentiment
neutral     14490
negative      527
positive      519
Name: count, dtype: int64


In [9]:
!pip install imbalanced-learn




In [10]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Step 1: Load your dataset into a DataFrame
# Replace 'your_dataset.csv' with the path to your dataset file
df = pd.read_csv("labeled_youtube_comments.csv")

# Step 2: Handle missing values in the 'cleaned_comments' column
df["cleaned_comments"] = df["cleaned_comments"].fillna("")  # Fill NaN with empty string

# Features and labels
X = df["cleaned_comments"]
y = df["sentiment"]

# Vectorize the text data
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_vectorized = tfidf_vectorizer.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print("Before balancing:")
print(y_train.value_counts())

print("\nAfter balancing:")
print(pd.Series(y_train_balanced).value_counts())

# Convert the sparse matrix X_train_balanced back to a dense array for exporting
X_train_balanced_dense = X_train_balanced.toarray()

# Create a DataFrame from the resampled data
balanced_df = pd.DataFrame(
    X_train_balanced_dense,
    columns=tfidf_vectorizer.get_feature_names_out()
)
balanced_df["sentiment"] = y_train_balanced  # Add the resampled labels

# Save the balanced dataset to a CSV file
balanced_df.to_csv("balanced_dataset.csv", index=False)
print("Balanced dataset saved as 'balanced_dataset.csv'.")


Before balancing:
sentiment
neutral     11571
positive      429
negative      428
Name: count, dtype: int64

After balancing:
sentiment
positive    11571
neutral     11571
negative    11571
Name: count, dtype: int64
Balanced dataset saved as 'balanced_dataset.csv'.


In [16]:
from google.colab import files
files.download('balanced_dataset.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Step 1: Load the dataset
df = pd.read_csv("balanced_dataset.csv")

# Separate features and labels
X = df.drop(columns=["sentiment"])  # Drop the target column
y = df["sentiment"]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Train Models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(kernel="linear"),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Naive Bayes": MultinomialNB(),
}

results = {}

for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)  # Train the model
    y_pred = model.predict(X_test)  # Predict on test set
    accuracy = accuracy_score(y_test, y_pred)  # Calculate accuracy
    results[model_name] = accuracy
    print(f"{model_name} Accuracy: {accuracy:.2f}")

# Step 3: Compare Models
print("\nModel Comparison:")
for model_name, accuracy in results.items():
    print(f"{model_name}: {accuracy:.2f}")

# Identify the best model
best_model = max(results, key=results.get)
print(f"\nBest Model: {best_model} with Accuracy: {results[best_model]:.2f}")


Training Logistic Regression...
Logistic Regression Accuracy: 0.96
Training SVM...
SVM Accuracy: 0.97
Training Random Forest...
Random Forest Accuracy: 0.99
Training Naive Bayes...
Naive Bayes Accuracy: 0.79

Model Comparison:
Logistic Regression: 0.96
SVM: 0.97
Random Forest: 0.99
Naive Bayes: 0.79

Best Model: Random Forest with Accuracy: 0.99


In [12]:
import joblib

# Save the best model
best_model_filename = f"Best_Model_{best_model.replace(' ', '_')}.pkl"
joblib.dump(models[best_model], best_model_filename)  # Save the best model
print(f"Best model ({best_model}) saved as {best_model_filename}.")


Best model (Random Forest) saved as Best_Model_Random_Forest.pkl.


In [14]:
import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import re

# Download stopwords
nltk.download('stopwords')

# Load the saved model
best_model_filename = "Best_Model_Random_Forest.pkl"  # Replace with your actual file name
model = joblib.load(best_model_filename)

# Load the vectorizer (recreate or use a saved instance if available)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_vectorizer.fit_transform(pd.read_csv("balanced_dataset.csv").drop(columns=["sentiment"]))

# Preprocessing function
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

def preprocess_text(text):
    text = re.sub(r"http\S+|www\S+", "", text)  # Remove URLs
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove special characters
    text = text.lower().strip()  # Lowercase and strip whitespace
    tokens = text.split()
    processed_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return " ".join(processed_tokens)

# Predict function
def predict_sentiment(input_texts):
    # Ensure input is a list
    if isinstance(input_texts, str):
        input_texts = [input_texts]

    # Preprocess input texts
    preprocessed_texts = [preprocess_text(text) for text in input_texts]

    # Vectorize texts
    input_vectorized = tfidf_vectorizer.transform(preprocessed_texts)

    # Make predictions
    predictions = model.predict(input_vectorized)
    return predictions

# Example input
input_texts = [
    "I love this video, it was amazing!",
    "This content is terrible and I hate it.",
    "The video was okay, nothing special."
]

# Predict sentiments
predicted_sentiments = predict_sentiment(input_texts)
for text, sentiment in zip(input_texts, predicted_sentiments):
    print(f"Comment: {text}\nPredicted Sentiment: {sentiment}\n")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Comment: I love this video, it was amazing!
Predicted Sentiment: positive

Comment: This content is terrible and I hate it.
Predicted Sentiment: negative

Comment: The video was okay, nothing special.
Predicted Sentiment: neutral





In [15]:
import joblib
joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.pkl")


['tfidf_vectorizer.pkl']