# NLP Sentiment Analysis on YouTube Channel Analytics

    # Import the necessary libraries

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string
from bs4 import BeautifulSoup
import isodate
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
import plotly.express as px
import pandas as pd
import seaborn as sns
import plotly.graph_objects as go
import plotly.offline as pyo
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the CSV export into the DataFrame that every later cell works on.
data = pd.read_csv('gaugegadget.csv')

In [None]:
# Preview the first rows of the freshly loaded data.
data.head()

In [None]:
# (rows, columns) of the loaded data.
data.shape

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

    # Drop columns that add little value.

In [None]:
# Columns the notebook treats as uninformative; removed before any analysis.
unused_columns = ['favouriteCount', 'video_id']
data = data.drop(columns=unused_columns)


In [None]:
# Preview the data after the column drop.
data.head()

    # Convert publishedAt and duration to a more readable format

In [None]:
# Parse publishedAt into pandas datetime values.
data['publishedAt'] = pd.to_datetime(data['publishedAt'])
def convert_duration(duration_str):
    """Convert an ISO 8601 duration string to a length in minutes.

    Args:
        duration_str: Duration text such as ``'PT4M13S'``. Any value that is
            not a string (for example the NaN pandas uses for an empty cell)
            is treated as missing.

    Returns:
        The duration in minutes as a float, or NaN when the input is missing.
    """
    # Guard first: a missing cell would otherwise raise inside the parser and
    # abort the whole ``Series.apply`` call.
    if not isinstance(duration_str, str):
        return float('nan')
    duration = isodate.parse_duration(duration_str)
    return duration.total_seconds() / 60

# Replace each duration value with its length in minutes (see convert_duration).
data['duration'] = data['duration'].apply(convert_duration)

In [None]:
# Preview the converted publishedAt and duration columns.
data.head()

In [None]:
# Order the rows from longest to shortest duration, modifying data in place.
data.sort_values(by='duration', ascending=False, inplace=True)
# Show the rows with the longest durations.
data.head()

    # Define a function for cleaning the text.

In [None]:
_ENGLISH_STOPWORDS = None  # populated on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stopwords as a set, built only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def process_text(text):
    """Normalise free text into a space-separated string of lemmatised tokens.

    The pipeline strips HTML markup, lowercases, deletes every character in
    ``string.punctuation``, tokenises with NLTK, drops English stopwords and
    lemmatises what is left. Digits are kept.

    Args:
        text: The raw text. Any value that is not a string (for example the
            NaN pandas uses for an empty cell) is treated as empty text.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` for missing input.
    """
    # Missing cells reach this function through ``Series.apply``; return an
    # empty result instead of letting the HTML parser raise on a float.
    if not isinstance(text, str):
        return ''

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation (done before tokenising, so "ove@#$r" becomes "over")
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenization
    tokens = nltk.word_tokenize(text)
    # Remove stopwords using a cached set, so the list is not rebuilt for
    # every single token.
    stop_words = _english_stopwords()
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

# Example usage: run the cleaner on a sentence with mixed case and stray
# punctuation and print the result.
text = 'The quick brown foxES aRe jumping ove@#$r the la#$zy dogs!'
print(process_text(text))

    # Feature Engineering

In [None]:
# Keep only rows that have tags. The label is passed as a list because pandas
# releases before 1.5 do not accept a bare string for ``subset``.
data = data.dropna(subset=['tags'])

In [None]:
# Clean each text column and store the result next to it as processed_<column>.
for source_column in ('tags', 'description', 'title'):
    data[f'processed_{source_column}'] = data[source_column].apply(process_text)

In [None]:
# Preview the new processed_* columns.
data.head()

In [None]:
# Concatenate every non-missing cleaned value of each column into one string,
# which is what the word clouds are generated from.
all_tags, all_title, all_description = (
    ' '.join(data[processed_column].dropna())
    for processed_column in (
        'processed_tags',
        'processed_title',
        'processed_description',
    )
)


In [None]:
# Mask image paths, one per word cloud
g_path = 'Gletter.png'  # Replace with the path to your mask image
t_path = 'T.png'
d_path = 'Dletter.png'

# Open each mask, then turn it into the numpy array WordCloud takes as `mask`
g_image, t_image, d_image = (
    Image.open(mask_path) for mask_path in (g_path, t_path, d_path)
)
g_array, t_array, d_array = (
    np.array(mask_image) for mask_image in (g_image, t_image, d_image)
)

# Options shared by all three clouds
shared_options = dict(
    width=800,
    height=400,
    background_color='white',
    contour_width=0,
    contour_color='black',
)
# The tag and title clouds also set max_words and min_font_size explicitly;
# the description cloud leaves both at WordCloud's defaults.
capped_options = dict(shared_options, max_words=200, min_font_size=10)

gcloud = WordCloud(mask=g_array, **capped_options).generate(all_tags)
tcloud = WordCloud(mask=t_array, **capped_options).generate(all_title)
dcloud = WordCloud(mask=d_array, **shared_options).generate(all_description)

# Draw each cloud in its own figure with the axes hidden
for cloud, heading in (
    (gcloud, 'Most Common Words in Tags'),
    (tcloud, 'Most Common Words in Title'),
    (dcloud, 'Most Common Words in Description'),
):
    plt.figure(figsize=(10,5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(heading)
    plt.show()


    # Plotting some visuals to gain insights

In [None]:
# Convert first, so that values which fail to parse (turned into NaT/NaN by
# errors='coerce') are removed by the dropna below together with the values
# that were missing to begin with.
data['publishedAt'] = pd.to_datetime(data['publishedAt'], errors='coerce')
data['viewCount'] = pd.to_numeric(data['viewCount'], errors='coerce')

# Drop rows that lack a publish date, a view count or a title
data = data.dropna(subset=['publishedAt', 'viewCount', 'title'])

# Month of publication as a monthly period; used as the grouping key below
data['month_year'] = data['publishedAt'].dt.to_period('M')

# Aggregation helper: pick the index labels that carry the largest values
def top_titles(series, top_n=3):
    """Return the labels of the ``top_n`` largest values, joined by ``<br>``.

    Args:
        series: A pandas Series whose index holds the labels (titles) and
            whose values are the numbers to rank by.
        top_n: How many labels to keep. Defaults to 3.

    Returns:
        The selected labels, largest value first, joined with ``'<br>'``.
        An empty series yields ``''``.
    """
    ranked = series.sort_values(ascending=False)
    best_labels = ranked.head(top_n).index
    # str() keeps the join from failing when a label is not already a string.
    return '<br>'.join(str(label) for label in best_labels)

# Group by 'month_year': total the views and list the most-viewed titles.
# Indexing by title first means each group handed to top_titles is a Series
# of view counts labelled by title, so titles are ranked by their views
# (not by how many times the same title text occurs in the month).
views_by_title = data.set_index('title').groupby('month_year')['viewCount']
data_grouped = pd.DataFrame({
    'viewCount': views_by_title.sum(),
    'title': views_by_title.apply(top_titles),
}).reset_index()

# Interactive line chart of total views per month.
# The period column is converted to text for the x axis.
month_labels = data_grouped['month_year'].astype(str)
axis_labels = {'viewCount': 'Total Views', 'month_year': 'Month-Year'}
# 'title' is passed through hover_data so this template can read it back
# as customdata[0].
hover_text = 'Month-Year: %{x}<br>Total Views: %{y:.2f}<br>Top Titles: %{customdata[0]}'

fig = px.line(
    data_grouped,
    x=month_labels,
    y='viewCount',
    title='Monthly Views Trend',
    labels=axis_labels,
    hover_data={'title': True},
)

# Show markers on the line and apply the custom hover text
fig.update_traces(mode='lines+markers', hovertemplate=hover_text)
fig.update_layout(xaxis_title='Month-Year')

# Render in the notebook, then export the same figure to line.html
fig.show()
pyo.plot(fig, filename='line.html', config={'displayModeBar': False})


In [None]:
# Pairwise plots of the three engagement counts
numerical_features = ['viewCount', 'likeCount', 'commentCount']
numeric_subset = data[numerical_features]
sns.pairplot(numeric_subset)
plt.show()


In [None]:
# Likes against views, one point per row of data
ax = sns.scatterplot(data=data, x='viewCount', y='likeCount')
ax.set_title('Scatter plot between View Count and Like Count')
plt.show()
