# **Google Team 3A Google Colab Notebook**

Import Dataset and Tools

In [None]:
import pandas as pd
import numpy as np
from google.colab import files
from datetime import datetime

# Load Dataset Here
import os

DATA_PATH = "US_youtube_trending_data.csv"

# A fresh Colab runtime has no local copy of the CSV; ask for an upload
# instead of failing with FileNotFoundError on the read below.
if not os.path.exists(DATA_PATH):
    files.upload()  # Colab helper imported above; expected to store the chosen file in the working directory

# on_bad_lines='skip' drops malformed rows instead of raising, and
# engine='python' selects the slower but more tolerant parser.
# Note: skipped rows are dropped silently, so the row count may be lower than the file's.
df = pd.read_csv(DATA_PATH, on_bad_lines='skip', engine='python')


FileNotFoundError: [Errno 2] No such file or directory: 'US_youtube_trending_data.csv'

# Exploratory Data Analysis

In [None]:
# Print the frame's (rows, columns) tuple, then preview its first five rows.
print(df.shape)
df.head(n=5)

In [None]:
# Remove the columns that no later cell reads, in a single drop call.
df = df.drop(columns=['tags', 'video_id', 'channelId', 'thumbnail_link'])

# Cleaning: discard rows with zero views, then keep only videos whose
# comments and ratings are both enabled.
df = df.loc[
    (df['view_count'] != 0)
    & df['comments_disabled'].eq(False)
    & df['ratings_disabled'].eq(False)
]
df.head()

In [None]:
def calculate_time_to_trend(row):
    """Return the minutes elapsed between a video's publication and its trending date.

    Args:
        row: Mapping-like record exposing 'trending_date' and 'publishedAt',
            both strings formatted as '%Y-%m-%dT%H:%M:%SZ'.

    Returns:
        float: (trending_date - publishedAt) in minutes; negative when the
        trending timestamp precedes the publication timestamp.

    Raises:
        ValueError: If either timestamp does not match the expected format.
    """
    timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
    trended, published = (
        datetime.strptime(row[column], timestamp_format)
        for column in ('trending_date', 'publishedAt')
    )
    elapsed = trended - published
    return elapsed.total_seconds() / 60

# Evaluate calculate_time_to_trend once per row (axis=1) and store the
# resulting minutes as a new column.
df['time_to_trend_minutes'] = df.apply(func=calculate_time_to_trend, axis=1)
df.head()


In [None]:
# categoryId vs view_count

from matplotlib import pyplot as plt

# One point per video; the top and right spines are hidden.
category_axes = df.plot.scatter(x='categoryId', y='view_count', s=32, alpha=.8)
category_axes.spines[['top', 'right']].set_visible(False)

In [None]:
# Comment vs View Count

from matplotlib import pyplot as plt

# Annotating every row creates one text artist per video, which makes the
# figure unreadable and very slow to render. Only the most-viewed videos
# are labelled instead.
N_LABELED_VIDEOS = 10

# Create the scatter plot
ax = df.plot(kind='scatter', x='comment_count', y='view_count', s=32, alpha=.8)

# Add title labels to the top-N videos by view count
for _, row in df.nlargest(N_LABELED_VIDEOS, 'view_count').iterrows():
    ax.annotate(row['title'], (row['comment_count'], row['view_count']))

# Customize the plot appearance
ax.spines[['top', 'right']].set_visible(False)
ax.set_xlabel("Comment Count")
ax.set_ylabel("View Count")
ax.set_title("Comment Count vs. View Count")

# Show the plot
plt.show()

In [None]:
# Time of Day

# Encoding: 0 = morning (hour < 10), 1 = afternoon (10 <= hour < 17), 2 = night (hour >= 17)
def calculate_time_of_day(row):
    """Bucket a video's publication hour into a coarse time-of-day code.

    The hour is read exactly as written in the 'publishedAt' timestamp; no
    timezone conversion is applied.

    Args:
        row: Mapping-like record exposing 'publishedAt' as a string formatted
            as '%Y-%m-%dT%H:%M:%SZ'.

    Returns:
        int: 0 for hours before 10, 1 for hours 10 through 16, 2 otherwise.

    Raises:
        ValueError: If the timestamp does not match the expected format.
    """
    hour_posted = datetime.strptime(row['publishedAt'], '%Y-%m-%dT%H:%M:%SZ').hour
    if hour_posted >= 17:
        return 2
    if hour_posted >= 10:
        return 1
    return 0

# Evaluate calculate_time_of_day once per row (axis=1) and store the 0/1/2
# code as a new column.
df['time_posted'] = df.apply(func=calculate_time_of_day, axis=1)
df.head()

# Calculate Engagement Features

In [None]:
# Engagement rate = (likes + comment_count) / view_count.
# Dislikes are not included in the sum.
#
# The ratio is kept as a float. Calling round() on the Series with no
# `decimals` rounds to whole numbers, which collapses a fractional ratio
# to 0 or 1 and leaves the feature with almost no information.
df['engagement_rate'] = (df['likes'] + df['comment_count']) / df['view_count']
df.head()


In [None]:
# Like - Dislike Ratio (left disabled)
# df['like_dislike_ratio'] = df['likes'] / (df['dislikes'])

# Comment - View Ratio = comment_count / view_count.
# The ratio is kept as a float: round() with no `decimals` would round it to
# whole numbers and flatten the fractional values to 0.
df['comment_view_ratio'] = df['comment_count'] / df['view_count']

df.head()

# Define Viral vs. Trending Thresholds

In [None]:
# Threshold used to label a video as viral: at least 1,000,000 views.
# NOTE(review): the stated definition is "1,000,000 views in 24 hours", but
# only the view count is checked below; no time window is applied. Confirm intent.
viral_threshold = 1_000_000

# Binary label: 1 when view_count reaches the threshold, otherwise 0.
df['is_viral'] = df['view_count'].ge(viral_threshold).astype(int)

df.head()


#*---- Next steps: split data into training/testing sets ?*


# Split Data (Training vs. Testing Sets)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Report the number of nulls per column, then drop every row containing any null.
missing_values = df.isna().sum()
print("Missing values in each column:\n", missing_values)
df = df.dropna(axis=0, how='any')

# Predictors and target for the view-count regression (try to use clustering?).
# NOTE(review): comment_view_ratio is computed from view_count, the target — confirm this is intended.
X = df.loc[:, ['time_to_trend_minutes', 'time_posted', 'likes', 'comment_view_ratio']]
y = df.loc[:, 'view_count']

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [None]:
# Linear Regression: fit on the training split (fit() returns the estimator
# itself), then predict view_count for the held-out rows.
model = LinearRegression().fit(X_train, y_train)
prediction = model.predict(X_test)

In [None]:
# Metrics

# Root mean squared error, in the same units as view_count.
rmse = np.sqrt(mean_squared_error(y_test, prediction))
print('\nModel Performance\n\nRMSE =   %.2f' % rmse)

# Coefficient of determination: 1 is perfect prediction.
r_squared = r2_score(y_test, prediction)
print(' R^2 =   %.2f' % r_squared)

In [None]:
# Visualization: actual vs. predicted view_count on the test split.
# The x-axis must be y_test (the real values). Plotting X_test would draw the
# four feature columns instead, contradicting the axis label.
plt.scatter(y_test, prediction, color='blue', s=8, alpha=0.4);

# Identity line: points on it are predicted exactly.
axis_low = min(y_test.min(), prediction.min())
axis_high = max(y_test.max(), prediction.max())
plt.plot([axis_low, axis_high], [axis_low, axis_high], color='red', linewidth=1);

plt.xlabel('real view_count');
plt.ylabel('view_count prediction');

In [None]:
# Convert +/-inf to NaN on df itself so that the dropna() below actually
# removes those rows. Replacing on the previous cell's X had no effect on df,
# and X is rebuilt from df further down.
df = df.replace([np.inf, -np.inf], np.nan)

missing_values = df.isnull().sum()
print("Missing values in each column:\n", missing_values)
df = df.dropna()

# Select features to use in the viral / not-viral classifiers
X = df[['engagement_rate', 'comment_view_ratio']]
print(X.head())  # preview only; avoids printing the whole frame

y = df['is_viral']

# Split the data (80% train, 20% test); the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Build the classifier with default settings and train it on the training
# split (fit() returns the estimator itself).
log_reg = LogisticRegression().fit(X_train, y_train)

# Predicted is_viral labels for the held-out rows.
y_pred = log_reg.predict(X_test)

# Share of test rows whose label was predicted correctly.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision/recall/F1 followed by the confusion matrix.
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report_text)
matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", matrix)

In [None]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

In [None]:
# 1. Create the DecisionTreeClassifier and assign it to 'model3'.
#    random_state is fixed so repeated runs build the same tree; without it,
#    ties between equally good splits may be broken differently on each run.
model3 = DecisionTreeClassifier(max_depth=16, min_samples_leaf=1, random_state=123)

# 2. Fit the model to the training data
model3.fit(X_train, y_train)

# 3. Predict is_viral for the test data and store it in 'class_label_predictions'
class_label_predictions = model3.predict(X_test)

# 4. Compute the accuracy and store it in 'acc_score'
acc_score = accuracy_score(y_test, class_label_predictions)
print(acc_score)

In [None]:
# Maybe unsupervised learning to find features? KMeans clustering

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt



In [None]:

# Columns handed to the clustering step.
cluster_features = df.loc[:, ['time_to_trend_minutes', 'engagement_rate', 'comment_view_ratio', 'likes']]

# Standardize each feature column before clustering.
scaler = StandardScaler()
cluster_features_scaled = scaler.fit_transform(cluster_features)

# Number of clusters to look for; the seed keeps the result repeatable.
n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Fit the model and store each row's cluster id in one step.
df['cluster'] = kmeans.fit_predict(cluster_features_scaled)

# Scatter of two of the clustered features, coloured by assigned cluster.
fig, ax = plt.subplots(figsize=(10, 6))
cluster_points = ax.scatter(
    df['time_to_trend_minutes'],
    df['engagement_rate'],
    c=df['cluster'],
    cmap='viridis',
    s=50,
)
ax.set_xlabel('Time to Trend (minutes)')
ax.set_ylabel('Engagement Rate')
ax.set_title(f'KMeans Clustering with {n_clusters} Clusters')
fig.colorbar(cluster_points, ax=ax, label='Cluster')
ax.spines[['top', 'right']].set_visible(False)
plt.show()



In [None]:
# NLP - Title


