# YouTube Data Analysis


## Libraries Installations and Imports

In [118]:
from IPython.display import clear_output

!pip install contractions
!pip install -q transformers
!pip install langdetect

clear_output()

In [119]:
import os
import pandas as pd
import re
import contractions
from transformers import pipeline
from langdetect import detect, DetectorFactory, LangDetectException

## Data Preparation and Preprocessing

### Loading the data

In [88]:
# Function to read all json files in a folder and return a DF concatenating all the information

def read_and_concatenate_json_files(folder):
    dataframes = []

    for filename in os.listdir(folder):
        if filename.endswith('.json'):
            file_path = os.path.join(folder, filename)
            df = pd.read_json(file_path)
            dataframes.append(df)
    df = pd.concat(dataframes, ignore_index=True) # Concatenate all DataFrames
    return df

In [89]:
df = read_and_concatenate_json_files('./data')

In [90]:
df.head()

Unnamed: 0,id,description,view_count,like_count,dislike_count,comment_count,duration,favorite_count,comments,error
0,K26_sDKnvMU,Trailer for Toy Story (1995) captured from the...,116545.0,150.0,0.0,0.0,PT1M31S,0.0,[],<HttpError 403 when requesting https://youtube...
1,3LPANjHlPxo,Jumanji Trailer 1995\nDirector: Joe Johnston\n...,196593.0,286.0,0.0,0.0,PT2M48S,0.0,[],<HttpError 403 when requesting https://youtube...
2,rEnOoWs3FuA,"The more things change, the more they stay the...",221365.0,220.0,0.0,13.0,PT1M52S,0.0,"[Buena película de comedia romántica, <a href=...",
3,j9xml1CxgXI,Whitney Houston and Angela Bassett star in thi...,433261.0,0.0,0.0,0.0,PT2M40S,0.0,[],<HttpError 403 when requesting https://youtube...
4,ltwvKLnj1B4,,,,,,,,[],list index out of range


In [91]:
df.shape

(11484, 10)

In [92]:
data = df.copy()

### Eliminating Duplicates

In [93]:
data['id'].duplicated().sum() # We have some duplicated videos

21

In [94]:
data = data.drop_duplicates(subset=['id'], keep='last')

In [95]:
data.shape

(11463, 10)

In [96]:
data['id'].duplicated().sum() # Checking duplicates were eliminated

0

### Removing Nulls

In [97]:
data.isnull().sum() # Checking the number of nulls per column

id                   0
description       2990
view_count        2990
like_count        2990
dislike_count     2990
comment_count     2990
duration          2990
favorite_count    2990
comments             0
error             7572
dtype: int64

In [98]:
# Droping columns with null values

data = data[~data['description'].isna()]
data.shape

(8473, 10)

In [99]:
data.isnull().sum() # Checking back the number of nulls per column

id                   0
description          0
view_count           0
like_count           0
dislike_count        0
comment_count        0
duration             0
favorite_count       0
comments             0
error             7572
dtype: int64

### Preparing Each Column

#### Duration

In [100]:
data.dtypes

id                 object
description        object
view_count        float64
like_count        float64
dislike_count     float64
comment_count     float64
duration           object
favorite_count    float64
comments           object
error              object
dtype: object

In [101]:
data['duration'].head() # The column durations is in the ISO 8601 duration format

0    PT1M31S
1    PT2M48S
2    PT1M52S
3    PT2M40S
5    PT2M28S
Name: duration, dtype: object

In [102]:
# The following function transform the duration from ISO 8601 to minutes format
def convert_duration_to_minutes(duration):
    pattern = re.compile(r'PT(\d+H)?(\d+M)?(\d+S)?')
    matches = pattern.match(duration)

    hours = int(matches.group(1)[:-1]) if matches.group(1) else 0
    minutes = int(matches.group(2)[:-1]) if matches.group(2) else 0
    seconds = int(matches.group(3)[:-1]) if matches.group(3) else 0

    # Converts duration to minutes
    total_minutes = hours * 60 + minutes + seconds / 60
    total_minutes = round(total_minutes, 2)
    return total_minutes

In [103]:
convert_duration_to_minutes('PT1H2M3S') # checking the function

62.05

In [104]:
# Applying the convert_duration_to_minutes to the 'duration' column

data['duration'] = data['duration'].apply(convert_duration_to_minutes)

In [105]:
data.dtypes

id                 object
description        object
view_count        float64
like_count        float64
dislike_count     float64
comment_count     float64
duration          float64
favorite_count    float64
comments           object
error              object
dtype: object

#### Comments

In [149]:
# Function to clean a text input for sentiment analysis

def clean_text(text):
  if type(text) != str:
    return ""
  # Remove the contractions
  try:
    text = contractions.fix(text)
  except:
    pass
  # Remove URLs
  text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
  # Remove URLs within <a> tags
  text = re.sub(r'<a[^>]*>([^<]*)<\/a>', '', text)
  # Remove special characters and digits
  text = re.sub(r'[^a-zA-Z!\s]', '', text)
  # Remove extra whitespace
  text = re.sub(r'\s+', ' ', text).strip()
  # Convert to lowercase
  text = text.lower()
  return text

In [152]:
# Function to clean all the comments in a given row
DetectorFactory.seed = 0

def clean_comments(comments):
  cleaned_comments = []
  for comment in comments:
    try:
    # Detect the language of the text
      if detect(comment) == 'en':
        cleaned_comment = clean_text(comment)
        cleaned_comments.append(cleaned_comment)
    except LangDetectException:
      continue  # Exclude if language detection fails

  return cleaned_comments

In [155]:
data['comments'] = data['comments'].apply(clean_comments)

In [156]:
data.head()

Unnamed: 0,id,description,view_count,like_count,dislike_count,comment_count,duration,favorite_count,comments,error
0,K26_sDKnvMU,Trailer for Toy Story (1995) captured from the...,116545.0,150.0,0.0,0.0,1.52,0.0,[],<HttpError 403 when requesting https://youtube...
1,3LPANjHlPxo,Jumanji Trailer 1995\nDirector: Joe Johnston\n...,196593.0,286.0,0.0,0.0,2.8,0.0,[],<HttpError 403 when requesting https://youtube...
2,rEnOoWs3FuA,"The more things change, the more they stay the...",221365.0,220.0,0.0,13.0,1.87,0.0,"[im watching this now it never gets old, i was...",
3,j9xml1CxgXI,Whitney Houston and Angela Bassett star in thi...,433261.0,0.0,0.0,0.0,2.67,0.0,[],<HttpError 403 when requesting https://youtube...
5,2GfZl4kuVNI,"Director: Michael Mann.\nCast: Al Pacino, Robe...",1457328.0,7925.0,0.0,668.0,2.47,0.0,[oh these soothing trailers brwhy cant we have...,


## Store Cleaned Data as CSV

In [157]:
data.to_csv('./data/youtube_cleaned.csv', index=False)

## Sentiment Analysis

In [158]:
sentiment_pipeline = pipeline("sentiment-analysis")
d = ["great movie"]
sentiment_pipeline(d)


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9998624324798584}]