In [1]:
import pandas as pd

In [2]:
# Load the US videos CSV into a DataFrame.
# Later cells use its "title", "views" and "category_id" columns.
us_data = pd.read_csv("./data/USvideos.csv")

In [None]:
# Print a summary of us_data (columns, non-null counts, dtypes)
us_data.info()

In [3]:
# A title can appear on several rows (one per date), so collapse those rows
# into a single summed view count per title.
# NOTE(review): `us_aff` looks like a typo for `us_agg`; name kept because other cells use it.
us_aff = us_data.groupby(by="title")["views"].agg("sum")

In [4]:
# Sort the per-title totals from highest to lowest.
# Note: the result is still a Series indexed by title, not a DataFrame.
sorted_us_agg = us_aff.sort_values(ascending=False)

In [5]:
# Attach each title's summed views to every original row (left join on "title").
# Both inputs carry a "views" column, so pandas suffixes them:
#   "views_x" -> the per-row value from us_data
#   "views_y" -> the per-title total from sorted_us_agg
# The joined rows are then ordered by the total, largest first.
merged_us_agg = pd.merge(
    us_data,
    sorted_us_agg,
    how="left",
    on="title",
).sort_values(by="views_y", ascending=False)

In [None]:
# Preview the first 10 rows of the view-sorted merge, limited to the key columns.
# A title can show up more than once here; duplicate titles are only dropped further down.
merged_us_agg[["title", "category_id", "views_x", "views_y"]].head(10)

In [7]:
# Work on an independent copy so that removing duplicate rows
# leaves merged_us_agg untouched
copy_merged = merged_us_agg.copy(deep=True)

In [8]:
# Keep a single row per title (drop_duplicates retains the first occurrence
# by default). The name is rebound to the returned frame instead of using
# `inplace=True`, so the statement stays chainable and does not rely on
# in-place mutation.
copy_merged = copy_merged.drop_duplicates(subset=["title"])

In [None]:
# Order by the per-title total (descending) and display the first 10 rows
copy_merged.sort_values(by="views_y", ascending=False).head(10)

In [10]:
# Keep only the required columns and take the first 10 rows
top_10_us_videos = copy_merged[["title", "category_id", "views_x", "views_y"]].head(10)

In [11]:
# Add a "region" column with the constant value "US" on every row
top_10_us_videos = top_10_us_videos.assign(region="US")

In [6]:
# Load the US category JSON file into a DataFrame.
# Later cells read its "items" column.
us_category_data_raw = pd.read_json("./data/US_category_id.json")

In [None]:
# Print a summary of the raw category frame (columns, non-null counts, dtypes)
us_category_data_raw.info()

In [13]:
# Expand the records held in the "items" column into a DataFrame with one
# column per key (each entry is expected to be a dict-like record; the later
# "snippet" lookup relies on that).
# The frame is built from the list of records in a single call rather than
# with `Series.apply(pd.Series)`, which constructs a separate Series for
# every row. The original index is preserved.
us_category_data = pd.DataFrame(
    us_category_data_raw["items"].tolist(),
    index=us_category_data_raw.index,
)

In [14]:
# Helper that pulls the category name out of a single category record
def get_category_name(item):
    """Return the value stored at ``item["snippet"]["title"]``.

    ``item`` is any mapping-like object (for example a DataFrame row) that
    has a ``"snippet"`` entry which itself has a ``"title"`` entry.
    """
    snippet = item["snippet"]
    return snippet["title"]
    

In [15]:
# Run get_category_name on every row and store the result
# in a new "category_name" column
us_category_data["category_name"] = us_category_data.apply(
    get_category_name,
    axis="columns",
)

In [16]:
# Cast the "id" column to a numeric dtype so it can be used as a merge key
us_category_data["id"] = us_category_data["id"].pipe(pd.to_numeric)

In [17]:
# Left-join the category table onto the top 10 videos
# (top_10_us_videos.category_id -> us_category_data.id)
# so each video row carries its category details, including the name
us_final_data = top_10_us_videos.merge(
    us_category_data,
    how="left",
    left_on="category_id",
    right_on="id",
)

In [None]:
# Final result for the US: the 10 selected videos joined with their category details
us_final_data

In [2]:
# Load the contents of cleaned_data.csv into a DataFrame
clean_data = pd.read_csv("./data/cleaned_data.csv")

In [None]:
# Print a summary of clean_data (columns, non-null counts, dtypes)
clean_data.info()

In [None]:
# NOTE(review): this cell repeats the clean_data.info() call from the cell
# directly before it; one of the two can probably be removed.
clean_data.info()