A significant number of people now make a living as content creators across different platforms. Among these platforms, YouTube is known to be a major source of income. On YouTube, a content creator's income is closely tied to the view counts of their videos. We'd like to develop a model that can help us predict the future success of a video …



In [1]:
pip install google-api-python-client pandas

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import googleapiclient.discovery
import pandas as pd
from datetime import datetime, timezone, timedelta

In [3]:
# The YouTube Data API key is read from the environment instead of being written
# into the notebook: a key stored in the file is visible to anyone who can read it
# and can be used to exhaust the daily request quota.
# Set it before starting the kernel, e.g.  export YOUTUBE_API_KEY="<your key>"
# NOTE(review): a key that was ever committed in this file should be revoked and replaced.
API_KEY = os.environ.get("YOUTUBE_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "Set the YOUTUBE_API_KEY environment variable to your YouTube Data API key."
    )

# Build the YouTube Data API v3 client that every request below goes through
youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=API_KEY)


In [4]:
# Thin wrapper around the YouTube search endpoint
def get_videos(query, max_results=50, published_after=None, next_page_token=None):
    """
    Run a single YouTube search request and hand back the raw response.

    The search is restricted to videos (no channels or playlists) and is ordered by date.

    Parameters:
        query (str): The search term.
        max_results (int): Number of results requested for this one call.
        published_after (str): RFC 3339 timestamp forwarded as publishedAfter.
        next_page_token (str): Page token forwarded as pageToken to continue a previous search.

    Returns:
        response (dict): Whatever the executed search request returns.
    """
    search_params = {
        "q": query,
        "part": "snippet",  # basic info only (title, channel, ...)
        "type": "video",
        "maxResults": max_results,
        "publishedAfter": published_after,
        "order": "date",
        "pageToken": next_page_token,
    }
    return youtube.search().list(**search_params).execute()

In [5]:
# Look up statistics and snippet metadata for a batch of video IDs
def get_video_statistics(video_ids):
    """
    Fetch the statistics (views, likes, comments) and snippet metadata (title, category)
    for the given videos in one request.

    Parameters:
        video_ids (list): Video IDs to look up; they are sent as one comma-separated string.

    Returns:
        response (dict): Whatever the executed videos.list request returns.
    """
    id_csv = ",".join(video_ids)
    stats_request = youtube.videos().list(part="statistics, snippet", id=id_csv)
    return stats_request.execute()


In [6]:
# Main collection routine: search for recent videos page by page and pull the stats for each page
def collect_video_data(query, max_results_total=50, max_results_per_page=50):
    """
    Collect statistics and metadata for videos matching a search query that were
    published within the last 24 hours.

    Parameters:
        query (str): The search query.
        max_results_total (int): Upper bound on the number of rows in the result.
        max_results_per_page (int): Number of videos requested per search call.

    Returns:
        pd.DataFrame: One row per video with the columns 'Video ID', 'View Count',
        'Like Count', 'Comment Count', 'Favorite Count', 'Video Category' and
        'Video Published At'.
    """
    rows = []
    collected = 0  # counts the IDs returned by the search, page after page
    page_token = None

    # Lower bound for the publish time: 24 hours before now (UTC), in RFC 3339 format
    cutoff = datetime.now(timezone.utc) - timedelta(days=1)
    published_after = cutoff.strftime("%Y-%m-%dT%H:%M:%SZ")

    while collected < max_results_total:
        search_page = get_videos(
            query,
            max_results=max_results_per_page,
            published_after=published_after,
            next_page_token=page_token,
        )

        ids = [hit['id']['videoId'] for hit in search_page.get('items', [])]
        if not ids:
            break  # the search returned nothing more

        details = get_video_statistics(ids)
        for video in details.get('items', []):
            vid, stats, meta = video['id'], video['statistics'], video['snippet']
            rows.append({
                'Video ID': vid,
                # Counters missing from the response default to 0
                'View Count': int(stats.get('viewCount', 0)),
                'Like Count': int(stats.get('likeCount', 0)),
                'Comment Count': int(stats.get('commentCount', 0)),
                'Favorite Count': int(stats.get('favoriteCount', 0)),
                'Video Category': meta.get('categoryId', 'Unknown'),
                'Video Published At': meta.get('publishedAt', 'Unknown'),
            })

        collected += len(ids)

        # Without a token for the next page there is nothing left to fetch;
        # reaching the total is handled by the loop condition.
        page_token = search_page.get('nextPageToken')
        if not page_token:
            break

    # The last page may overshoot the requested total, so trim before building the frame
    return pd.DataFrame(rows[:max_results_total])


In [7]:
# First collection pass: search, gather the stats and write them to a CSV
if __name__ == "__main__":
    query, max_videos = "Youtube", 50  # search term and cap on the number of videos

    df = collect_video_data(query, max_results_total=max_videos)
    df.to_csv('YP_Youtube_50_Videos2.csv', index=False)

    print("Data collection complete. Saved to YP_Youtube_50_Videos2.csv")


Data collection complete. Saved to YP_Youtube_50_Videos2.csv


In [8]:
# Map YouTube category IDs (kept as strings) to readable category names.
category_mapping = {
    '1': 'Film & Animation',
    '2': 'Autos & Vehicles',
    '10': 'Music',
    '15': 'Pets & Animals',
    '17': 'Sports',
    '18': 'Short Movies',
    '19': 'Travel & Events',
    '20': 'Gaming',
    '21': 'Videoblogging',
    '22': 'People & Blogs',
    '23': 'Comedy',
    '24': 'Entertainment',
    '25': 'News & Politics',
    '26': 'Howto & Style',
    '27': 'Education',
    '28': 'Science & Technology',
    '29': 'Nonprofits & Activism',
    # This list is not exhaustive; add further IDs here as needed.
}

# Load the CSV written by the collection step.
df = pd.read_csv('YP_Youtube_50_Videos2.csv')

# Replace category IDs with names. A plain .map() turns every value that is not a
# key of category_mapping into NaN - that includes IDs missing from the dictionary
# and, when this cell is run a second time, the names written by the first run.
# Falling back to the original value keeps unknown IDs visible and makes the cell
# safe to re-run.
category_ids = df['Video Category'].astype(str)
df['Video Category'] = category_ids.map(category_mapping).fillna(category_ids)

# Write the result back. Note that this overwrites the input file in place.
df.to_csv('YP_Youtube_50_Videos2.csv', index=False)

# Confirm that the file was written.
print("Category mapping completed. Saved to YP_Youtube_50_Videos2.csv")


Category mapping completed. Saved to YP_Youtube_50_Videos2.csv


In [9]:
# Read the category-mapped CSV back in to inspect it
df = pd.read_csv('YP_Youtube_50_Videos2.csv')

# Print up to the first 100 rows
preview = df.head(100)
print(preview)

       Video ID  View Count  Like Count  Comment Count  Favorite Count  \
0   mb5ldcr1jbA      266046        1263              0               0   
1   rtLGwJF9Jxc       88604         557              0               0   
2   HAOgdVdSSkA      576583           0              6               0   
3   WWeOBvAvwrY       21829        4059            532               0   
4   WunqF6zQnbI       31961        1640             16               0   
5   rm4qj0SJKKE        6134        1203              0               0   
6   uDvNq7vtGnM        4031         164             28               0   
7   p8SIBaDl9PE       69614         897              0               0   
8   q_V96gv_IDA       93868        1267              0               0   
9   TS6wQl2Gq_Y       11103         497              1               0   
10  CQMZndfGRWI        8241        1267              0               0   
11  qX6MAp5qQrU      106761        6819             69               0   
12  tcAExHTclc4        8824          6

In [10]:
import pandas as pd

# Read the 'Video ID' column of a CSV file, optionally limited to the first n rows
def get_video_ids_from_csv(file_path, n=None):
    """
    Extract the first n video IDs from a CSV file.

    Parameters:
        file_path (str): Path to the CSV file. It must contain a 'Video ID' column.
        n (int, optional): Number of rows to extract. If None, extract all rows;
            0 yields an empty list.

    Returns:
        list: A list of video IDs.
    """
    df = pd.read_csv(file_path)
    ids = df['Video ID']

    # Compare against None explicitly: a truthiness test would treat n=0 as
    # "no limit" and return every row.
    if n is not None:
        ids = ids.head(n)

    return ids.tolist()


# Use the same relative path the earlier cells write to, so this cell reads the file
# that was just produced and does not depend on one user's home directory.
csv_file_path = 'YP_Youtube_50_Videos2.csv'

# Extract the first 50 video IDs
video_ids_list = get_video_ids_from_csv(csv_file_path, n=50)
print(video_ids_list)

['mb5ldcr1jbA', 'rtLGwJF9Jxc', 'HAOgdVdSSkA', 'WWeOBvAvwrY', 'WunqF6zQnbI', 'rm4qj0SJKKE', 'uDvNq7vtGnM', 'p8SIBaDl9PE', 'q_V96gv_IDA', 'TS6wQl2Gq_Y', 'CQMZndfGRWI', 'qX6MAp5qQrU', 'tcAExHTclc4', '95RPXz_h2Fw', '6kKE4Rurmmk', 'YRyo5vL6XsI', 'Y8hZ5i0TGIs', 'JrD7QKy8PUI', 'f7v8WJgChU8', 'zToiY455lhQ', 'uDVNRYxIMRE', 'Yfcs9OPrQz8', '-n0waD4qzI4', 'qfih0r0sq0c', 'NUzGaZkGTu4', 'I1cPbwuMc3U', '3s6uanj_5DY', 'XzW21Dj6PIQ', 'bD-R-2DPzuc', 'f-Dvm17aVxk', 'RzQpDaa2044', 'Gs1r0uzedBw', 'F4oaCRpuCtU', 'ng_xV81XhHM', 'yGLEpVbJm40', 'lKwo4S2VA3c', 'T0sPOtCwXio', '6iG4Zp8HLOA', 'E-TA8hJIUk8', 'gE4eK3He-7w', 'qWDt3sCTGFg', 'zZsgnPmP6lg', 'PYJ4do-OZAQ', '_AlZYvis4uE', 'Nks9y_kpofE', 'sMY2RlXdreo', 'u5us4Y5Q4AY', 'jX9vfyAyELE', 'IXq9Ex6qLxY', '00-9NCfhVF8']


In [11]:
# Sanity check: number of video IDs that were loaded
len(video_ids_list)

50

In [12]:
import time
import pandas as pd
from datetime import datetime, timezone, timedelta
import googleapiclient.discovery

# Initialize the YouTube API client.
# The key is read from the environment instead of being written into the notebook:
# a key stored in the file is visible to anyone who can read it.
# Set it before starting the kernel, e.g.  export YOUTUBE_API_KEY="<your key>"
import os

API_KEY = os.environ.get("YOUTUBE_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "Set the YOUTUBE_API_KEY environment variable to your YouTube Data API key."
    )
youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=API_KEY)


In [13]:
# Fetch statistics and metadata for a list of video IDs in one request
def get_video_statistics(video_ids):
    """
    Request the 'statistics' and 'snippet' parts for the given videos.

    Parameters:
        video_ids (list): Video IDs to look up; they are joined into one comma-separated string.

    Returns:
        response (dict): Whatever the executed videos.list request returns.
    """
    joined_ids = ",".join(video_ids)
    return youtube.videos().list(part="statistics, snippet", id=joined_ids).execute()

In [14]:
# Poll the same set of videos at a fixed interval and append every snapshot to one CSV
def collect_consecutive_sessions_data(video_ids, sessions=10, output_file="YP_Youtube_Consecutive_Sessions.csv",
                                      interval_seconds=60 * 60):
    """
    Collects video statistics for the same set of videos once per interval over a
    specified number of sessions and saves the results to a CSV file after each session.

    Parameters:
        video_ids (list): List of video IDs for which to fetch statistics.
        sessions (int): The number of consecutive sessions to collect (default is 10).
        output_file (str): The name of the output CSV file. It is overwritten by the first
            session and appended to by every later one.
        interval_seconds (int or float): Pause between two sessions, in seconds
            (default is one hour).

    Returns:
        None: The data is saved to the output CSV file after each session.
    """
    # Fixed column layout, so the header is written even if a session returns no videos
    columns = ['Video ID', 'Session', 'View Count', 'Like Count', 'Comment Count',
               'Favorite Count', 'Video Category', 'Video Published At']
    interval_hours = interval_seconds / 3600  # only used for the progress messages

    for session in range(1, sessions + 1):
        print(f"Collecting data for Session {session} (every {interval_hours:g} hours)...")

        # Fetch a fresh snapshot for the same video IDs
        stats_response = get_video_statistics(video_ids)

        session_data = [
            {
                'Video ID': item['id'],
                'Session': session,
                # Counters missing from the response default to 0
                'View Count': int(item['statistics'].get('viewCount', 0)),
                'Like Count': int(item['statistics'].get('likeCount', 0)),
                'Comment Count': int(item['statistics'].get('commentCount', 0)),
                'Favorite Count': int(item['statistics'].get('favoriteCount', 0)),
                'Video Category': item['snippet'].get('categoryId', 'Unknown'),
                'Video Published At': item['snippet'].get('publishedAt', 'Unknown'),
            }
            for item in stats_response.get('items', [])
        ]

        # Write after every session so an interrupted run keeps what was collected so far:
        # the first session starts a new file with a header, later sessions append rows.
        first_session = session == 1
        pd.DataFrame(session_data, columns=columns).to_csv(
            output_file,
            mode='w' if first_session else 'a',
            header=first_session,
            index=False,
        )

        print(f"Session {session} data collected. Collected data for {len(session_data)} videos.")

        # Only wait when another session follows; sleeping after the last one would
        # just delay the return by a full interval.
        if session < sessions:
            print(f"Waiting {interval_hours:g} hours before the next collection...")
            time.sleep(interval_seconds)


In [None]:
if __name__ == "__main__":

    sessions_to_collect = 30  # Number of consecutive sessions to collect
    # Single source for the file name, so the message below always names the file
    # that was actually written.
    output_file = "YP_Youtube_Consecutive_Sessions4.3.csv"

    # Collect video data over consecutive sessions for the provided video IDs
    collect_consecutive_sessions_data(video_ids_list, sessions=sessions_to_collect, output_file=output_file)
    print(f"Data collection for consecutive 1-hour sessions complete. Saved to {output_file}")

Collecting data for Session 1 (every 1 hours)...
Session 1 data collected. Collected data for 50 videos.
Waiting 1 hours before the next collection...


In [None]:
import pandas as pd

# Load the dataset for the analysis below; the first line of the file supplies the
# column names (pandas' default when no names are passed).
df = pd.read_csv('YP_Youtube1_4.csv')

# Preview the first rows
df.head()

In [None]:
# NOTE(review): this rebinds the builtin name `len` to the row count, so any len(...)
# call made after this cell fails. Check every other cell that uses `len` before renaming.
len = df.shape[0]
# Replace each 'Video Published At' value with the result of convert_to_dt_obj, row by row.
# Addressing rows by the labels 0..n-1 assumes the default integer index.
# NOTE(review): convert_to_dt_obj is not defined in the visible part of this notebook -
# presumably it parses the timestamp string into a datetime object; confirm it is
# defined before this cell runs.
for i in range(len):
    df.loc[i,'Video Published At'] = convert_to_dt_obj(df['Video Published At'][i])

# Preview the converted column
df.head()


In [None]:
# Check what type the first 'Video Published At' value has
print(type(df['Video Published At'][0]))

In [None]:
# Order the rows by publish time, earliest first, and preview the result.
# NOTE(review): the name `sorted` hides the builtin sorted() for the rest of the session.
sorted = df.sort_values('Video Published At')
sorted.head()

In [None]:
# category_mapping = {
#     '1': 'Film & Animation',
#     '2': 'Autos & Vehicles',
#     '10': 'Music',
#     '15': 'Pets & Animals',
#     '17': 'Sports',
#     '18': 'Short Movies',
#     '19': 'Travel & Events',
#     '20': 'Gaming',
#     '21': 'Videoblogging',
#     '22': 'People & Blogs',
#     '23': 'Comedy',
#     '24': 'Entertainment',
#     '25': 'News & Politics',
#     '26': 'Howto & Style',
#     '27': 'Education',
#     '28': 'Science & Technology',
#     '29': 'Nonprofits & Activism'
#     # More categories can be added here if needed, but this should cover the most common ones. I didn't find a better list/map yet if  you do you  can add here
# }

In [None]:
# Category names that count as recreational content.
# 'Gaming' and 'Videoblogging' must be separate elements: without the comma between
# them Python joins the two adjacent literals into the single string
# 'GamingVideoblogging', and neither category would ever match.
recreation = [
    'Film & Animation',
    'Music',
    'Pets & Animals',
    'Sports',
    'Short Movies',
    'Travel & Events',
    'Gaming',
    'Videoblogging',
    'People & Blogs',
    'Comedy',
    'Entertainment',
]

# Category names that count as informational content.
info = ['News & Politics', 'Howto & Style', 'Education', 'Science & Technology', 'Nonprofits & Activism']

In [None]:
def re_categorize(string,recreation_list):
    """
    Reduce a category name to one of two coarse labels.

    Parameters:
        string: The category name to classify.
        recreation_list: Collection of names that count as recreation.

    Returns:
        str: 'Recreation' if the name is in recreation_list, otherwise 'Info'.
    """
    return 'Recreation' if string in recreation_list else 'Info'



# Collapse every category name into 'Recreation' or 'Info'.
# Series.map applies re_categorize to each value of the column, so this cell needs
# neither a row counter nor a particular index.
df['Video Category'] = df['Video Category'].map(lambda name: re_categorize(name, recreation))

df.head()


In [None]:
# Group the rows by 'Video Category'; nothing is aggregated yet at this point
grouped_df = df.groupby(['Video Category'])
