# Duration Model

In [80]:
import glob
from typing import List, Literal

import numpy as np
import pandas as pd
from pandas import DataFrame, Series, Timedelta

* All dataframes should have:
  * `scanTimeStamp` column: a timestamp designating the collection time (in absolute seconds)
  * Keep a consistent naming style for collected data files. I suggest : `dataset_scan_{date}_{hour}_{min}`

Here is an enhanced version of the text:

- **Objective (alternative 1)**: Predict the probability that a video **_leaves_** the YouTube Trending list, given its characteristics.
- [x] **Objective (alternative 2)**: Predict the probability that a video **_enters_** the YouTube Trending list, given its characteristics. And given that it is trending, predict its rank.

- **Output**
  - Probability of entering the Trending list given some characteristics, at the current date/time
  - Display survival curves by day of week / by videoCategory / by video-length category (0-5 min, 5-10 min, 10-25 min, 25+ min)

- **Features**: 
  - **videoID**: a unique identifier of the video
  - **scanDateTime**: the date and time of collecting the sample. It identifies a group of videos collected together at a given time, i.e., the set of videos in YouTube Trending at the moment of the scan.
  - **videoPublishDate**: the date and time when the video was published
  - **Trend date**: the date and time when the video entered the YouTube Trending list
  - **creatorSubscriberNumber**: the number of subscribers of the video creator
  - **videoTrendsRanking**: the video's ranking in the YouTube Trending list
  - **videoLengthSeconds**: the video's length in seconds
  - **videoType**: the video's type (now, recently trending, short)
  - **videoCategory**: the video's category
  - **exactViewNumber**: the number of views of the video
  - **numberLikes**: the number of likes of the video
  - **numberOfComments**: the number of comments on the video
  - **isCreatorVerified**: a binary indicator of whether the video creator is verified or not
  - **videoKeywords**: the keywords associated with the video


**Preprocessing**
* Load the data
* Wrangle the data (extract relevant information from each column and convert to the appropriate type)
* (alternative 1) Create a binary indicator of the presence of the video in the YouTube Trending list. By default, all videos observed in the sample have value "1" (because we are collecting only trending videos). Then, the first time the video disappears from the dataset, we create a row for this video with the value "0" for the indicator variable.
* [x] (alternative 2) Create a binary indicator of the presence of the video in the YouTube Trending list. This case is simpler: comparing the publish date with the trend date gives the duration before the video enters the Trending list. We define observation periods in which videos whose trend date has not yet been reached get the value "0" for the indicator variable. The indicator variable is "1" only on the trend date and after. We can delete all observations of the video after its first entry in the Trending list (i.e., after its trend date).  
  * **start date**: earliest publish date
  * **end date**: one day/hour before the most recent scraping date (note: `create_duration_model_columns` currently subtracts 1.5 units of the chosen frequency)
  * **Assumption 1**: first observation scraping date = first observation trend date
  * **Assumption 2**: we observe the video from its publish date until the end date
  * **duration** = first observation scraping date - the video's own publish date
  * **isTrend** = 1 if the video has been observed between [min(publishDate), end date]

**Modeling**
* Train a model to predict the variable "isTrend"

In [81]:
# Read the polled dataset and drop the two "Unnamed" columns
# (presumably index columns left over from earlier CSV writes -- TODO confirm).
duration_df = pd.read_csv("../data/Poling_dataset.csv").drop(columns=["Unnamed: 0", "Unnamed: 0.1"])
# Print (rows, columns), then show the first five rows as the cell output.
print(duration_df.shape)
duration_df.head(5)

(785, 15)


Unnamed: 0,videoId,scanTimestamp,videoExactPublishDate,creatorSubscriberNumber,videoTrendsRanking,videoLengthSeconds,videoType,videoCategory,trendingCountry,exactViewNumber,numberLikes,numberOfComments,isCreatorVerified,videoKeywords,Epoch
0,bfmUsDJjBkU,1702948000.0,2023-12-17T02:02:00-08:00,5.79M subscribers,0,4585,Now,Comedy,FR,"2,489,981 views",183K,3.1K,True,"['Mastu', 'Mastus', 'Humour', 'Matsu', 'loat',...",1
1,9gAADPMt1FU,1702948000.0,2023-12-17T10:28:05-08:00,724K subscribers,1,1056,Now,Comedy,FR,"325,905 views",57K,1.2K,True,,1
2,LnW-DwBrWLk,1702948000.0,2023-12-17T07:59:59-08:00,4M subscribers,0,827,Now,Comedy,FR,"492,696 views",39K,563,True,,1
3,K4GQRiuNpP0,1702948000.0,2023-12-17T08:00:33-08:00,1.99M subscribers,1,2773,Now,Howto & Style,FR,"301,552 views",12K,438,False,,1
4,aZOiDNzoeWQ,1702948000.0,2023-12-17T02:17:24-08:00,9.19M subscribers,2,1679,Now,Entertainment,FR,"979,201 views",63K,1.7K,True,"['michou', 'roue', 'tournes', 'souffres', 'vid...",1


In [88]:
def load_data(
        folder: str = None,
        pattern: str = "dataset*",
        use_filenames: bool = False,
        filenames: List[str] = None) -> pd.DataFrame:
    """
    Load CSV files from a folder (by glob pattern) or from an explicit list
    and stack them into a single DataFrame.

    Each file is read with its first column as the index; the per-file
    indexes are then discarded and the result gets a fresh 0..n-1 index.

    Args:
        folder (str, optional): The path to the folder containing the data files.
            Required when 'use_filenames' is False.
        pattern (str, optional): The glob pattern to match files in the folder.
        use_filenames (bool, optional): If True, use the filenames provided in the
            'filenames' parameter. If False, load all files in 'folder' that
            match 'pattern'.
        filenames (List[str], optional): A list of file paths to load, read in
            the given order. Only used if 'use_filenames' is True.

    Returns:
        pd.DataFrame: A DataFrame containing the rows of all loaded files.

    Raises:
        ValueError: If the required 'folder' / 'filenames' argument is missing,
            or if there is no file to load.
    """
    if use_filenames:
        if filenames is None:
            raise ValueError("'filenames' must be provided when 'use_filenames' is True.")
        files = list(filenames)
    else:
        if folder is None:
            raise ValueError("'folder' must be provided when 'use_filenames' is False.")
        # glob returns matches in arbitrary, OS-dependent order; sorting makes
        # the row order of the concatenated frame reproducible across runs.
        files = sorted(glob.glob(rf"{folder}/{pattern}"))

    if not files:
        # Fail with an actionable message instead of pandas'
        # "No objects to concatenate".
        raise ValueError(
            f"No data files to load (folder={folder!r}, pattern={pattern!r}, "
            f"use_filenames={use_filenames})."
        )

    dfs = [pd.read_csv(file, index_col=0) for file in files]
    return pd.concat(dfs, ignore_index=True)


def _parse_numeric_column(series: Series) -> Series:
    """
    Parse a numeric column, handling 'K' and 'M' suffixes.

    Args:
        series (Series): The series to parse.

    Returns:
        Series: The parsed series, with 'K' and 'M' suffixes converted to numeric values.
    """
    # Normalize columns (remove whitespace, lowercase, remove ",")
    series = series.str.strip().str.lower().replace(',', '', regex=True)

    # Define a regex pattern to match numbers with optional K or M suffix
    pattern = r'(\d+(?:\.\d+)?)([KkMm])?'  # regex group capture

    # Extract the numeric part and the suffix.
    result_df = series.str.extract(pattern, expand=True)
    numeric_part = pd.to_numeric(result_df[0], errors='coerce')
    suffix_series = result_df[1]

    # Define a dictionary to map suffixes to multiplication factors
    suffix_multiplier = {'K': 1e3, 'k': 1e3, 'M': 1e6, 'm': 1e6}

    # Multiply by the corresponding factor based on the suffix
    multiplier = suffix_series.map(suffix_multiplier)

    # Replace NaN values with 1 (default multiplier for rows without a suffix)
    multiplier = multiplier.fillna(1)

    # Multiply the numeric part by the multiplier, ensure numeric type.
    result_series = numeric_part * multiplier
    result_series = pd.to_numeric(result_series, downcast='integer')

    return result_series


def clean_columns(data: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of the raw scan data with typed date and count columns.

    The input frame is left untouched.

    Args:
        data (pd.DataFrame): The raw DataFrame. It must contain the columns
            "videoExactPublishDate", "scanTimeStamp", "numberLikes",
            "exactViewNumber", "numberOfComments" and "creatorSubscriberNumber".

    Returns:
        pd.DataFrame: A copy where the two date columns are UTC datetimes and
            the four count columns have been parsed by `_parse_numeric_column`.
    """
    cleaned = data.copy()

    # Publish dates are parsed as-is; scan timestamps are read as a number of
    # seconds (unit="s"). Both are converted to UTC.
    cleaned["videoExactPublishDate"] = pd.to_datetime(
        cleaned["videoExactPublishDate"], utc=True
    )
    cleaned["scanTimeStamp"] = pd.to_datetime(
        cleaned["scanTimeStamp"], unit="s", utc=True
    )

    # Human-readable counts ("183K", "2,489,981 views", ...) -> numbers.
    count_columns = (
        "numberLikes",
        "exactViewNumber",
        "numberOfComments",
        "creatorSubscriberNumber",
    )
    for column in count_columns:
        cleaned[column] = _parse_numeric_column(cleaned[column])

    return cleaned


def create_duration_model_columns(
        data: DataFrame,
        frequency: Literal["hour", "day"] = "hour",
        margin: float = 1.5) -> DataFrame:
    """
    Add the duration-model columns to cleaned scan data.

    Three columns are added:
      * "firstTrendingTime": the earliest "scanTimeStamp" observed for the
        row's "videoId".
      * "timeToTrendSeconds": seconds between "videoExactPublishDate" and
        "firstTrendingTime".
      * "isTrend": True when "firstTrendingTime" lies in the closed interval
        [start date, end date], where the start date is the earliest publish
        date and the end date is the latest scan time minus
        `margin` x `frequency`.

    Args:
        data (DataFrame): The DataFrame containing the data. "scanTimeStamp"
            and "videoExactPublishDate" must already be datetimes.
        frequency (Literal["hour", "day"]): The time unit of the margin
            subtracted from the latest scan time.
            Other possible values: see `pandas.Timedelta()`.
        margin (float): How many `frequency` units to subtract from the latest
            scan time to obtain the end date. Defaults to 1.5, i.e. 1.5 hours
            or 1.5 days.

    Returns:
        DataFrame: A copy of the data with the three columns above, sorted by
            index.
    """
    df = data.copy()

    # Observation window: from the earliest publish date up to `margin`
    # units before the latest scan.
    start_date = pd.to_datetime(df["videoExactPublishDate"].min())
    end_date = pd.to_datetime(df["scanTimeStamp"].max()) - Timedelta(value=margin, unit=frequency)

    # Compute the time spent before entering the trending list: the first scan
    # in which a video appears is taken as the moment it started trending.
    first_trending_time = (
        df.groupby("videoId")["scanTimeStamp"].min().rename("firstTrendingTime")
    )
    df = df.merge(first_trending_time, left_on="videoId", right_index=True)
    df["timeToTrendSeconds"] = (
        df["firstTrendingTime"] - df["videoExactPublishDate"]
    ).dt.total_seconds()

    # Determine whether or not the video started trending inside the window
    # (both bounds inclusive).
    df["isTrend"] = df["firstTrendingTime"].between(start_date, end_date)

    # The merge may reorder rows; sort by index before returning.
    return df.sort_index()


def processing_for_duration_model(
        folder: str = None,
        pattern: str = "dataset*",
        use_filenames: bool = False,
        filenames: List[str] = None,
        frequency: Literal["hour", "day"] = "hour") -> DataFrame:
    """
    Run the full preprocessing pipeline for the duration model.

    Chains the three steps defined in this file: `load_data`, then
    `clean_columns`, then `create_duration_model_columns`.

    Args:
        folder (str, optional): The path to the folder containing the data files.
        pattern (str, optional): The pattern to match files in the folder.
        use_filenames (bool, optional): If True, load the files listed in
            'filenames'. If False, load the files of 'folder' matching 'pattern'.
        filenames (List[str], optional): A list of filenames to load. Only used
            if 'use_filenames' is True.
        frequency (Literal["hour", "day"], optional): The time unit forwarded to
            `create_duration_model_columns`, which uses it to move the end of
            the observation window back from the latest scan time.
            Other possible values: see `pandas.Timedelta()`.

    Returns:
        DataFrame: The processed DataFrame for the duration model.
    """
    # Step 1: read and stack the raw scan files.
    raw = load_data(
        folder=folder,
        pattern=pattern,
        use_filenames=use_filenames,
        filenames=filenames,
    )

    # Step 2: type the columns; step 3: derive the duration-model columns.
    return create_duration_model_columns(clean_columns(raw), frequency)


# Usage: run each pipeline step separately, then all at once.
# 1. Load every file matching the default "dataset*" pattern in ../data.
df = load_data("../data")
print(df.shape, df.columns)

# Sanity check of the count parser on hand-written samples: a thousands
# separator, a "K" suffix, an "M" suffix, and an entry without any digit.
test_data = {
    'views': ['300,000 views', '400.75K views', '2.5M views', 'NoSuffix']
}
test_df = pd.DataFrame(test_data)
test_df['result'] = _parse_numeric_column(test_df['views'])
print(test_df)

# 2. Convert the date and count columns to proper types.
df1 = clean_columns(df)
df1.head()

# 3. Add the duration-model columns and print the share of rows with isTrend True.
df2 = create_duration_model_columns(df1)
print(df2.shape, df2["isTrend"].sum() / len(df2))
df2.head()

# 4 (ALL IN ONE): same three steps through the wrapper function.
super_df = processing_for_duration_model("../data/")
# Bare expression as the last statement of the cell.
super_df

(785, 15) Index(['videoId', 'videoExactPublishDate', 'creatorSubscriberNumber',
       'videoTrendsRanking', 'videoLengthSeconds', 'videoType',
       'videoCategory', 'trendingCountry', 'exactViewNumber', 'numberLikes',
       'numberOfComments', 'isCreatorVerified', 'videoKeywords',
       'scanTimeStamp', 'Epoch'],
      dtype='object')
           views     result
0  300,000 views   300000.0
1  400.75K views   400750.0
2     2.5M views  2500000.0
3       NoSuffix        NaN
(785, 18) 0.9974522292993631


Unnamed: 0,videoId,videoExactPublishDate,creatorSubscriberNumber,videoTrendsRanking,videoLengthSeconds,videoType,videoCategory,trendingCountry,exactViewNumber,numberLikes,numberOfComments,isCreatorVerified,videoKeywords,scanTimeStamp,Epoch,firstTrendingTime,timeToTrendSeconds,isTrend
0,bfmUsDJjBkU,2023-12-17 10:02:00+00:00,5790000,0,4585,Now,Comedy,FR,2489981,183000.0,3100.0,True,"['Mastu', 'Mastus', 'Humour', 'Matsu', 'loat',...",2023-12-19 01:03:55.763489024+00:00,1,2023-12-19 01:03:55.763489024+00:00,140515.763489,True
1,9gAADPMt1FU,2023-12-17 18:28:05+00:00,724000,1,1056,Now,Comedy,FR,325905,57000.0,1200.0,True,,2023-12-19 01:03:55.763489024+00:00,1,2023-12-19 01:03:55.763489024+00:00,110150.763489,True
2,LnW-DwBrWLk,2023-12-17 15:59:59+00:00,4000000,0,827,Now,Comedy,FR,492696,39000.0,563.0,True,,2023-12-19 01:03:55.763489024+00:00,1,2023-12-19 01:03:55.763489024+00:00,119036.763489,True
3,K4GQRiuNpP0,2023-12-17 16:00:33+00:00,1990000,1,2773,Now,Howto & Style,FR,301552,12000.0,438.0,False,,2023-12-19 01:03:55.763489024+00:00,1,2023-12-19 01:03:55.763489024+00:00,119002.763489,True
4,aZOiDNzoeWQ,2023-12-17 10:17:24+00:00,9190000,2,1679,Now,Entertainment,FR,979201,63000.0,1700.0,True,"['michou', 'roue', 'tournes', 'souffres', 'vid...",2023-12-19 01:03:55.763489024+00:00,1,2023-12-19 01:03:55.763489024+00:00,139591.763489,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
780,_Q4pcR5H2OE,2023-12-16 09:30:17+00:00,290000,10,60,Short,Entertainment,FR,451293,73000.0,613.0,True,,2023-12-19 07:32:00.194941184+00:00,7,2023-12-19 01:03:55.763489024+00:00,228818.763489,True
781,iFDN9N-GIrw,2023-12-16 10:05:05+00:00,751000,11,60,Short,Entertainment,FR,684282,,484.0,True,,2023-12-19 07:32:00.194941184+00:00,7,2023-12-19 01:03:55.763489024+00:00,226730.763489,True
782,9AYGP5imeWk,2023-12-16 15:02:32+00:00,321000,12,60,Short,Entertainment,FR,408654,23000.0,37.0,True,,2023-12-19 07:32:00.194941184+00:00,7,2023-12-19 01:03:55.763489024+00:00,208883.763489,True
783,IMsEt4YBF3Q,2023-12-16 17:20:20+00:00,22900,13,42,Short,Gaming,FR,258762,17000.0,33.0,False,,2023-12-19 07:32:00.194941184+00:00,7,2023-12-19 01:03:55.763489024+00:00,200615.763489,True


In [83]:
# class duration_model():
#     # instantiate model
#     # train model (fit method)
#     # prediction method
#     # prediction proba method
#     # survival curves (use the features and their categories as parameters)
#     # save model method
#     # load model method
#     pass