# User Behavior Analysis

In [5]:
import pandas as pd


MIND_type = 'MINDsmall'

data_path_base="/app/SUBERX/datasets/"
data_path = data_path_base + MIND_type +"/"


behaviors_file = data_path + "train/behaviors.tsv"
print(f"Behaviors File {behaviors_file}")


# Load the behaviors data
columns = ["impression_id", "user_id", "time", "history", "impressions"]
behaviors_df = pd.read_csv(behaviors_file, sep="\t", names=columns)

# Display basic statistics and data sample
print(behaviors_df.info())
print(behaviors_df.head())

Behaviors File /app/SUBERX/datasets/MINDsmall/train/behaviors.tsv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156965 entries, 0 to 156964
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   impression_id  156965 non-null  int64 
 1   user_id        156965 non-null  object
 2   time           156965 non-null  object
 3   history        153727 non-null  object
 4   impressions    156965 non-null  object
dtypes: int64(1), object(4)
memory usage: 6.0+ MB
None
   impression_id user_id                   time  \
0              1  U13740  11/11/2019 9:05:58 AM   
1              2  U91836  11/12/2019 6:11:30 PM   
2              3  U73700  11/14/2019 7:01:48 AM   
3              4  U34670  11/11/2019 5:28:05 AM   
4              5   U8125  11/12/2019 4:11:21 PM   

                                             history  \
0  N55189 N42782 N34694 N45794 N18445 N63302 N104...   
1  N31739 N6072 N63045 N23979 N35656 N4335

## Statistics

Calculate some statistics

**Number of unique users**: There are 50,000 unique users in the *small* MIND dataset.
**NUmber of sessions**: A session represents a single instance where a user interacts with the platform, such as browsing news or clicking on articles.
**Average Session length**: A longer history provides more context for the recommendation system but can also increase computational complexity.
**Average Impressions per session**: Impressions are the total number of recommendations or articles displayed to a user.


In [10]:
num_users = behaviors_df["user_id"].nunique()
print(f"Number of unique users: {num_users}")


num_sessions = len(behaviors_df)
print(f"Number of sessions: {num_sessions}")

behaviors_df["history_length"] = behaviors_df["history"].fillna("").apply(lambda x: len(x.split(" ")))
avg_history_length = behaviors_df["history_length"].mean()
print(f"Average session length (history length): {avg_history_length:.2f}")

behaviors_df["num_impressions"] = behaviors_df["impressions"].apply(lambda x: len(x.split(" ")))
avg_impressions = behaviors_df["num_impressions"].mean()
print(f"Average impressions per session: {avg_impressions:.2f}")


def calculate_ctr(impressions):
    clicks = sum([1 for impression in impressions.split(" ") if impression.endswith("-1")])
    total = len(impressions.split(" "))
    return clicks / total if total > 0 else 0

behaviors_df["ctr"] = behaviors_df["impressions"].apply(calculate_ctr)
avg_ctr = behaviors_df["ctr"].mean()
print(f"Average CTR: {avg_ctr:.2%}")



Number of unique users: 50000
Number of sessions: 156965
Average session length (history length): 32.56
Average impressions per session: 37.23
Average CTR: 10.85%
