In [1]:
import pandas as pd
import random

---

In [2]:
# Load Wine Products DF:
df = pd.read_csv('../data/df.csv')

## Creating User Preferences DataFrame

Ideally this data would come from Wine.com internally. However, since this data is not publically avalable, a fictitious user preferences dataframe has been created showing user-driven activity and associated ratings per product.

In [3]:
# Function to create df of user interactions
def create_user_interaction_df(df):
    
    # Create a list of unique user_ids (215 unique users)
    unique_users = [str(random.randint(100000, 999999)) for _ in range(215)]

    # Create a list of product_ids from the 'product_id' column in the wine DataFrame df
    product_ids = df['product_id'].tolist()

    # Create a list of event types with specified probabilities (total 850)
    event_types = ["view"] * 400 + ["like"] * 200 + ["rate"] * 100 + ["comment"] * 50 + ["follow"] * 50 + ["bookmark"] * 50
    random.shuffle(event_types)

    # Map event types to event strengths
    event_strength_mapping = {
        'view': 1.0,
        'like': 1.5,
        'rate': 2.0,
        'comment': 3.0,
        'follow': 4.0,
        'bookmark': 4.5
    }
    event_strengths = [event_strength_mapping[event_type] for event_type in event_types]

    # Create the DataFrame
    data = {
        "user_id": random.choices(unique_users, k=850),
        "event_type": event_types,
        "product_id": random.choices(product_ids, k=850),
        "event_strength": event_strengths
    }

    df_user_interactions = pd.DataFrame(data)
    return df_user_interactions

Explain: users can interact with Wine.com in many different ways. Explain types, explain subjective rating of 1.0 to 4.5 for strength. 

In [4]:
# Create user interactions DataFrame
df_user_interactions = create_user_interaction_df(df)

In [5]:
# Explore the DF
df_user_interactions.head()

Unnamed: 0,user_id,event_type,product_id,event_strength
0,509102,view,1298793,1.0
1,344116,comment,1340075,3.0
2,554077,rate,1498938,2.0
3,375045,like,385384,1.5
4,661905,like,1133016,1.5


In [6]:
df_user_interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 850 entries, 0 to 849
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   user_id         850 non-null    object 
 1   event_type      850 non-null    object 
 2   product_id      850 non-null    int64  
 3   event_strength  850 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 26.7+ KB


In [7]:
print(f'Of the {df_user_interactions.shape[0]} user interactions, there are {df_user_interactions["user_id"].nunique()} unique users and {df_user_interactions["product_id"].nunique()} unique wine products that were rated.')

Of the 850 user interactions, there are 210 unique users and 256 unique wine products that were rated.


In [8]:
# Merge DataFrames with outer join
df_all_interactions = pd.merge(df, df_user_interactions, on='product_id', how='outer')

In [9]:
print(f'The original shape of the dataframe df_all_interactions is {df_all_interactions.shape}.')

# Drop nulls for user_id
df_all_interactions.dropna(subset=['user_id'], inplace=True)

print(f'The shape of the dataframe df_all_interactions after dropping user_id nulls is {df_all_interactions.shape}.')

The original shape of the dataframe df_all_interactions is (1046, 13).
The shape of the dataframe df_all_interactions after dropping user_id nulls is (1025, 13).


In [10]:
# Redefine event strength mapping outside of previous function
event_strength_mapping = {
    'view': 1.0,
    'like': 1.5,
    'rate': 2.0,
    'comment': 3.0,
    'follow': 4.0,
    'bookmark': 4.5
}

# Map event types to event strengths
df_all_interactions['event_strength'] = df_all_interactions['event_type'].map(event_strength_mapping)

# Calculate the weighted sum per user
df_all_interactions['user_weighted_sum'] = df_all_interactions.groupby('user_id')['event_strength'].transform('sum')

In [11]:
df_all_interactions.head()

Unnamed: 0,product_id,wine_type,wine_name,wine_origin,rating_avg,rating_num,price_current,price_prediscount,discount_nom,savings_percent,user_id,event_type,event_strength,user_weighted_sum
0,1405250,Malbec,Chateau Du Caillau Cahors 2021,"Cahors, Southwest, France",5.0,19,15.0,39,$23.01,59.0,372752,like,1.5,14.0
1,1405250,Malbec,Chateau Du Caillau Cahors 2021,"Cahors, Southwest, France",5.0,19,15.0,39,$23.01,59.0,314317,view,1.0,15.5
2,1405250,Malbec,Chateau Du Caillau Cahors 2021,"Cahors, Southwest, France",5.0,19,15.0,39,$23.01,59.0,798639,view,1.0,13.5
3,1405250,Malbec,Chateau Du Caillau Cahors 2021,"Cahors, Southwest, France",5.0,19,15.0,39,$23.01,59.0,287430,comment,3.0,15.0
4,1405250,Malbec,Chateau Du Caillau Cahors 2021,"Cahors, Southwest, France",5.0,19,15.0,39,$23.01,59.0,439922,like,1.5,13.5


In [14]:
# Saving df to .csv
df_all_interactions.to_csv('../data/df_all_interactions.csv', index=False)