# Creating a custom user-item interaction matrix

In [86]:
import pandas as pd
import numpy as np

In [87]:
df_raw = pd.read_csv("cleaned_top_5k.csv")
df_raw.head()

Unnamed: 0.1,Unnamed: 0,App Name,App Id,Category,Rating,Rating Count,Installs,Minimum Installs,Maximum Installs,Free,...,Size,Minimum Android,Developer Website,Released,Last Updated,Content Rating,Privacy Policy,Ad Supported,In App Purchases,Editors Choice
0,385470,WhatsApp Messenger,com.whatsapp,Communication,4.0,138557570.0,"5,000,000,000+",5000000000.0,6265637751,True,...,19.2,Varies with device,1,"Oct 18, 2010","Jun 09, 2021",Everyone,1,False,False,True
1,304824,Instagram,com.instagram.android,Social,3.8,120206190.0,"1,000,000,000+",1000000000.0,3559871277,True,...,19.2,Varies with device,1,"Apr 3, 2012","Jun 14, 2021",Teen,1,True,True,True
2,2222701,Facebook,com.facebook.katana,Social,2.3,117850066.0,"5,000,000,000+",5000000000.0,6782619635,True,...,19.2,Varies with device,1,"Jun 16, 2021","Jun 16, 2021",Teen,1,True,True,False
3,881403,YouTube,com.google.android.youtube,Video Players & Editors,4.4,112440547.0,"5,000,000,000+",5000000000.0,9766230924,True,...,19.2,Varies with device,1,"Oct 20, 2010","Jun 16, 2021",Teen,1,True,False,False
4,244319,Garena Free Fire - Rampage,com.dts.freefireth,Action,4.2,89177097.0,"500,000,000+",500000000.0,976536041,True,...,19.2,4.1 and up,1,"Dec 7, 2017","Jun 04, 2021",Mature 17+,1,False,True,True


In [88]:
"""
Simulate a users 5 ratings... (try to do this for like 10 users)

Format should be the following:
user_id, app_id, rating...
"""
import random
import math

def get_random_number(min_score, max_score, probability_power=1):
    """
    Get a random number between min_score and max_score with a specific bias towards
    upper boundary numbers or lower boundary numbers depending on the probability_power.
    
    If the probability_power > 1, there will be a higher probability of getting a number closer to lower bound
    If the probability_power < 1, there will be a higher probability of getting a number closer to upper bound
    """
    result = math.floor(min_score + (max_score + 1 - min_score) * (math.pow(random.random(), probability_power)))
    
    return result

# create empty data frame
df = pd.DataFrame()

# create data for 10k users
for i in range(1,10000):
    # the user can have 3-20 ratings
    possible_number_of_ratings = np.arange(3,20)
    
    ratings = []
    app_ids = []
    user_ids = []
    
    chosen_number_of_ratings = random.choice(possible_number_of_ratings)

    while len(app_ids) != chosen_number_of_ratings:
        
        """
        Here the probability_power is 10. This will make it more likely for the 
        random number to be closer to 0 than the upper bound. 
        
        The reason is that we want more users to give ratings to more popular
        apps.
        """
        num = get_random_number(0, 4999, 10)
        if num not in app_ids:
            app_ids.append(num)
            
    for j in range(0, chosen_number_of_ratings):
        """
        The probability_power here is 0.66 because we want users to mostly give out good ratings
        because that is the usual behavior in the real life. 
        
        Of course there is still chance that they give the lowest possible rating
        """
        ratings.append(get_random_number(1,5, 0.66))
        
    for x in range(0, chosen_number_of_ratings):
        user_ids.append(i)
        
    data = pd.DataFrame({
            "user_id" : user_ids, 
            "app_id" : app_ids,
            "rating" : ratings
        })
        
    df = pd.concat([df, data])
    
df.head(10)


Unnamed: 0,user_id,app_id,rating
0,1,1,5
1,1,0,4
2,1,3880,2
0,2,34,5
1,2,1,1
2,2,0,5
0,3,0,4
1,3,451,2
2,3,144,3
3,3,4931,4


In [81]:
df.to_csv("user-item-100k-cleaned.csv")