# **An application for calculating the product rating over the ratings given to a product**

# Story of the dataset
### (50+ Hours) Python A-Z™: Data Science and Machine Learning
### Rating: 4.8 (4.764925)
### Total Rating: 4611
### Score Percentages: 75, 20, 4, 1, <1
### Approximate Numerical Equivalents: 3458, 922, 184, 46, 6

# Importing libraries

In [1]:
import pandas as pd
import math
import datetime as dt
import scipy.stats as st
from sklearn.preprocessing import MinMaxScaler
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 500)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# Reading the dataset

In [2]:
# Load the course reviews from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/course-reviews/course_reviews.csv')
# Lower-case every column name so later cells can refer to them uniformly
# (e.g. 'rating', 'timestamp', 'questions asked').
df.columns = [col.lower() for col in df.columns]
# Preview the first rows.
df.head()

Unnamed: 0,rating,timestamp,enrolled,progress,questions asked,questions answered
0,5.0,2021-02-05 07:45:55,2021-01-25 15:12:08,5.0,0.0,0.0
1,5.0,2021-02-04 21:05:32,2021-02-04 20:43:40,1.0,0.0,0.0
2,4.5,2021-02-04 20:34:03,2019-07-04 23:23:27,1.0,0.0,0.0
3,5.0,2021-02-04 16:56:28,2021-02-04 14:41:29,10.0,0.0,0.0
4,4.0,2021-02-04 15:00:24,2020-10-13 03:10:07,10.0,0.0,0.0


# Average

### rating distribution

In [3]:
df['rating'].value_counts()

5.00    3267
4.50     475
4.00     383
3.50      96
3.00      62
1.00      15
2.00      12
2.50      11
1.50       2
Name: rating, dtype: int64

### questions asked distribution

In [4]:
df['questions asked'].value_counts()

0.00     3867
1.00      276
2.00       80
3.00       43
4.00       15
5.00       13
6.00        9
8.00        5
9.00        3
14.00       2
11.00       2
7.00        2
10.00       2
15.00       2
22.00       1
12.00       1
Name: questions asked, dtype: int64

### get the rating mean in the questions asked breakdown

In [5]:
# For each distinct value of 'questions asked': the number of reviews in that
# group ('count') and the mean rating those reviews gave.
df.groupby('questions asked').agg({
    'questions asked': 'count',
    'rating': 'mean'})

Unnamed: 0_level_0,questions asked,rating
questions asked,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,3867,4.77
1.0,276,4.74
2.0,80,4.81
3.0,43,4.74
4.0,15,4.83
5.0,13,4.65
6.0,9,5.0
7.0,2,4.75
8.0,5,4.9
9.0,3,5.0


### get the rating mean

In [6]:
df['rating'].mean()

4.764284061993986

# Time-based weighted average

### changing the type of the variable 'timestamp' to datetime

In [7]:
# Parse the 'timestamp' strings into pandas datetimes so date arithmetic is
# possible below. Only 'timestamp' is converted; 'enrolled' is not touched.
df['timestamp'] = pd.to_datetime(df['timestamp'])
# Show the column dtypes to confirm the conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4323 entries, 0 to 4322
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   rating              4323 non-null   float64       
 1   timestamp           4323 non-null   datetime64[ns]
 2   enrolled            4323 non-null   object        
 3   progress            4323 non-null   float64       
 4   questions asked     4323 non-null   float64       
 5   questions answered  4323 non-null   float64       
dtypes: datetime64[ns](1), float64(4), object(1)
memory usage: 202.8+ KB


### computing how many days old each comment is (relative to a fixed reference date)

In [8]:
# Fixed reference date for the analysis (hard-coded rather than "today").
current_day = pd.to_datetime('2021-02-10 0:0:0')
# Age of each review: the whole-day component of (reference date - review timestamp).
df['days'] = (current_day - df['timestamp']).dt.days
# Show the column dtypes, now including the new 'days' column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4323 entries, 0 to 4322
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   rating              4323 non-null   float64       
 1   timestamp           4323 non-null   datetime64[ns]
 2   enrolled            4323 non-null   object        
 3   progress            4323 non-null   float64       
 4   questions asked     4323 non-null   float64       
 5   questions answered  4323 non-null   float64       
 6   days                4323 non-null   int64         
dtypes: datetime64[ns](1), float64(4), int64(1), object(1)
memory usage: 236.5+ KB


### get the comments in the last 30 days

In [9]:
df[df['days'] <= 30].head()

Unnamed: 0,rating,timestamp,enrolled,progress,questions asked,questions answered,days
0,5.0,2021-02-05 07:45:55,2021-01-25 15:12:08,5.0,0.0,0.0,4
1,5.0,2021-02-04 21:05:32,2021-02-04 20:43:40,1.0,0.0,0.0,5
2,4.5,2021-02-04 20:34:03,2019-07-04 23:23:27,1.0,0.0,0.0,5
3,5.0,2021-02-04 16:56:28,2021-02-04 14:41:29,10.0,0.0,0.0,5
4,4.0,2021-02-04 15:00:24,2020-10-13 03:10:07,10.0,0.0,0.0,5


### get the rating counts in the last 30 days

In [10]:
df.loc[df['days'] <= 30, 'rating'].count()

194

### get the average of the ratings in the last 30 days (1. range)

In [11]:
df.loc[df['days'] <= 30, 'rating'].mean()

4.775773195876289

### get the average of the ratings between 30 and 90 days (2. range)

In [12]:
df.loc[(df['days'] > 30) & (df['days'] <= 90), 'rating'].mean()

4.763833992094861

### get the average of the ratings between 90 and 180 days (3. range)

In [13]:
df.loc[(df['days'] > 90) & (df['days'] <= 180), 'rating'].mean()

4.752503576537912

### get the average of the ratings older than 180 days (4. range)

In [14]:
df.loc[df['days'] > 180, 'rating'].mean()

4.76641586867305

### Let's consider the following weights
### The percentages must sum to 100.
* 1.range = 28%
* 2.range = 26%
* 3.range = 24%
* 4.range = 22%

In [15]:
# Time-based weighted average: the mean rating of each age bucket
# (days <= 30, 30 < days <= 90, 90 < days <= 180, days > 180) multiplied by
# its weight (0.28, 0.26, 0.24, 0.22) and summed. The newest bucket carries
# the largest weight.
df.loc[df['days'] <= 30, 'rating'].mean() * 0.28 + \
    df.loc[(df['days'] > 30) & (df['days'] <= 90), 'rating'].mean() * 0.26 + \
        df.loc[(df['days'] > 90) & (df['days'] <= 180), 'rating'].mean() * 0.24 + \
            df.loc[df['days'] > 180, 'rating'].mean() * 0.22

4.765025682267194

### functionalization of the above process

In [16]:
def time_based_weighted_average(dataframe, w1=0.28, w2=0.26, w3=0.24, w4=0.22):
    """Return the rating average weighted by how old each review is.

    The rows are split into four buckets by the 'days' column and the mean
    'rating' of each bucket is multiplied by its weight and summed:

    * ``w1``: days <= 30
    * ``w2``: 30 < days <= 90
    * ``w3``: 90 < days <= 180
    * ``w4``: days > 180

    The weights are used as given (they are not normalised), so they should
    sum to 1. If a bucket contains no rows its mean is NaN and the result is
    NaN as well.
    """
    age = dataframe['days']
    scores = dataframe['rating']

    # Mean rating of each age bucket, newest first.
    newest = scores[age <= 30].mean()
    recent = scores[(age > 30) & (age <= 90)].mean()
    older = scores[(age > 90) & (age <= 180)].mean()
    oldest = scores[age > 180].mean()

    return newest * w1 + recent * w2 + older * w3 + oldest * w4

time_based_weighted_average(df)

4.765025682267194

### operating the function again by changing the weights

In [17]:
time_based_weighted_average(df, w1=0.30, w3=0.22)

4.765491074653962

# User-based weighted average

### weighting each rating according to the user's progress in the course

In [18]:
df.groupby('progress').agg({'rating': 'mean'}).sort_values('progress', ascending=False)

Unnamed: 0_level_0,rating
progress,Unnamed: 1_level_1
100.0,4.87
98.0,5.0
97.0,5.0
95.0,4.79
94.0,5.0
93.0,4.83
91.0,5.0
90.0,4.92
89.0,4.79
87.0,5.0


### Let's consider the following user weights
* for the progress > 75% = 28%
* 45% < the progress <= 75% = 26%
* 10% < the progress <= 45% = 24%
* for the progress <= 10% = 22%

In [19]:
# User-based weighted average: the mean rating of each progress bucket
# (progress <= 10, 10 < progress <= 45, 45 < progress <= 75, progress > 75)
# multiplied by its weight (0.22, 0.24, 0.26, 0.28) and summed. The bucket
# with the highest progress carries the largest weight.
df.loc[(df['progress'] <= 10), 'rating'].mean() * 0.22 + \
    df.loc[(df['progress'] > 10) & (df['progress'] <= 45), 'rating'].mean() * 0.24 + \
        df.loc[(df['progress'] > 45) & (df['progress'] <= 75), 'rating'].mean() * 0.26 + \
            df.loc[(df['progress'] > 75), 'rating'].mean() * 0.28

4.800257704672543

### functionalization of the above process

In [20]:
def user_based_weighted_average(dataframe, w1=0.22, w2=0.24, w3=0.26, w4=0.28):
    """Return the rating average weighted by course progress.

    The rows are split into four buckets by the 'progress' column and the
    mean 'rating' of each bucket is multiplied by its weight and summed:

    * ``w1``: progress <= 10
    * ``w2``: 10 < progress <= 45
    * ``w3``: 45 < progress <= 75
    * ``w4``: progress > 75

    The weights are used as given (they are not normalised), so they should
    sum to 1. If a bucket contains no rows its mean is NaN and the result is
    NaN as well.
    """
    progress = dataframe['progress']
    rating = dataframe['rating']

    # (row selector, weight) per bucket, lowest progress first.
    buckets = [
        (progress <= 10, w1),
        ((progress > 10) & (progress <= 45), w2),
        ((progress > 45) & (progress <= 75), w3),
        (progress > 75, w4),
    ]
    contributions = [rating[selector].mean() * weight for selector, weight in buckets]

    # Accumulate left to right so the floating-point result matches a plain
    # a + b + c + d expression.
    total = contributions[0]
    for part in contributions[1:]:
        total = total + part
    return total

user_based_weighted_average(df)

4.800257704672543

### operating the function again by changing the weights

In [21]:
user_based_weighted_average(df, w1=0.20, w4=0.30)

4.803286469062915

# Weighted rating


### combining the functions time_based_weighted_average and user_based_weighted_average

In [22]:
def weighted_rating(dataframe, time_w=0.50, user_w=0.50):
    """Blend the time-based and user-based weighted averages into one rating.

    Both sub-averages are computed on ``dataframe`` with their default bucket
    weights. The time-based result is scaled by ``time_w`` and the user-based
    result by ``user_w``, and the two are added. ``time_w`` and ``user_w`` are
    used as given (not normalised), so they should sum to 1.
    """
    time_component = time_based_weighted_average(dataframe) * time_w
    user_component = user_based_weighted_average(dataframe) * user_w
    return time_component + user_component

weighted_rating(df)

4.782641693469868

### operating the function again by changing the weights

In [23]:
weighted_rating(df, 0.4, 0.6)

4.786164895710403

In [24]:
weighted_rating(df, 0.6, 0.4)

4.779118491229334

### Consequently, compared with the original product rating (4.764925), the rating increased to 4.76549, 4.80329, and 4.77912 when using the time-based weighted average, the user-based weighted average, and the combined weighted rating, respectively. In addition, since the distribution of the ratings is available, the Bayesian average rating function could also be applied here.

# **Thanks for checking my notebook!**