In [1]:
import pandas as pd
import math
import scipy.stats as st
from sklearn.preprocessing import MinMaxScaler

# Notebook-wide pandas display settings: never truncate columns or rows,
# allow wide output lines, and render every float with five decimals.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.width", 500)
pd.set_option("display.float_format", lambda value: "%.5f" % value)

In [2]:
# Load the course review data; the path is relative to the notebook's working directory.
df = pd.read_csv('../input/course-review/course_reviews.csv')

***
Column descriptions
***

In [3]:
df.head()

Unnamed: 0,Rating,Timestamp,Enrolled,Progress,Questions Asked,Questions Answered
0,5.0,2021-02-05 07:45:55,2021-01-25 15:12:08,5.0,0.0,0.0
1,5.0,2021-02-04 21:05:32,2021-02-04 20:43:40,1.0,0.0,0.0
2,4.5,2021-02-04 20:34:03,2019-07-04 23:23:27,1.0,0.0,0.0
3,5.0,2021-02-04 16:56:28,2021-02-04 14:41:29,10.0,0.0,0.0
4,4.0,2021-02-04 15:00:24,2020-10-13 03:10:07,10.0,0.0,0.0


In [4]:
df.shape

(4323, 6)

In [5]:
# Distribution of ratings: number of rows for each distinct Rating value.
df.Rating.value_counts()

5.00000    3267
4.50000     475
4.00000     383
3.50000      96
3.00000      62
1.00000      15
2.00000      12
2.50000      11
1.50000       2
Name: Rating, dtype: int64

In [6]:
# Number of rows for each distinct 'Questions Asked' value.
df['Questions Asked'].value_counts()

0.00000     3867
1.00000      276
2.00000       80
3.00000       43
4.00000       15
5.00000       13
6.00000        9
8.00000        5
9.00000        3
14.00000       2
11.00000       2
7.00000        2
10.00000       2
15.00000       2
22.00000       1
12.00000       1
Name: Questions Asked, dtype: int64

In [7]:
# Mean of 'Questions Asked' within each Rating group.
df.groupby(['Rating']).agg({'Questions Asked' : 'mean'})

Unnamed: 0_level_0,Questions Asked
Rating,Unnamed: 1_level_1
1.0,1.46667
1.5,0.0
2.0,0.08333
2.5,0.45455
3.0,0.1129
3.5,0.19792
4.0,0.15144
4.5,0.19158
5.0,0.2314


# Average

In [8]:
# Baseline: plain unweighted mean of every rating.
df['Rating'].mean()

4.764284061993986

# Time-Based Weighted Average

In [9]:
# Parse the Timestamp column into datetimes (overwriting it in place) so date arithmetic works.
df['Timestamp'] = pd.to_datetime(df.Timestamp)

In [10]:
# Fixed reference date; the age of each review is measured relative to it.
analysis_date = pd.to_datetime('2021-02-10')

In [11]:
# Age of each review in whole days. `.dt.days` keeps only the day component of
# the timedelta, so partial days are dropped (e.g. 4 days 16 hours -> 4).
df['days'] = (analysis_date - df.Timestamp).dt.days

In [12]:
df.head()

Unnamed: 0,Rating,Timestamp,Enrolled,Progress,Questions Asked,Questions Answered,days
0,5.0,2021-02-05 07:45:55,2021-01-25 15:12:08,5.0,0.0,0.0,4
1,5.0,2021-02-04 21:05:32,2021-02-04 20:43:40,1.0,0.0,0.0,5
2,4.5,2021-02-04 20:34:03,2019-07-04 23:23:27,1.0,0.0,0.0,5
3,5.0,2021-02-04 16:56:28,2021-02-04 14:41:29,10.0,0.0,0.0,5
4,4.0,2021-02-04 15:00:24,2020-10-13 03:10:07,10.0,0.0,0.0,5


In [13]:
# Reviews committed in the last 30 days (first rows only)
df.loc[df['days'] <= 30].head()

Unnamed: 0,Rating,Timestamp,Enrolled,Progress,Questions Asked,Questions Answered,days
0,5.0,2021-02-05 07:45:55,2021-01-25 15:12:08,5.0,0.0,0.0,4
1,5.0,2021-02-04 21:05:32,2021-02-04 20:43:40,1.0,0.0,0.0,5
2,4.5,2021-02-04 20:34:03,2019-07-04 23:23:27,1.0,0.0,0.0,5
3,5.0,2021-02-04 16:56:28,2021-02-04 14:41:29,10.0,0.0,0.0,5
4,4.0,2021-02-04 15:00:24,2020-10-13 03:10:07,10.0,0.0,0.0,5


In [14]:
def time_based_weighted_average(dataframe, w1=28, w2=26, w3=24, w4=22):
    """Blend the mean rating of four review-age buckets into a single score.

    Rows are bucketed by the ``days`` column: ``<= 30``, ``(30, 90]``,
    ``(90, 180]`` and ``> 180``. The mean ``Rating`` of each bucket is
    multiplied by its weight (a percentage) and the products are summed.

    A bucket that contains no ratings has no mean. Rather than letting the
    resulting NaN wipe out the whole score, such a bucket is skipped and its
    weight is redistributed proportionally over the populated buckets. When
    every bucket is populated the result is the plain weighted sum.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``days`` and ``Rating``.
    w1, w2, w3, w4 : number
        Percentage weights for the buckets, from most recent to oldest.

    Returns
    -------
    float
        The weighted score, or NaN when no bucket contains a rating.
    """
    days = dataframe['days']
    buckets = (
        (days <= 30, w1),
        ((days > 30) & (days <= 90), w2),
        ((days > 90) & (days <= 180), w3),
        (days > 180, w4),
    )

    score = 0.0
    used_weight = 0
    populated = 0
    for mask, weight in buckets:
        ratings = dataframe.loc[mask, 'Rating']
        # count() ignores NaN, so this also skips buckets holding only missing ratings.
        if ratings.count() == 0:
            continue
        score += ratings.mean() * weight / 100
        used_weight += weight
        populated += 1

    if populated == len(buckets):
        return score
    if populated == 0 or used_weight == 0:
        # Nothing to average, or no weight left to redistribute onto.
        return float('nan')
    # Rescale so the populated buckets carry the full weight budget.
    return score * (w1 + w2 + w3 + w4) / used_weight

In [15]:
# Override the default weights: 30% for reviews up to 30 days old, 26% for 31-90 days,
# 22% for 91-180 days and 22% for anything older.
time_based_weighted_average(df,30,26,22,22)

4.765491074653962

# User-Based Weighted Average

In [16]:
def user_based_weighted_average(dataframe, w1=22, w2=24, w3=26, w4=28):
    """Blend the mean rating of four course-progress buckets into a single score.

    Rows are bucketed by the ``Progress`` column: ``<= 10``, ``(10, 45]``,
    ``(45, 75]`` and ``> 75``. The mean ``Rating`` of each bucket is
    multiplied by its weight (a percentage) and the products are summed.

    A bucket that contains no ratings has no mean. Rather than letting the
    resulting NaN wipe out the whole score, such a bucket is skipped and its
    weight is redistributed proportionally over the populated buckets. When
    every bucket is populated the result is the plain weighted sum.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``Progress`` and ``Rating``.
    w1, w2, w3, w4 : number
        Percentage weights for the buckets, from lowest to highest progress.

    Returns
    -------
    float
        The weighted score, or NaN when no bucket contains a rating.
    """
    progress = dataframe['Progress']
    buckets = (
        (progress <= 10, w1),
        ((progress > 10) & (progress <= 45), w2),
        ((progress > 45) & (progress <= 75), w3),
        (progress > 75, w4),
    )

    score = 0.0
    used_weight = 0
    populated = 0
    for mask, weight in buckets:
        ratings = dataframe.loc[mask, 'Rating']
        # count() ignores NaN, so this also skips buckets holding only missing ratings.
        if ratings.count() == 0:
            continue
        score += ratings.mean() * weight / 100
        used_weight += weight
        populated += 1

    if populated == len(buckets):
        return score
    if populated == 0 or used_weight == 0:
        # Nothing to average, or no weight left to redistribute onto.
        return float('nan')
    # Rescale so the populated buckets carry the full weight budget.
    return score * (w1 + w2 + w3 + w4) / used_weight

In [17]:
user_based_weighted_average(df)

4.800257704672543

# Weighted Rating

In [18]:
def course_weighted_rating(dataframe, time_w = 50, user_w = 50):
    """Combine the time-based and user-based weighted averages into one rating.

    Both component scores are computed with their default bucket weights;
    ``time_w`` and ``user_w`` are the percentage shares given to each score.
    """
    time_score = time_based_weighted_average(dataframe)
    user_score = user_based_weighted_average(dataframe)
    return time_score * time_w/100 + user_score * user_w/100

In [19]:
course_weighted_rating(df)

4.782641693469868

In [20]:
# Same blend, but with 40% on the time-based score and 60% on the user-based score.
course_weighted_rating(df,time_w=40,user_w=60)

4.786164895710403