# Review drift: is quality changing

In [1]:
from importlib.util import find_spec

if find_spec('evidently') is None:
    !pip install git+https://github.com/evidentlyai/evidently.git

Collecting git+https://github.com/evidentlyai/evidently.git
  Cloning https://github.com/evidentlyai/evidently.git to /private/var/folders/qq/gdppvj912kv3ds7_xnf0q_fc0000gn/T/pip-req-build-3jcy3m55
  Running command git clone --filter=blob:none --quiet https://github.com/evidentlyai/evidently.git /private/var/folders/qq/gdppvj912kv3ds7_xnf0q_fc0000gn/T/pip-req-build-3jcy3m55
  Resolved https://github.com/evidentlyai/evidently.git to commit 5e0267aaa09aaf4a807e6601a1cdf0711bd6945e
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting plotly>=5.10.0
  Downloading plotly-5.24.1-py3-none-any.whl (19.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.1/19.1 MB[0m [31m93.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting litestar>=2.8.3
  Downloading litestar-2.14.0-py3-none-any.whl (567 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m567.6/567.6 kB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m
Collecting uvicorn[standard]>=0.22.0
 

In [2]:
import pandas as pd
import numpy as np
import requests
import zipfile
import io

from datetime import datetime, time
from sklearn import datasets, ensemble

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset, TargetDriftPreset, RegressionPreset



## Import data

### Reviews data

In [9]:
import os

DATA_DIR = '../data/'

# List contents of the data directory
if os.path.exists(DATA_DIR):
    files = os.listdir(DATA_DIR)
    print("Files in data directory:")
    for file in files:
        print(f" - {file}")
else:
    print(f"Directory {DATA_DIR} does not exist")


# Load oakland reviews
df_r = pd.read_csv(os.path.join(DATA_DIR, 'airbnb_oakland_reviews.csv'))
print(f'shape: {df_r.shape}')
display(df_r.head())

Files in data directory:
 - airbnb_oakland_reviews.csv
shape: (126603, 6)


Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,3083,16797662,2014-08-02,18272366,Carlos Kenji,"B- love guest house is a pretty good place, ev..."
1,3083,19980893,2014-09-21,4400249,Jason,"The description was accurate, B was very nice ..."
2,3083,21720882,2014-10-22,4268642,Carmen,B-Love makes me feel at home since she open th...
3,3083,27107155,2015-02-25,23674350,Matthew,Simple but effective. Very close to the city a...
4,3083,33318077,2015-05-26,1252519,Pavel,I enjoyed staying in Tracy's guesthouse! Her a...


In [23]:
# Pick a listing_id by number of reviews and date range
df_r['date'] = pd.to_datetime(df_r['date'])

# Get counts of reviews per listing
counts = df_r.groupby('listing_id').size().sort_values(ascending=False).rename('count').reset_index()

# Calculate date range, but flatten the multi-level columns
days = (df_r.groupby('listing_id')
        .agg({'date': lambda x: (x.max() - x.min()).days})
        .rename(columns={'date': 'days_between'})
        .reset_index())

# Now merge will work correctly
merged_df = pd.merge(counts, days, on='listing_id')

# Display results
print("Review counts per listing:")
print(counts.head())
print("\nDays between first and last review:")
print(days.head())
print("\nMerged results:")
display(merged_df.head())


# Have a view of which listing maybe a good candidate
display(merged_df.sort_values(['count', 'days_between'], ascending=[False, True], inplace=True))


Review counts per listing:
   listing_id  count
0    21632573    877
1    41325003    798
2     1615052    777
3    24498215    732
4     1673795    726

Days between first and last review:
   listing_id  days_between
0        3083          3681
1        5739          5526
2       23637          4546
3       24916          5137
4       29521          4775

Merged results:


Unnamed: 0,listing_id,count,days_between
0,21632573,877,2483
1,41325003,798,1714
2,1615052,777,3991
3,24498215,732,2331
4,1673795,726,3960


None

### Ratings Data