In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import drive

# Mount Google Drive so that files under /content/drive are readable from this runtime.
drive.mount('/content/drive')

# Read the reviews CSV into a DataFrame; cells containing "N/A" are loaded as missing values.
csv_path = "/content/drive/MyDrive/app_store_data.csv"
df = pd.read_csv(
    csv_path,
    index_col=None,
    na_values=["N/A"],
)

Mounted at /content/drive


# Filter operations

In [None]:
# Display the DataFrame loaded from the CSV (a bare last expression is rendered as a table).
df

Select all rows where "rating" is 3 or greater *and* the review is from May 1, 2025 or later

In [None]:
# Build each condition separately, then require both with element-wise AND (&).
# review_date is not parsed at load time, so it is compared as text against the cutoff string.
rated_3_or_more = df["rating"] >= 3
on_or_after_cutoff = df["review_date"] >= "2025-05-01T00:00:00-00:00"

df_filtered = df[rated_3_or_more & on_or_after_cutoff]
df_filtered

Select all rows where "rating" is 3 or lower *or* the review is from June 28, 2024 or earlier

In [None]:
# Keep a row when EITHER condition holds (| is element-wise OR).
#
# review_date is not parsed at load time, so it is compared as ISO-8601 text.
# A midnight cutoff such as "2024-06-28T00:00:00..." would drop every review written
# *during* June 28, so compare against the start of the next day instead: any
# timestamp beginning with "2024-06-28" sorts before "2024-06-29".
rated_3_or_less = df["rating"] <= 3
on_or_before_jun28 = df["review_date"] < "2024-06-29"

df_filtered = df[rated_3_or_less | on_or_before_jun28]
df_filtered

Use `isin(...)` to pick only rows where a column's value lies in a given list.

The example below filters on the "rating" column and keeps only the rows whose rating is 4 or 5.

In [None]:
# True for every row whose rating is one of the listed values.
rating_is_4_or_5 = df["rating"].isin([4, 5])
df[rating_is_4_or_5]

Another way of doing the above filtering would be to create an array of ratings (4 and 5) as a variable and use that instead:

In [None]:
# Same filter as above, with the accepted ratings held in a reusable NumPy array.
goodReviews = np.array([4, 5])
rating_is_good = df["rating"].isin(goodReviews)
df[rating_is_good]

Another way of looking for specific review ratings would be to look for values lying between two endpoints (inclusive by default):

In [None]:
# between() includes both endpoints by default, so this keeps ratings 2 and 3.
dfMid = df[df["rating"].between(2, 3)]
display(dfMid)  # only the last expression of a cell is shown automatically

# review_date is not parsed at load time, so the bounds are compared as ISO-8601 text.
# An upper bound of "2025-05-31" sorts *before* any "2025-05-31T..." timestamp and
# would silently drop every review from May 31. Use a half-open range ending at
# June 1 instead: inclusive="left" keeps the lower bound and excludes the upper one.
dfMay2025 = df[df["review_date"].between("2025-05-01", "2025-06-01", inclusive="left")]
dfMay2025

If you want to filter by “does the text column contain X,” use `.str` accessors:

In [None]:
# Case-insensitive search for "app" in the review text; na=False makes rows with
# missing text count as "no match" so the mask stays strictly boolean.
mentions_app = df["review_text"].str.contains("app", case=False, na=False)
dfBad = df[mentions_app]
dfBad

Titles that start with “Great”:

In [None]:
# na=False treats a missing title as "does not match". Without it, a missing title
# produces NaN in the mask and boolean indexing raises
# "ValueError: Cannot mask with non-boolean array containing NA / NaN values".
title_starts_with_great = df["title"].str.startswith("Great", na=False)
dfGreatTitle = df[title_starts_with_great]
dfGreatTitle

`query(...)`
An alternative syntax that lets you write the filter as a string expression:

Inside `query()`, column names become bare identifiers, and you can combine conditions with the keywords `and`/`or` (the `&`/`|` operators also work).

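You can also pass Python variables using the `@` symbol (shown in the second example below). The first example writes the values directly into the expression: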

In [None]:
# The whole filter is one string: bare names refer to columns, `and` joins the conditions.
filter_expr = "rating >= 3 and review_date >= '2025-05-01T00:00:00Z'"
dfQuery = df.query(filter_expr)
dfQuery

You can also reference Python variables inside the expression by prefixing them with `@`:

In [None]:
# Thresholds live in ordinary Python variables...
min_rating, cutoff = 3, "2025-05-01T00:00:00Z"

# ...and are referenced from inside the query string with the @ prefix.
dfQuery2 = df.query(
    "rating >= @min_rating and review_date >= @cutoff"
)
dfQuery2

`loc` with boolean mask

For row selection this is equivalent to `df[mask]`, but it is more explicit and lets you choose the columns in the same step:

In [None]:
# Row mask: rating of at least 3 AND review_date (compared as text) on/after the cutoff.
mask = df["rating"].ge(3) & df["review_date"].ge("2025-05-01")

# .loc takes the row mask first and the column labels second.
columns_to_show = ["author_name", "rating", "review_date"]
df_loc = df.loc[mask, columns_to_show]
df_loc

`nlargest(...)` and `nsmallest(...)`

For fetching the top‐k or bottom‐k rows by a particular column:

In [None]:
# 5 reviews with the highest rating
top5 = df.nlargest(5, "rating")
display(top5)  # only the last expression of a cell is shown automatically

# 3 earliest reviews.
# DataFrame.nsmallest() rejects object (text) columns with a TypeError, and
# review_date is not parsed at load time. Parse it into UTC timestamps first, take
# the 3 smallest, then use their index labels to pull the matching rows from df.
review_ts = pd.to_datetime(df["review_date"], utc=True)
oldest = df.loc[review_ts.nsmallest(3).index]
oldest

# Sorting operations

`sort_values(...)`
Sort by one (or multiple) columns, ascending or descending:

In [53]:
# sort_values() returns a new, sorted DataFrame. Storing it under its own name
# (instead of inplace=True) leaves df in its loaded row order, so re-running any
# earlier cell keeps producing the same output.
df_by_rating = df.sort_values("rating", ascending=False)
df_by_rating

Unnamed: 0,author_name,rating,review_text,review_date,title,app_version
499,Tggr2000,5,Couldn't be easier!!,2014-07-24T12:48:47-07:00,Awesome App!!!,4.1.1.1
0,ratherBfying69!,5,Always proficient and trustworthy,2025-05-26T16:52:09-07:00,Best bank,2025.2.2.3
498,Hberryman,5,Easy to use and love the mobile deposit feature!,2014-07-25T12:30:24-07:00,Great app,4.1.1.1
482,Big Nic 22,5,I'm glad I can now deposit checks.,2014-10-08T12:42:50-07:00,Convenient,4.3.0.0
481,Hizmunky,5,User friendly and works well. I've never had ...,2014-10-17T16:47:17-07:00,Great app!!!,4.3.0.0
...,...,...,...,...,...,...
24,Fn Craig,1,Bring back the widget. Was way less time consu...,2024-10-16T03:04:17-07:00,Miss the widget,2024.07.1 (2024.7.0.1)
48,Silenceimpaired,1,Credit card data is just a poorly done mobile ...,2023-11-04T12:51:47-07:00,Poorly put together often down,2023.3.0.3
28,JoeNyongesa,1,"Well, I don’t know why I have banners to creat...",2024-08-20T11:09:28-07:00,Ads,2024.04.3 (2024.4.0.3)
471,Kcvenom,1,You are annoying,2014-11-27T10:29:52-07:00,Annoying,4.3.0.0


# Grouping operations

# Basic Statistics

Mean rating

Review count by date