In [3]:
#imports , Timezone
import pandas as pd
import numpy as np
import plotly.express as px
from datetime import datetime, time

# Resolve India Standard Time once; allowed_time() reads this module-level value.
try:
    from zoneinfo import ZoneInfo
    IST = ZoneInfo("Asia/Kolkata")
except Exception:
    # zoneinfo does not exist before Python 3.9, and ZoneInfo() raises when the
    # system has no tz database (e.g. Windows without the tzdata package).
    # IST is a constant UTC+05:30 with no daylight saving, so a fixed offset is
    # an exact substitute -- unlike IST = None, which made the time gate
    # silently follow the machine's local clock.
    from datetime import timedelta, timezone
    IST = timezone(timedelta(hours=5, minutes=30), "IST")

def allowed_time(start, end, now=None):
    """Return True when the time of day lies inside the window [start:00, end:00].

    Parameters
    ----------
    start, end : int
        Whole hours (0-23) marking the inclusive window bounds. When
        ``start > end`` the window is treated as wrapping past midnight
        (e.g. ``allowed_time(22, 2)`` covers 22:00 through 02:00).
    now : datetime.time, optional
        Time of day to test. When omitted, the current wall-clock time is
        read in the module-level ``IST`` zone, or from the local clock if
        ``IST`` is None.

    Returns
    -------
    bool
    """
    if now is None:
        now = datetime.now(IST).time() if IST else datetime.now().time()

    opens, closes = time(start, 0), time(end, 0)
    if opens <= closes:
        return opens <= now <= closes
    # Window crosses midnight: it is open late in the day OR early the next day.
    return now >= opens or now <= closes


In [4]:
# Load the two raw CSV exports (paths are relative to the notebook's working directory)
PLAY_STORE_CSV = "Downloads/Play Store Data.csv"
USER_REVIEWS_CSV = "Downloads/User Reviews.csv"

ps = pd.read_csv(PLAY_STORE_CSV)
ur = pd.read_csv(USER_REVIEWS_CSV)

# Quick sanity check on the row/column counts of each frame
for label, frame in (("Playstore:", ps), ("User Reviews:", ur)):
    print(label, frame.shape)


Playstore: (10841, 13)
User Reviews: (64295, 5)


In [5]:
# parsing Functions
def parse_installs(x):
    """Convert an install-count label such as "10,000+" to a number.

    Every "+" and "," is removed before numeric conversion. Missing input,
    and text that is still not numeric after that cleanup, yields NaN.
    """
    if pd.isna(x):
        return np.nan

    # Drop the thousands separators and the trailing "+" in a single pass.
    cleaned = str(x).translate(str.maketrans("", "", "+,"))
    return pd.to_numeric(cleaned, errors="coerce")

def parse_size_mb(x):
    """Convert a size label to megabytes, returning NaN when it cannot be parsed.

    Rules, applied to the stripped, lower-cased text:
      * missing input or any text containing "varies" -> NaN
      * "<number>m"  -> the number as-is
      * "<number>k"  -> the number / 1000
      * bare number  -> values above 1000 are divided by 1024*1024
                        (treated as a byte count), anything else is returned as-is
      * any text whose numeric part is not a valid float -> NaN

    NOTE(review): the "k" branch divides by 1000 while the byte branch divides
    by 1024*1024; the two conventions are kept exactly as they were -- confirm
    which one is intended.
    """
    if pd.isna(x):
        return np.nan

    s = str(x).strip().lower()
    if "varies" in s:
        return np.nan

    # Split the text into its numeric part and the divisor implied by the suffix.
    # divisor=None marks the bare-number case, which uses the byte heuristic below.
    if s.endswith("m"):
        number, divisor = s[:-1], 1
    elif s.endswith("k"):
        number, divisor = s[:-1], 1000
    else:
        number, divisor = s, None

    try:
        v = float(number)
    except ValueError:
        # Malformed values (e.g. "1,000+", "abcM", a lone "M") are treated as
        # missing on every branch instead of raising from the suffix branches.
        return np.nan

    if divisor is None:
        return v / (1024*1024) if v > 1000 else v
    return v / divisor


In [6]:
# Standardize column names, then derive numeric helper columns
# Original header -> lower-case working name
colmap = {
    header: header.lower()
    for header in ("App", "Category", "Rating", "Reviews", "Installs", "Size")
}

# rename() leaves untouched any mapping key that is not an existing column
ps = ps.rename(columns=colmap)

# Numeric versions of the raw text columns; unparseable entries become NaN
ps = ps.assign(
    rating_n=pd.to_numeric(ps["rating"], errors="coerce"),
    reviews_n=pd.to_numeric(ps["reviews"], errors="coerce"),
    installs_n=ps["installs"].apply(parse_installs),
    size_mb=ps["size"].apply(parse_size_mb),
)

# Text columns used later for .str operations and as the merge key
ps = ps.astype({"app": "string", "category": "string"})


In [7]:
# Average review subjectivity per app, left-joined onto the Play Store frame
subj_col = "Sentiment_Subjectivity"

# Non-numeric subjectivity entries become NaN and are skipped by mean()
ur[subj_col] = pd.to_numeric(ur[subj_col], errors="coerce")

subj_df = (
    ur.groupby("App", as_index=False)
      .agg(avg_subjectivity=(subj_col, "mean"))
      .rename(columns={"App": "app_reviews"})
)

# how="left" keeps every Play Store row, including apps that have no reviews
merged = pd.merge(
    ps,
    subj_df,
    how="left",
    left_on="app",
    right_on="app_reviews",
)
print("Merged shape:", merged.shape)


Merged shape: (10841, 19)


In [8]:
# Apply all filters
# Allowed categories, compared against the lower-cased, stripped category label.
# Both "event" and "events" are accepted: the match is exact, so "event" alone
# would silently drop a plural label such as "EVENTS".
# TODO confirm the spelling against the Category values in the dataset.
allowed_categories = [
    "game","beauty","business","comics","communication",
    "dating","entertainment","social","event","events"
]

# Work on an independent copy so adding a column never writes into `merged`
# and never assigns to a filtered slice (which raises SettingWithCopyWarning).
df = merged.copy()
df["category_norm"] = df["category"].str.lower().str.strip()

keep = (
    (df["rating_n"] > 3.5)                                    # rating > 3.5
    & df["category_norm"].isin(allowed_categories)            # allowed categories only
    & (df["reviews_n"] > 500)                                 # reviews > 500
    & (df["installs_n"] > 50000)                              # installs > 50,000
    & ~df["app"].str.contains("s", case=False, na=False)      # app name must NOT contain 's'/'S'
    & (df["avg_subjectivity"] > 0.5)                          # subjectivity > 0.5
)

# Drop rows still missing a plotted value; the trailing .copy() hands later
# cells a standalone frame they can add columns to without warnings.
df = df[keep].dropna(subset=["size_mb", "rating_n", "installs_n"]).copy()

print("Filtered rows:", df.shape[0])
df.head()


Filtered rows: 43


Unnamed: 0,app,category,rating,reviews,size,installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,rating_n,reviews_n,installs_n,size_mb,app_reviews,avg_subjectivity,category_norm
198,Google Primer,BUSINESS,4.4,62272,18M,"10,000,000+",Free,0,Everyone,Business,"June 26, 2018",3.550.2,4.1 and up,4.4,62272.0,10000000.0,18.0,Google Primer,0.675,business
206,Call Blocker,BUSINESS,4.6,188841,3.2M,"5,000,000+",Free,0,Everyone,Business,"June 21, 2018",1.1.13,4.0 and up,4.6,188841.0,5000000.0,3.2,Call Blocker,0.655431,business
436,Call Blocker,COMMUNICATION,4.1,17529,10M,"1,000,000+",Free,0,Everyone,Communication,"July 26, 2018",5.86,4.0.3 and up,4.1,17529.0,1000000.0,10.0,Call Blocker,0.655431,communication
440,"CallApp: Caller ID, Blocker & Phone Call Recorder",COMMUNICATION,4.4,483565,20M,"10,000,000+",Free,0,Everyone,Communication,"July 29, 2018",1.286,4.1 and up,4.4,483565.0,10000000.0,20.0,"CallApp: Caller ID, Blocker & Phone Call Recorder",0.506481,communication
450,Caller ID +,COMMUNICATION,4.0,9498,118k,"1,000,000+",Free,0,Everyone,Communication,"June 7, 2016",5.28.0,2.3 and up,4.0,9498.0,1000000.0,0.118,Caller ID +,0.6,communication


In [9]:
# Category translations and color map
def translate(cat):
    """Return the display label for a category name.

    The name is compared case-insensitively with surrounding whitespace
    ignored. Three categories have fixed translations; every other name is
    returned title-cased (from the input as given, without stripping).
    """
    translations = {
        "beauty": "सौंदर्य",        # Hindi
        "business": "வணிகம்",       # Tamil
        "dating": "Verabredung",    # German
    }
    key = cat.lower().strip()
    if key in translations:
        return translations[key]
    return cat.title()

# assign() returns a new frame, so this never writes into a filtered slice
# (no SettingWithCopyWarning) and re-running the cell gives the same result.
df = df.assign(category_disp=df["category"].apply(translate))

# Make Game category PINK.
# Only the categories that need a fixed colour are listed: plotly picks colours
# for labels missing from color_discrete_map on its own, whereas a None value
# is not a valid colour entry for that mapping.
color_map = {
    label: "pink"
    for label in df["category_disp"].unique()
    if "game" in label.lower()
}


In [12]:
# Bubble chart, rendered only between 5 PM and 7 PM IST
# Reset first: `fig` is always bound after this cell runs, and a figure left
# over from an earlier in-window run cannot leak into later cells when the
# window is closed.
fig = None

if allowed_time(17, 19):
    fig = px.scatter(
        df,
        x="size_mb",
        y="rating_n",
        size="installs_n",          # bubble area encodes install count
        color="category_disp",
        hover_name="app",
        size_max=60,
        color_discrete_map=color_map,
        labels={
            "size_mb":"Size (MB)",
            "rating_n":"Average Rating",
            "installs_n":"Installs"
        }
    )

    fig.update_layout(
        title="Bubble Chart: App Size vs Rating (Bubble = Installs)",
        xaxis_title="Size (MB)",
        yaxis_title="Average Rating",
        legend_title="Category (Translated)",
        margin=dict(t=80, b=120)
    )

    fig.show()
else:
    print("This chart will only be visible between 5 PM and 7 PM IST.")



This chart will only be visible between 5 PM and 7 PM IST.


In [13]:
# Optional save
# Look the figure up explicitly instead of relying on a bare `except` to hide a
# NameError: `fig` is absent or None when the chart cell ran outside the window.
chart = globals().get("fig")

if chart is None:
    print("Graph not shown due to time window.")
else:
    try:
        chart.write_html("Task5_Bubble.html")
        print("Saved HTML: Task5_Bubble.html")
    except OSError as err:
        # A real write failure (permissions, missing directory, full disk) is
        # reported as such rather than being mislabelled as a time-window skip.
        print(f"Could not save Task5_Bubble.html: {err}")


Graph not shown due to time window.
