In [17]:
# imports + IST time
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime, time

try:
    from zoneinfo import ZoneInfo
    IST = ZoneInfo("Asia/Kolkata")
except:
    IST = None

def is_allowed_time(start_hour, end_hour):
    """Return True if current time (IST) is between start_hour and end_hour (inclusive)."""
    now = datetime.now(IST).time() if IST else datetime.now().time()
    return time(start_hour,0) <= now <= time(end_hour,0)


In [18]:
#  load Play Store dataset
fp = "Downloads/Play Store Data.csv"
ps = pd.read_csv(fp)
print("Loaded:", ps.shape)
ps.head(3)


Loaded: (10841, 13)


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up


In [19]:
ps.index=range(1,len(ps)+1)

In [20]:
ps.head(3)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
1,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
2,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
3,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up


In [21]:
#  parsing helpers for installs and size
import re

def parse_installs(x):
    if pd.isna(x): 
        return np.nan
    s = str(x).replace('+','').replace(',','').strip()
    return pd.to_numeric(s, errors='coerce')

def parse_size_mb(x):
    if pd.isna(x):
        return np.nan
    s = str(x).strip()
    if "Varies" in s or s == "":
        return np.nan
    s_low = s.lower()
    # '14M', '512k' style
    if s_low.endswith('m'):
        try:
            return float(s_low[:-1])
        except:
            return np.nan
    if s_low.endswith('k'):
        try:
            return float(s_low[:-1]) / 1000.0
        except:
            return np.nan
    # fallback: try numeric
    try:
        v = float(re.sub(r'[^\d\.]', '', s))
        # heuristic: if value > 1000, maybe bytes -> convert to MB
        if v > 1000:
            return v / (1024*1024)
        return v
    except:
        return np.nan


In [22]:
#  standardize column names and parse numeric fields
colmap = {
    "App":"app",
    "Category":"category",
    "Installs":"installs",
    "Reviews":"reviews",
    "Rating":"rating",
    "Size":"size",
    "Last Updated":"last_updated"
}
for old,new in colmap.items():
    if old in ps.columns:
        ps.rename(columns={old:new}, inplace=True)

# parse numeric fields
ps['installs_n'] = ps['installs'].apply(parse_installs) if 'installs' in ps.columns else np.nan
ps['reviews_n']  = pd.to_numeric(ps['reviews'], errors='coerce') if 'reviews' in ps.columns else np.nan
ps['rating_n']   = pd.to_numeric(ps['rating'], errors='coerce') if 'rating' in ps.columns else np.nan
ps['size_mb']    = ps['size'].apply(parse_size_mb) if 'size' in ps.columns else np.nan
ps['last_dt']    = pd.to_datetime(ps['last_updated'], errors='coerce') if 'last_updated' in ps.columns else pd.NaT

# keep string types consistent
ps['app'] = ps['app'].astype('string') if 'app' in ps.columns else ps.get('App', pd.Series([], dtype='string'))
ps['category'] = ps['category'].astype('string') if 'category' in ps.columns else ps.get('Category', pd.Series([], dtype='string'))

print("Parsed columns ready:", ['installs_n' in ps.columns, 'size_mb' in ps.columns, 'rating_n' in ps.columns, 'last_dt' in ps.columns])
ps.head(2)


Parsed columns ready: [True, True, True, True]


Unnamed: 0,app,category,rating,reviews,size,installs,Type,Price,Content Rating,Genres,last_updated,Current Ver,Android Ver,installs_n,reviews_n,rating_n,size_mb,last_dt
1,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,10000.0,159.0,4.1,19.0,2018-01-07
2,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,500000.0,967.0,3.9,14.0,2018-01-15


In [23]:
#  apply filters exactly as instructed
df = ps.copy()

# average rating at least 4.2
df = df[df['rating_n'] >= 4.2]

#  app names that do NOT contain any numbers
df = df[~df['app'].str.contains(r'\d', na=False)]

# categories that start with 'T' or 'P' (case-insensitive)
df = df[df['category'].str.strip().str[0].str.upper().isin(['T','P'])]

#  reviews > 1000
df = df[df['reviews_n'] > 1000]

#  size between 20 MB and 80 MB (inclusive)
df = df[(df['size_mb'] >= 20.0) & (df['size_mb'] <= 80.0)]

# ensure we have last_dt for time series
df = df.dropna(subset=['last_dt', 'installs_n'])

print("After filters rows:", df.shape[0])
df[['app','category','rating_n','reviews_n','size_mb','installs_n','last_dt']].head(5)


After filters rows: 139


Unnamed: 0,app,category,rating_n,reviews_n,size_mb,installs_n,last_dt
2803,"Shutterfly: Free Prints, Photo Books, Cards, G...",PHOTOGRAPHY,4.6,98716.0,59.0,5000000.0,2018-08-01
2804,FreePrints – Free Photos Delivered,PHOTOGRAPHY,4.8,109500.0,37.0,1000000.0,2018-08-02
2812,"Face Filter, Selfie Editor - Sweet Camera",PHOTOGRAPHY,4.7,142634.0,22.0,10000000.0,2018-07-06
2823,Makeup Editor -Beauty Photo Editor & Selfie Ca...,PHOTOGRAPHY,4.5,3378.0,30.0,1000000.0,2018-07-25
2824,Makeup Photo Editor: Makeup Camera & Makeup Ed...,PHOTOGRAPHY,4.4,10525.0,25.0,1000000.0,2018-07-27


In [24]:
#  create month period and aggregate installs per month per category
df['month'] = df['last_dt'].dt.to_period('M').dt.to_timestamp()  # month start timestamp

agg = (df.groupby(['month','category'], as_index=False)
         .agg(total_installs=('installs_n','sum'))
         .sort_values(['category','month']))

# pivot to get categories as columns for stacked area plotting convenience (but we'll use long form for px.area)
agg.head(8)


Unnamed: 0,month,category,total_installs
16,2018-03-01,PARENTING,100000.0
19,2018-05-01,PARENTING,10000000.0
26,2018-07-01,PARENTING,600000.0
2,2016-12-01,PERSONALIZATION,2000000.0
7,2017-09-01,PERSONALIZATION,1000000.0
11,2018-01-01,PERSONALIZATION,10000000.0
14,2018-02-01,PERSONALIZATION,10000000.0
22,2018-06-01,PERSONALIZATION,10000000.0


In [25]:
#compute MoM % change per category and list months where any category > 25% increase
agg = agg.sort_values(['category','month'])
agg['mom_pct'] = agg.groupby('category')['total_installs'].pct_change().fillna(0.0)

# flag months where any category has mom_pct > 0.25
high_growth = agg[agg['mom_pct'] > 0.25].copy()
highlight_months = sorted(high_growth['month'].drop_duplicates().tolist())

print("Months to highlight (any category >25% MoM):", highlight_months)


Months to highlight (any category >25% MoM): [Timestamp('2017-03-01 00:00:00'), Timestamp('2017-07-01 00:00:00'), Timestamp('2017-12-01 00:00:00'), Timestamp('2018-01-01 00:00:00'), Timestamp('2018-04-01 00:00:00'), Timestamp('2018-05-01 00:00:00'), Timestamp('2018-06-01 00:00:00'), Timestamp('2018-07-01 00:00:00'), Timestamp('2018-08-01 00:00:00')]


In [26]:

#  translations for legend display
translate_map = {
    "Travel & Local": "Voyage & Local",      # French (literal: 'Voyage & Local')
    "Productivity": "Productividad",          # Spanish
    "Photography": "写真"                       # Japanese Kanji for photography
}

# create display column: use translated name if present, else title-case original
agg['category_disp'] = agg['category'].apply(lambda c: translate_map.get(c, str(c).title()))
# also create mapping dict to ensure consistent coloring order
categories_disp_order = agg['category_disp'].unique().tolist()
categories_disp_order


['Parenting',
 'Personalization',
 'Photography',
 'Productivity',
 'Tools',
 'Travel_And_Local']

In [27]:
#  render stacked area if in allowed time (4 PM - 6 PM IST), else print notice
if agg.empty:
    print(" No data left after filters — nothing to plot.")
else:
    if is_allowed_time(16, 18):
        # use px.area on long-form aggregated dataframe
        fig = px.area(
            agg,
            x='month',
            y='total_installs',
            color='category_disp',
            category_orders={'category_disp': categories_disp_order},
            labels={'month':'Month', 'total_installs':'Total Installs', 'category_disp':'Category'},
            title='Cumulative Installs Over Time by Category (stacked area)'
        )

        # For each highlighted month, overlay a vertical rectangle with stronger color (darker overlay)
        # This visually increases color intensity for that month across the chart.
        for m in highlight_months:
            # x0 = month start, x1 = next month start
            x0 = pd.to_datetime(m)
            x1 = (x0 + pd.offsets.MonthBegin(1))
            # add a semi-transparent overlay rectangle
            fig.add_vrect(x0=x0, x1=x1,
                          fillcolor="rgba(0,0,0,0.08)", layer="below", line_width=0)

        fig.update_layout(
            xaxis=dict(tickformat="%Y-%m"),
            legend_title_text='Category (translated where applicable)',
            margin=dict(t=80, b=120)
        )
        fig.show()
    else:
        print(" This visualization is visible only between 4:00 PM and 6:00 PM IST. (Currently hidden.)")


 This visualization is visible only between 4:00 PM and 6:00 PM IST. (Currently hidden.)


In [30]:
# optional save (run only when chart displayed)
try:
    fig.write_html("Task6_StackedArea.html")
    print("Saved Task6_StackedArea.html")
except NameError:
    print("Chart not created (time window outside or agg empty).")


Saved Task6_StackedArea.html
