In [27]:
# Imports & IST timezone
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime, time
try:
    from zoneinfo import ZoneInfo
    IST = ZoneInfo("Asia/Kolkata")
except:
    IST = None


In [28]:
#  Load Play Store dataset 
fp = "Downloads/Play Store Data.csv"
ps = pd.read_csv(fp)
print("Loaded rows, cols:", ps.shape)
ps.head(3)

Loaded rows, cols: (10841, 13)


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up


In [29]:
ps.index=range(1,len(ps)+1)

In [30]:
ps.head(3)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
1,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
2,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
3,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up


In [31]:
#  Basic parsing & standardize columns (warning-free) 
def parse_installs(x):
    if pd.isna(x): return np.nan
    return pd.to_numeric(str(x).replace('+','').replace(',','').strip(), errors='coerce')

def parse_size(x):
    if pd.isna(x): return np.nan
    s = str(x).strip()
    if 'Varies' in s or s == '': return np.nan
    if s.lower().endswith('m'):
        return pd.to_numeric(s[:-1], errors='coerce') * 1_000_000
    if s.lower().endswith('k'):
        return pd.to_numeric(s[:-1], errors='coerce') * 1_000
    return pd.to_numeric(s, errors='coerce')

colmap = {
    'App':'app',
    'Category':'category',
    'Installs':'installs',
    'Reviews':'reviews',
    'Size':'size',
    'Last Updated':'last_updated'
}
for o,n in colmap.items():
    if o in ps.columns:
        ps.rename(columns={o:n}, inplace=True)

# parse fields (use pd.to_datetime without deprecated arg)
ps['installs_n'] = ps['installs'].apply(parse_installs) if 'installs' in ps.columns else np.nan
ps['reviews_n']  = pd.to_numeric(ps['reviews'], errors='coerce') if 'reviews' in ps.columns else np.nan
ps['size_b']     = ps['size'].apply(parse_size) if 'size' in ps.columns else np.nan
ps['last_dt']    = pd.to_datetime(ps.get('last_updated'), errors='coerce') if 'last_updated' in ps.columns else pd.NaT

# ensure app & category strings safe
ps['app'] = ps['app'].astype('string') if 'app' in ps.columns else ps.get('App', pd.Series([], dtype='string'))
ps['category'] = ps['category'].astype('string') if 'category' in ps.columns else ps.get('Category', pd.Series([], dtype='string'))

print("Parsed columns ready:", ['installs_n' in ps.columns, 'reviews_n' in ps.columns, 'size_b' in ps.columns, 'last_dt' in ps.columns])
ps.head(3)


Parsed columns ready: [True, True, True, True]


Unnamed: 0,app,category,Rating,reviews,size,installs,Type,Price,Content Rating,Genres,last_updated,Current Ver,Android Ver,installs_n,reviews_n,size_b,last_dt
1,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,10000.0,159.0,19000000.0,2018-01-07
2,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,500000.0,967.0,14000000.0,2018-01-15
3,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,5000000.0,87510.0,8700000.0,2018-08-01


In [32]:
ps.head()

Unnamed: 0,app,category,Rating,reviews,size,installs,Type,Price,Content Rating,Genres,last_updated,Current Ver,Android Ver,installs_n,reviews_n,size_b,last_dt
1,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,10000.0,159.0,19000000.0,2018-01-07
2,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,500000.0,967.0,14000000.0,2018-01-15
3,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,5000000.0,87510.0,8700000.0,2018-08-01
4,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up,50000000.0,215644.0,25000000.0,2018-06-08
5,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up,100000.0,967.0,2800000.0,2018-06-20


In [33]:
#  Apply exact filters  instructions
df = ps.copy()

#  exclude app names that start with x,y,z (case-insensitive)
df = df[~df['app'].str.lower().str.startswith(('x','y','z'))]

# app name should NOT contain letter "S" (case-insensitive) -> exclude if contains 's' or 'S'
df = df[~df['app'].str.contains('s', case=False, na=False)]

#  category should start with "E" or "C" or "B" (case-insensitive)
df = df[df['category'].str.strip().str[0].str.upper().isin(['E','C','B'])]

# reviews > 500
df = df[df['reviews_n'] > 500]

#  last_dt must be valid (we need time series); drop NaT
df = df.dropna(subset=['last_dt'])

#  size condition not specified here? (task earlier had >15M for task3) For this task no size condition mentioned so we skip it.

#  Prepare month period for aggregation
df['month'] = df['last_dt'].dt.to_period('M').dt.to_timestamp()

print("After filters rows:", df.shape[0])
df[['app','category','installs_n','reviews_n','month']].head(3)


After filters rows: 245


Unnamed: 0,app,category,installs_n,reviews_n,month
108,Ulta Beauty,BEAUTY,1000000.0,42050.0,2018-06-01
137,Rainbow Camera,BEAUTY,1000000.0,3871.0,2018-07-01
141,E-Book Read - Read Book for free,BOOKS_AND_REFERENCE,50000.0,1857.0,2018-08-01


In [34]:
# Aggregate installs per month per category 
grouped = (df.groupby(['month','category'], as_index=False)
             .agg(total_installs=('installs_n','sum'))
             .sort_values(['category','month']))

# Only keep categories with any data (already filtered), but we will keep all categories starting with E/C/B present
# apply translations for display: Beauty->Hindi, Business->Tamil, Dating->German
# Create a mapping (only if these categories exist)
translation_map = {
    'Beauty': 'सौंदर्य',        # Hindi
    'Business': 'வணிகம்',       # Tamil (Vanigam / vAnikam simplified as 'வணிகம்')
    'Dating': 'Verabredung'      # German translation for Dating
}

# map display name
grouped['category_disp'] = grouped['category'].map(lambda x: translation_map.get(x, x))

grouped.head(6)


Unnamed: 0,month,category,total_installs,category_disp
46,2018-03-01,BEAUTY,5000.0,BEAUTY
58,2018-06-01,BEAUTY,1000000.0,BEAUTY
64,2018-07-01,BEAUTY,1000000.0,BEAUTY
72,2018-08-01,BEAUTY,100000.0,BEAUTY
5,2014-10-01,BOOKS_AND_REFERENCE,500000.0,BOOKS_AND_REFERENCE
6,2014-11-01,BOOKS_AND_REFERENCE,5000000.0,BOOKS_AND_REFERENCE


In [41]:
#Compute MoM % change per category and mark months with >20% increase 
grouped = grouped.sort_values(['category','month'])
grouped['mom_pct'] = grouped.groupby('category')['total_installs'].pct_change()  # fractional change (NaN for first month)
grouped['mom_pct'] = grouped['mom_pct'].fillna(0.0)

# Flag months where mom increase > 20% (0.2)
grouped['high_growth'] = grouped['mom_pct'] > 0.20

# For shading we will create intervals: for each (category, month) where high_growth True we shade that month span
grouped.head(8)


Unnamed: 0,month,category,total_installs,category_disp,mom_pct,high_growth
46,2018-03-01,BEAUTY,5000.0,BEAUTY,0.0,False
58,2018-06-01,BEAUTY,1000000.0,BEAUTY,199.0,True
64,2018-07-01,BEAUTY,1000000.0,BEAUTY,0.0,False
72,2018-08-01,BEAUTY,100000.0,BEAUTY,-0.9,False
5,2014-10-01,BOOKS_AND_REFERENCE,500000.0,BOOKS_AND_REFERENCE,0.0,False
6,2014-11-01,BOOKS_AND_REFERENCE,5000000.0,BOOKS_AND_REFERENCE,9.0,True
10,2015-07-01,BOOKS_AND_REFERENCE,10000000.0,BOOKS_AND_REFERENCE,1.0,True
18,2016-06-01,BOOKS_AND_REFERENCE,60000.0,BOOKS_AND_REFERENCE,-0.994,False


In [44]:
#  Time gate (6 PM – 9 PM IST) and plotting 
def allowed_time():
    now_t = datetime.now(IST).time() if IST else datetime.now().time()
    return time(18,0) <= now_t <= time(21,0)

if grouped.empty:
    print("No data remains after applying filters — please check the filters.")
else:
    if allowed_time():
        # Plot lines per category (use category_disp for labels)
        fig = go.Figure()
        cats = grouped['category'].unique().tolist()
        colors = px.colors.qualitative.Dark24

        # add one line trace per category
        for i, cat in enumerate(cats):
            sub = grouped[grouped['category']==cat]
            x = sub['month']
            y = sub['total_installs']
            disp = sub['category_disp'].iloc[0] if not sub['category_disp'].isna().all() else cat
            fig.add_trace(go.Scatter(
                x=x, y=y, mode='lines+markers', name=disp,
                line=dict(width=2), marker=dict(size=6),
                hovertemplate='Category=%s<br>Month=%{x|%Y-%m}<br>Total Installs=%{y:.0f}<extra></extra>' % disp
            ))

            # For shading: find months where high_growth True for this category
            high = sub[sub['high_growth']]
            for _, row in high.iterrows():
                start = row['month']
                # end = start + 1 month (use month + 1 day trick)
                end = (pd.to_datetime(start) + pd.offsets.MonthBegin(1))
                fig.add_vrect(x0=start, x1=end, fillcolor="rgba(255, 215, 0, 0.2)", # light gold
                              layer="below", line_width=0, annotation_text=">20% MoM", annotation_position="top left")

        fig.update_layout(
            title="Total Installs over Time by Category (filtered)",
            xaxis_title="Month",
            yaxis_title="Total Installs",
            legend_title="Category (translated where applicable)",
            xaxis=dict(tickformat="%Y-%m"),
            margin=dict(t=80, b=120)
        )
        fig.show()
    else:
        print("The graph will only be visible on the dashboard between 6:00 PM and 9:00 PM IST. (Right now, it is disabled.)")


The graph will only be visible on the dashboard between 6:00 PM and 9:00 PM IST. (Right now, it is disabled.)


In [46]:
#  Optional save HTML (run when plot visible) 
try:
    fig.write_html("Task4_TimeSeries.html")
    print("Saved Task4_TimeSeries.html")
except NameError:
    print("Note: Plot not created (time window outside or grouped empty), so no HTML saved.")


Saved Task4_TimeSeries.html
