In [80]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import webbrowser
import os
import pytz
import datetime as dt

In [81]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\itzzb\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [82]:
# Step 1: Load the Dataset
apps_df = pd.read_csv('Play Store Data.csv')
reviews_df = pd.read_csv('User Reviews.csv')

In [83]:
apps_df


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53M,"5,000+",Free,0,Everyone,Education,"July 25, 2017",1.48,4.1 and up
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6M,100+,Free,0,Everyone,Education,"July 6, 2018",1.0,4.1 and up
10838,Parkinson Exercices FR,MEDICAL,,3,9.5M,"1,000+",Free,0,Everyone,Medical,"January 20, 2017",1.0,2.2 and up
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,Varies with device,"1,000+",Free,0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device


In [84]:
# Step 2: Data Cleaning
apps_df = apps_df.dropna(subset=['Rating'])
for column in apps_df.columns:
    apps_df[column].fillna(apps_df[column].mode()[0], inplace=True)
apps_df.drop_duplicates(inplace=True)
apps_df = apps_df[apps_df['Rating'] <= 5]
reviews_df.dropna(subset=['Translated_Review'], inplace=True)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [85]:
# Merge datasets on 'App' and handle non-matching apps
merged_df = pd.merge(apps_df, reviews_df, on='App', how='inner')
merged_df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,A kid's excessive ads. The types ads allowed a...,Negative,-0.250000,1.000000
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,It bad >:(,Negative,-0.725000,0.833333
2,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,like,Neutral,0.000000,0.000000
3,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,I love colors inspyering,Positive,0.500000,0.600000
4,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,I hate,Negative,-0.800000,0.900000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59119,Firefox Focus: The privacy browser,COMMUNICATION,4.4,36981,4.0M,"1,000,000+",Free,0,Everyone,Communication,"July 6, 2018",5.2,5.0 and up,Nice broser slow browsing speed... make 8mbps ...,Positive,0.100000,0.492308
59120,Firefox Focus: The privacy browser,COMMUNICATION,4.4,36981,4.0M,"1,000,000+",Free,0,Everyone,Communication,"July 6, 2018",5.2,5.0 and up,The thing I found missing simple bookmark draw...,Positive,0.225000,0.426786
59121,Firefox Focus: The privacy browser,COMMUNICATION,4.4,36981,4.0M,"1,000,000+",Free,0,Everyone,Communication,"July 6, 2018",5.2,5.0 and up,Great Relief unwanted pop ups showing up. What...,Positive,0.650000,0.625000
59122,Firefox Focus: The privacy browser,COMMUNICATION,4.4,36981,4.0M,"1,000,000+",Free,0,Everyone,Communication,"July 6, 2018",5.2,5.0 and up,Hoped found new go-to; LOVE Firefox PC. Aside ...,Positive,0.345455,0.484848


In [86]:
# Step 3: Data Transformation
apps_df['Reviews'] = apps_df['Reviews'].astype(int)
apps_df['Installs'] = apps_df['Installs'].str.replace(',', '').str.replace('+', '').astype(int)
apps_df['Price'] = apps_df['Price'].str.replace('$', '').astype(float)

In [87]:
def convert_size(size):
    if 'M' in size:
        return float(size.replace('M', ''))
    elif 'k' in size:
        return float(size.replace('k', '')) / 1024
    else:
        return np.nan

In [88]:
apps_df['Size'] = apps_df['Size'].apply(convert_size)

In [89]:
# Add log_installs and log_reviews columns
apps_df['Log_Installs'] = np.log1p(apps_df['Installs'])
apps_df['Log_Reviews'] = np.log1p(apps_df['Reviews'])

In [90]:
# Add Rating Group column
def rating_group(rating):
    if rating >= 4:
        return 'Top rated'
    elif rating >= 3:
        return 'Above average'
    elif rating >= 2:
        return 'Average'
    else:
        return 'Below average'

apps_df['Rating_Group'] = apps_df['Rating'].apply(rating_group)

In [91]:
# Add Revenue column
apps_df['Revenue'] = apps_df['Price'] * apps_df['Installs']

In [92]:
# Sentiment Analysis
sia = SentimentIntensityAnalyzer()
reviews_df['Sentiment_Score'] = reviews_df['Translated_Review'].apply(lambda x: sia.polarity_scores(str(x))['compound'])

In [93]:
# Extract year from 'Last Updated' and create 'Year' column
apps_df['Last Updated'] = pd.to_datetime(apps_df['Last Updated'], errors='coerce')
apps_df['Year'] = apps_df['Last Updated'].dt.year

In [94]:
import plotly.express as px

In [95]:
# Define the path for your HTML files
html_files_path = "./"

In [96]:
# Make sure the directory exists
if not os.path.exists(html_files_path):
    os.makedirs(html_files_path)

In [97]:
# Initialize plot_containers
plot_containers = ""

In [98]:
# Save each Plotly figure to an HTML file
def save_plot_as_html(fig, filename, insight):
    global plot_containers
    filepath = os.path.join(html_files_path, filename)
    html_content = pio.to_html(fig, full_html=False, include_plotlyjs='inline')
    # Append the plot and its insight to plot_containers
    plot_containers += f"""
    <div class="plot-container" id="{filename}" onclick="openPlot('{filename}')">
        <div class="plot">{html_content}</div>
        <div class="insights">{insight}</div>
    </div>
    """
    fig.write_html(filepath, full_html=False, include_plotlyjs='inline')

In [99]:
# Define your plots
plot_width = 400
plot_height = 300
plot_bg_color = 'black'
text_color = 'white'
title_font = {'size': 16}
axis_font = {'size': 12}

In [100]:
#Figure 1
# Category Analysis Plot
category_counts = apps_df['Category'].value_counts().nlargest(10)
fig1 = px.bar(
    x=category_counts.index,
    y=category_counts.values,
    labels={'x': 'Category', 'y': 'Count'},
    title='Top Categories on Play Store',
    color=category_counts.index,
    color_discrete_sequence=px.colors.sequential.Plasma,
    width=plot_width,
    height=plot_height
)
fig1.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
fig1.update_traces(marker=dict(line=dict(color=text_color, width=1)))
save_plot_as_html(fig1, "category_analysis.html", "The top categories on the Play Store are dominated by tools, entertainment, and productivity apps. This suggests users are looking for apps that either provide utility or offer leisure activities.")

In [101]:
#Figure 2
# Type Analysis Plot
type_counts = apps_df['Type'].value_counts()
fig2 = px.pie(
    values=type_counts.values,
    names=type_counts.index,
    title='App Type Distribution',
    color_discrete_sequence=px.colors.sequential.RdBu,
    width=plot_width,
    height=plot_height
)
fig2.update_traces(textposition='inside', textinfo='percent+label')
fig2.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    margin=dict(l=10, r=10, t=30, b=10)
)
save_plot_as_html(fig2, "type_analysis.html", "Most apps on the Play Store are free, indicating a strategy to attract users first and monetize through ads or in-app purchases.")

In [102]:
#Figure 3
# Rating Distribution Plot
fig3 = px.histogram(
    apps_df,
    x='Rating',
    nbins=20,
    title='Rating Distribution',
    color_discrete_sequence=['#636EFA'],
    width=plot_width,
    height=plot_height
)
fig3.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
save_plot_as_html(fig3, "rating_distribution.html", "Ratings are skewed towards higher values, suggesting that most apps are rated favorably by users.")

In [103]:
#Figure 4
#Sentiment Analysis Plot
sentiment_counts = reviews_df['Sentiment_Score'].value_counts()
fig4 = px.bar(
    x=sentiment_counts.index,
    y=sentiment_counts.values,
    labels={'x': 'Sentiment Score', 'y': 'Count'},
    title='Sentiment Distribution',
    color=sentiment_counts.index,
    color_discrete_sequence=px.colors.sequential.RdPu,
    width=plot_width,
    height=plot_height
)
fig4.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
fig4.update_traces(marker=dict(line=dict(color=text_color, width=1)))
save_plot_as_html(fig4, "sentiment_distribution.html", "Sentiments in reviews show a mix of positive and negative feedback, with a slight lean towards positive sentiments.")

In [104]:
#Figure 5
# Installs by Category Plot
installs_by_category = apps_df.groupby('Category')['Installs'].sum().nlargest(10)
fig5 = px.bar(
    x=installs_by_category.values,
    y=installs_by_category.index,
    orientation='h',
    labels={'x': 'Installs', 'y': 'Category'},
    title='Installs by Category',
    color=installs_by_category.index,
    color_discrete_sequence=px.colors.sequential.Blues,
    width=plot_width,
    height=plot_height
)
fig5.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
fig5.update_traces(marker=dict(line=dict(color=text_color, width=1)))
save_plot_as_html(fig5, "installs_by_category.html", "The categories with the most installs are social and communication apps, which reflects their broad appeal and daily usage.")

In [105]:
#Figure 6
# Updates Per Year Plot
updates_per_year = apps_df['Last Updated'].dt.year.value_counts().sort_index()
fig6 = px.line(
    x=updates_per_year.index,
    y=updates_per_year.values,
    labels={'x': 'Year', 'y': 'Number of Updates'},
    title='Number of Updates Over the Years',
    color_discrete_sequence=['#AB63FA'],
    width=plot_width,
    height=plot_height
)
fig6.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
save_plot_as_html(fig6, "updates_per_year.html", "Updates have been increasing over the years, showing that developers are actively maintaining and improving their apps.")

In [106]:
#Figure 7
# Revenue by Category Plot
revenue_by_category = apps_df.groupby('Category')['Revenue'].sum().nlargest(10)
fig7 = px.bar(
    x=revenue_by_category.index,
    y=revenue_by_category.values,
    labels={'x': 'Category', 'y': 'Revenue'},
    title='Revenue by Category',
    color=revenue_by_category.index,
    color_discrete_sequence=px.colors.sequential.Greens,
    width=plot_width,
    height=plot_height
)
fig7.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
fig7.update_traces(marker=dict(line=dict(color=text_color, width=1)))
save_plot_as_html(fig7, "revenue_by_category.html", "Categories such as Business and Productivity lead in revenue generation, indicating their monetization potential.")

In [107]:
#Figure 8
# Genre Count Plot
genre_counts = apps_df['Genres'].str.split(';', expand=True).stack().value_counts().nlargest(10)
fig8 = px.bar(
    x=genre_counts.index,
    y=genre_counts.values,
    labels={'x': 'Genre', 'y': 'Count'},
    title='Top Genres',
    color=genre_counts.index,
    color_discrete_sequence=px.colors.sequential.OrRd,
    width=plot_width,
    height=plot_height
)
fig8.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
fig8.update_traces(marker=dict(line=dict(color=text_color, width=1)))
save_plot_as_html(fig8, "genres_counts.html", "Action and Casual genres are the most common, reflecting users' preference for engaging and easy-to-play games.")

In [108]:
#Figure 9
# Impact of Last Update on Rating
fig9 = px.scatter(
    apps_df,
    x='Last Updated',
    y='Rating',
    color='Type',
    title='Impact of Last Update on Rating',
    color_discrete_sequence=px.colors.qualitative.Vivid,
    width=plot_width,
    height=plot_height
)
fig9.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
save_plot_as_html(fig9, "update_on_rating.html", "The scatter plot shows a weak correlation between the last update date and ratings, suggesting that more frequent updates don't always result in better ratings.")

In [109]:
#figure 10
# Ratings for Paid vs Free Apps
fig10 = px.box(
    apps_df,
    x='Type',
    y='Rating',
    color='Type',
    title='Ratings for Paid vs Free Apps',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    width=plot_width,
    height=plot_height
)
fig10.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
save_plot_as_html(fig10, "ratings_paid_free.html", "Paid apps generally have higher ratings compared to free apps, suggesting that users expect higher quality from apps they pay for.")

In [110]:
# Split plot_containers to handle the last plot properly
plot_containers_split = plot_containers.split('</div>')
if len(plot_containers_split) > 1:
    final_plot = plot_containers_split[-2] + '</div>'
else:
    final_plot = plot_containers  # Use plot_containers as default if splitting isn't sufficient

In [111]:
# HTML template for the dashboard
dashboard_html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Google Play Store Review Analytics</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            background-color: #333;
            color: #fff;
            margin: 0;
            padding: 0;
        }}
        .header {{
            display: flex;
            align-items: center;
            justify-content: center;
            padding: 20px;
            background-color: #444;
        }}
        .header img {{
            margin: 0 10px;
            height: 50px;
        }}
        .container {{
            display: flex;
            flex-wrap: wrap;
            justify-content: center;
            padding: 20px;
        }}
        .plot-container {{
            border: 2px solid #555;
            margin: 10px;
            padding: 10px;
            width: {plot_width}px;
            height: {plot_height}px;
            overflow: hidden;
            position: relative;
            cursor: pointer;
        }}
        .insights {{
            display: none;
            position: absolute;
            right: 10px;
            top: 10px;
            background-color: rgba(0,0,0,0.7);
            padding: 5px;
            border-radius: 5px;
            color: #fff;
        }}
        .plot-container:hover .insights {{
            display: block;
        }}
    </style>
    <script>
        function openPlot(filename) {{
            window.open(filename, '_blank');
        }}
    </script>
</head>
<body>
    <div class="header">
        <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/4/4a/Logo_2013_Google.png/800px-Logo_2013_Google.png" alt="Google Logo">
        <h1>Google Play Store Reviews Analytics</h1>
        <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/7/78/Google_Play_Store_badge_EN.svg/1024px-Google_Play_Store_badge_EN.svg.png">
    </div>
    <div class="container">
        {plots}
    </div>
</body>
</html>
"""

In [112]:
# Use these containers to fill in your dashboard HTML
final_html = dashboard_html.format(plots=plot_containers, plot_width=plot_width, plot_height=plot_height)

In [113]:
# Save the final dashboard to an HTML file
dashboard_path = os.path.join(html_files_path, "dashboard.html")
with open(dashboard_path, "w", encoding="utf-8") as f:
    f.write(final_html)

In [114]:
# Automatically open the generated HTML file in a web browser
webbrowser.open('file://' + os.path.realpath(dashboard_path))

True

**Nullclass Internship Tasks (11/03/2025-11/05/2025)**


**Task 1.** Create a scatter plot to visualize the relationship between revenue and the number of installs for paid apps only. Add a trendline to show the correlation and color-code the points based on app categories.

In [115]:
# Step 1: Filtering the apps data for paid apps only
paid_apps = apps_df[apps_df['Type'] == "Paid"]
paid_apps

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs,Log_Reviews,Rating_Group,Revenue,Year
234,TurboScan: scan documents and receipts in PDF,BUSINESS,4.7,11442,6.80000,100000,Paid,4.99,Everyone,Business,2018-03-25,1.5.2,4.0 and up,11.512935,9.345133,Top rated,499000.00,2018
235,Tiny Scanner Pro: PDF Doc Scan,BUSINESS,4.8,10295,39.00000,100000,Paid,4.99,Everyone,Business,2017-04-11,3.4.6,3.0 and up,11.512935,9.239511,Top rated,499000.00,2017
427,Puffin Browser Pro,COMMUNICATION,4.0,18247,,100000,Paid,3.99,Everyone,Communication,2018-07-05,7.5.3.20547,4.1 and up,11.512935,9.811811,Top rated,399000.00,2018
476,"Moco+ - Chat, Meet People",DATING,4.2,1545,,10000,Paid,3.99,Mature 17+,Dating,2018-06-19,2.6.139,4.1 and up,9.210440,7.343426,Top rated,39900.00,2018
477,Calculator,DATING,2.6,57,6.20000,1000,Paid,6.99,Everyone,Dating,2017-10-25,1.1.6,4.0 and up,6.908755,4.060443,Average,6990.00,2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10690,FO Bixby,PERSONALIZATION,5.0,5,0.84082,100,Paid,0.99,Everyone,Personalization,2018-04-25,0.2,7.0 and up,4.615121,1.791759,Top rated,99.00,2018
10697,Mu.F.O.,GAME,5.0,2,16.00000,1,Paid,0.99,Everyone,Arcade,2017-03-03,1.0,2.3 and up,0.693147,1.098612,Top rated,0.99,2017
10760,Fast Tract Diet,HEALTH_AND_FITNESS,4.4,35,2.40000,1000,Paid,7.99,Everyone,Health & Fitness,2018-08-08,1.9.3,4.2 and up,6.908755,3.583519,Top rated,7990.00,2018
10782,Trine 2: Complete Story,GAME,3.8,252,11.00000,10000,Paid,16.99,Teen,Action,2015-02-27,2.22,5.0 and up,9.210440,5.533389,Above average,169900.00,2015


In [116]:
#Step 2: Ensuring revenue and installs are numeric for proper visualization
paid_apps['Revenue'] = pd.to_numeric(paid_apps['Revenue'], errors='coerce')
paid_apps['Installs'] = pd.to_numeric(paid_apps['Installs'], errors='coerce')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [117]:
# Step 3: Adding a trendline (using numpy for computation)
x = paid_apps['Installs']
y = paid_apps['Revenue']
coeff = np.polyfit(x, y, 1)  
tl = coeff[0] * x + coeff[1]

In [159]:
#Figure 11
# Step 4: Creating an interactive scatter plot based on App Categories with Plotly
fig11 = px.scatter(
    paid_apps,
    x='Installs',
    y='Revenue',
    color='Category',
    title="Relationship Between Revenue and Installs for Paid Apps",
    labels={"Installs": "Number of Installs", "Revenue": "Revenue (in $)"},
    hover_data=['App'],
    opacity=0.8,  
    template='plotly_dark' 
)

# Step 5: Adding the trendline to the plot
fig11.add_scatter(
    x=paid_apps['Installs'], 
    y=tl, 
    mode='lines', 
    name='Trendline',
    line=dict(color='yellow', width=2)
)

# Step 6: Customing the layout
fig11.update_layout(
    title="Relationship Between Revenue and Installs for Paid Apps",
    xaxis=dict(title="Number of Installs", gridcolor='gray'),
    yaxis=dict(title="Revenue (in $)", gridcolor='gray'),
    legend_title="App Categories",
    plot_bgcolor='black',
    paper_bgcolor="black",
    font_color="white",
    title_font=dict(size=18, color="white"),
    xaxis_title_font=dict(size=14),
    yaxis_title_font=dict(size=14),
    legend_title_font=dict(size=14),
    legend=dict(font=dict(size=12)) 
)

save_plot_as_html(fig11, "revenue_installs_paid_apps.html", "The scatter plot shows the relationship between revenue and installs for paid apps. The trendline indicates a positive correlation between the two variables.")

import webbrowser
webbrowser.open("revenue_installs_paid_apps.html")

True

**Task 2.** Create a dual-axis chart comparing the average installs and revenue for free vs. paid apps within the top 3 app categories. Apply filters to exclude apps with fewer than 10,000 installs and revenue below $10,000 and android version should be more than 4.0 as well as size should be more than 15M and content rating should be Everyone and app name should not have more than 30 characters including space and special character .this graph should work only between 1 PM IST to 2 PM IST apart from that time we should not show this graph in dashboard itself.

In [119]:
#Installing the needed Library
import plotly.graph_objects as go 
from datetime import datetime, time

In [120]:
#Filter 1: Apps with installs > 10,000
filtered_apps = apps_df[apps_df['Installs'] > 10_000]
filtered_apps

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs,Log_Reviews,Rating_Group,Revenue,Year
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.000000,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up,13.122365,6.875232,Above average,0.0,2018
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.700000,5000000,Free,0.0,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up,15.424949,11.379520,Top rated,0.0,2018
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.000000,50000000,Free,0.0,Teen,Art & Design,2018-06-08,Varies with device,4.2 and up,17.727534,12.281389,Top rated,0.0,2018
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.800000,100000,Free,0.0,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up,11.512935,6.875232,Top rated,0.0,2018
5,Paper flowers instructions,ART_AND_DESIGN,4.4,167,5.600000,50000,Free,0.0,Everyone,Art & Design,2017-03-26,1.0,2.3 and up,10.819798,5.123964,Top rated,0.0,2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10817,HTC Sense Input - FR,TOOLS,4.0,885,8.000000,100000,Free,0.0,Everyone,Tools,2015-10-30,1.0.612928,5.0 and up,11.512935,6.786717,Top rated,0.0,2015
10826,Frim: get new friends on local chat rooms,SOCIAL,4.0,88486,,5000000,Free,0.0,Mature 17+,Social,2018-03-23,Varies with device,Varies with device,15.424949,11.390611,Top rated,0.0,2018
10830,News Minecraft.fr,NEWS_AND_MAGAZINES,3.8,881,2.300000,100000,Free,0.0,Everyone,News & Magazines,2014-01-20,1.5,1.6 and up,11.512935,6.782192,Above average,0.0,2014
10832,FR Tides,WEATHER,3.8,1195,0.568359,100000,Free,0.0,Everyone,Weather,2014-02-16,6.0,2.1 and up,11.512935,7.086738,Above average,0.0,2014


In [121]:
#Filter 2: Revenue > $10,000
filtered_apps = filtered_apps[filtered_apps['Revenue'] > 10_000]
filtered_apps

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs,Log_Reviews,Rating_Group,Revenue,Year
234,TurboScan: scan documents and receipts in PDF,BUSINESS,4.7,11442,6.8,100000,Paid,4.99,Everyone,Business,2018-03-25,1.5.2,4.0 and up,11.512935,9.345133,Top rated,499000.0,2018
235,Tiny Scanner Pro: PDF Doc Scan,BUSINESS,4.8,10295,39.0,100000,Paid,4.99,Everyone,Business,2017-04-11,3.4.6,3.0 and up,11.512935,9.239511,Top rated,499000.0,2017
427,Puffin Browser Pro,COMMUNICATION,4.0,18247,,100000,Paid,3.99,Everyone,Communication,2018-07-05,7.5.3.20547,4.1 and up,11.512935,9.811811,Top rated,399000.0,2018
853,Toca Life: City,EDUCATION,4.7,31085,24.0,500000,Paid,3.99,Everyone,Education;Pretend Play,2018-07-06,1.5-play,4.4 and up,13.122365,10.344513,Top rated,1995000.0,2018
854,Toca Life: Hospital,EDUCATION,4.7,3528,24.0,100000,Paid,3.99,Everyone,Education;Pretend Play,2018-06-12,1.1.1-play,4.4 and up,11.512935,8.168770,Top rated,399000.0,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10122,Organizer,PRODUCTIVITY,4.4,936,5.4,50000,Paid,2.99,Everyone,Productivity,2018-06-17,2.96,4.0 and up,10.819798,6.842683,Top rated,149500.0,2018
10531,Kernel Manager for Franco Kernel ✨,TOOLS,4.8,12700,10.0,100000,Paid,3.49,Everyone,Tools,2018-08-03,3.2.5,5.0 and up,11.512935,9.449436,Top rated,349000.0,2018
10645,Football Manager Mobile 2018,SPORTS,3.9,11460,,100000,Paid,8.99,Everyone,Sports,2018-06-27,Varies with device,4.1 and up,11.512935,9.346705,Above average,899000.0,2018
10679,Solitaire+,GAME,4.6,11235,,100000,Paid,2.99,Everyone,Card,2018-07-30,Varies with device,Varies with device,11.512935,9.326878,Top rated,299000.0,2018


In [122]:
# Converting the 'Android Ver' column to numeric by extracting the numeric part
filtered_apps['Android Ver'] = filtered_apps['Android Ver'].str.extract(r'(\d+(\.\d+)?)')[0]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [123]:
# Converting to numeric type
filtered_apps['Android Ver'] = pd.to_numeric(filtered_apps['Android Ver'], errors='coerce')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [124]:
# Filter 3: Android version > 4.0
filtered_apps = filtered_apps[filtered_apps['Android Ver'] > 4.0]

# Filter 4: Size > 15M
filtered_apps = filtered_apps[filtered_apps['Size'] > 15.0]

# Filter 5: Content Rating == "Everyone"
filtered_apps = filtered_apps[filtered_apps['Content Rating'] == 'Everyone']

# Filter 6: App name length <= 30 characters (including spaces and special characters)
filtered_apps = filtered_apps[filtered_apps['App'].str.len() <=30]
filtered_apps

 

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs,Log_Reviews,Rating_Group,Revenue,Year
853,Toca Life: City,EDUCATION,4.7,31085,24.0,500000,Paid,3.99,Everyone,Education;Pretend Play,2018-07-06,1.5-play,4.4,13.122365,10.344513,Top rated,1995000.0,2018
854,Toca Life: Hospital,EDUCATION,4.7,3528,24.0,100000,Paid,3.99,Everyone,Education;Pretend Play,2018-06-12,1.1.1-play,4.4,11.512935,8.16877,Top rated,399000.0,2018
1831,The Game of Life,GAME,4.4,18621,63.0,100000,Paid,2.99,Everyone,Board,2018-07-04,2.1.2,4.4,11.512935,9.832099,Top rated,299000.0,2018
1833,The Room: Old Sins,GAME,4.9,21119,48.0,100000,Paid,4.99,Everyone,Puzzle,2018-04-18,1.0.1,4.4,11.512935,9.957976,Top rated,499000.0,2018
1839,Monument Valley 2,GAME,4.6,9394,33.0,100000,Paid,4.99,Everyone,Puzzle,2017-11-30,1.2.9,4.4,11.512935,9.147933,Top rated,499000.0,2017
2151,Toca Life: City,FAMILY,4.7,31100,24.0,500000,Paid,3.99,Everyone,Education;Pretend Play,2018-07-06,1.5-play,4.4,13.122365,10.344995,Top rated,1995000.0,2018
2883,Facetune - For Free,PHOTOGRAPHY,4.4,49553,48.0,1000000,Paid,5.99,Everyone,Photography,2018-07-25,1.3.1,4.1,13.815512,10.810818,Top rated,5990000.0,2018
3039,Golfshot Plus: Golf GPS,SPORTS,4.1,3387,25.0,50000,Paid,29.99,Everyone,Sports,2018-07-11,4.18.0,4.1,10.819798,8.127995,Top rated,1499500.0,2018
3405,HD Widgets,PERSONALIZATION,4.3,58617,26.0,1000000,Paid,0.99,Everyone,Personalization,2016-12-07,4.3.2,4.4,13.815512,10.978797,Top rated,990000.0,2016
4260,Cut the Rope GOLD,FAMILY,4.6,61264,43.0,1000000,Paid,0.99,Everyone,Puzzle,2018-06-19,3.7.0,4.1,13.815512,11.022964,Top rated,990000.0,2018


In [125]:
#Selecting Top 3 Categories
top_categories = (
    filtered_apps.groupby('Category')['Installs']
    .sum()
    .nlargest(3)
    .index
)
filtered_apps = filtered_apps[filtered_apps['Category'].isin(top_categories)]



In [126]:
# Calculating average installs and revenue for free and paid apps
comparison_data = filtered_apps.groupby(['Category', 'Type'])[['Installs', 'Revenue']].mean().reset_index()

In [162]:

# Ensuring that the chart is displayed only between 1 PM and 2 PM IST
start_time = time(13, 0)  
end_time = time(14, 0)

# Getting current time in IST
current_time_utc = datetime.utcnow()
current_time_ist = (current_time_utc + pd.Timedelta(hours=5, minutes=30)).time()


datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC).



In [163]:
#Figure 12
if start_time <= current_time_ist <= end_time:
    fig12= go.Figure()

    # Adding bar traces for average revenue
    for c in top_categories:
        fig12.add_trace(
            go.Bar(
                x=comparison_data[comparison_data['Category'] == c]['Type'],
                y=comparison_data[comparison_data['Category'] == c]['Revenue'],
                name=f"{c} Revenue",
                yaxis="y1",  # Mapping to the first y-axis
                text=comparison_data[comparison_data['Category'] == c]['Revenue'],
                textposition='auto'
            )
        )
    
    # Adding line traces for average installs
    for c1 in top_categories:
        fig12.add_trace(
            go.Scatter(
                x=comparison_data[comparison_data['Category'] ==c1]['Type'],
                y=comparison_data[comparison_data['Category'] == c1]['Installs'],
                name=f"{c1} Installs",
                yaxis="y2", # Mapping to the second y-axis
                mode="lines+markers"
            )
        )

    # Layout configuration
    fig12.update_layout(
        title="Average Installs vs Revenue for Free vs Paid Apps (Top 3 Categories)",
        xaxis=dict(title="App Type (Free/Paid)"),
        yaxis=dict(
            title="Average Revenue (in USD)",
            title_font=dict(color="blue"),
            tickfont=dict(color="blue"),
        ),
        yaxis2=dict(
            title="Average Installs (in Millions)",
            title_font=dict(color="green"),
            tickfont=dict(color="green"),
            overlaying="y",
            side="right",
        ),
        
        legend=dict(x=0.1, y=1.1),
        barmode="group"
    )
    # Saving chart as HTML file
    save_plot_as_html(fig12,"dual_axis_chart.html","")
    
# Opening the chart in the browser
    webbrowser.open("dual_axis_chart.html")
else:
    print("Chart is not available outside the time range (1 PM - 2 PM IST).")

Chart is not available outside the time range (1 PM - 2 PM IST).


**Task 3.** Use a grouped bar chart to compare the average rating and total review count for the top 10 app categories by number of installs. Filter out any categories where the average rating is below 4.0 and size below 10 M and last update should be Jan month . this graph should work only between 3PM IST to 5 PM IST apart from that time we should not show this graph in dashboard itself.

In [129]:
print(apps_df['Last Updated'].dtype)

datetime64[ns]


In [130]:
# Adding a extra "Month" column extracting from "Last Updated" column for filtering according to the task
apps_df['Month'] = apps_df['Last Updated'].dt.month

In [131]:
# Applying filters
filtered_apps_T3 = apps_df[
    (apps_df['Rating'] >= 4.0) & (apps_df['Size'] >= 10.0) & (apps_df['Month'] == 1)
]
filtered_apps_T3

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs,Log_Reviews,Rating_Group,Revenue,Year,Month
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,10000,Free,0.0,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up,9.210440,5.075174,Top rated,0.0,2018,1
33,Easy Origami Ideas,ART_AND_DESIGN,4.2,1015,11.0,100000,Free,0.0,Everyone,Art & Design,2018-01-06,1.1.0,4.1 and up,11.512935,6.923629,Top rated,0.0,2018,1
44,Popsicle Sticks and Similar DIY Craft Ideas,ART_AND_DESIGN,4.2,26,12.0,10000,Free,0.0,Everyone,Art & Design,2018-01-03,1.0.0,4.1 and up,9.210440,3.295837,Top rated,0.0,2018,1
446,Video Caller Id,COMMUNICATION,4.2,15287,17.0,1000000,Free,0.0,Everyone,Communication,2018-01-24,2.2.245,4.0.3 and up,13.815512,9.634823,Top rated,0.0,2018,1
719,Monster Truck Driver & Racing,EDUCATION,4.4,748,51.0,1000000,Free,0.0,Everyone,Education;Action & Adventure,2017-01-19,1.0.9,2.3 and up,13.815512,6.618739,Top rated,0.0,2017,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10649,How it Works: FN SCAR assault rifle,FAMILY,4.6,44,45.0,10000,Free,0.0,Everyone,Casual,2018-01-09,2.1.9f7,4.0.3 and up,9.210440,3.806662,Top rated,0.0,2018,1
10683,Hunting Safari 3D,SPORTS,4.2,36183,20.0,5000000,Free,0.0,Teen,Sports,2018-01-20,1.4,2.1 and up,15.424949,10.496372,Top rated,0.0,2018,1
10686,Armed Cam Gun Pack,GAME,4.2,1012,50.0,10000,Free,0.0,Teen,Action,2015-01-18,1.0.2,3.0 and up,9.210440,6.920672,Top rated,0.0,2015,1
10767,NFP 2018,EVENTS,4.8,8,16.0,500,Free,0.0,Everyone,Events,2018-01-09,1.0.3,4.2 and up,6.216606,2.197225,Top rated,0.0,2018,1


In [132]:
# Getting the top 10 categories by number of installs
top_catg_T3= (
    filtered_apps_T3.groupby('Category')['Installs']
    .sum()
    .nlargest(10)
    .index
)
top_catg_T3

Index(['FAMILY', 'SPORTS', 'GAME', 'ENTERTAINMENT', 'PERSONALIZATION',
       'PHOTOGRAPHY', 'LIFESTYLE', 'EDUCATION', 'TOOLS', 'TRAVEL_AND_LOCAL'],
      dtype='object', name='Category')

In [133]:
# Now that we have top 10 categories based on the no. of Installs done, moving with filtering the apps based on it.
filtered_apps_T3 = filtered_apps_T3[filtered_apps_T3['Category'].isin(top_catg_T3)]
filtered_apps_T3

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs,Log_Reviews,Rating_Group,Revenue,Year,Month
719,Monster Truck Driver & Racing,EDUCATION,4.4,748,51.0,1000000,Free,0.0,Everyone,Education;Action & Adventure,2017-01-19,1.0.9,2.3 and up,13.815512,6.618739,Top rated,0.0,2017,1
748,Memorado - Brain Games,EDUCATION,4.4,56897,97.0,1000000,Free,0.0,Everyone,Education;Brain Games,2017-01-16,1.10.0,4.1 and up,13.815512,10.949015,Top rated,0.0,2017,1
917,Nick,ENTERTAINMENT,4.2,123279,25.0,10000000,Free,0.0,Everyone 10+,Entertainment;Music & Video,2018-01-24,2.0.8,4.4 and up,16.118096,11.722213,Top rated,0.0,2018,1
945,WWE,ENTERTAINMENT,4.5,736864,20.0,10000000,Free,0.0,Teen,Entertainment,2018-01-19,3.17.2,4.1 and up,16.118096,13.510160,Top rated,0.0,2018,1
986,Nick Jr. - Shows & Games,ENTERTAINMENT,4.2,8968,35.0,1000000,Free,0.0,Everyone,Entertainment;Music & Video,2018-01-23,1.0.20,4.4 and up,13.815512,9.101529,Top rated,0.0,2018,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10552,FK Crvena zvezda,SPORTS,4.9,1211,15.0,10000,Free,0.0,Everyone,Sports,2018-01-22,1.0.1,4.4 and up,9.210440,7.100027,Top rated,0.0,2018,1
10649,How it Works: FN SCAR assault rifle,FAMILY,4.6,44,45.0,10000,Free,0.0,Everyone,Casual,2018-01-09,2.1.9f7,4.0.3 and up,9.210440,3.806662,Top rated,0.0,2018,1
10683,Hunting Safari 3D,SPORTS,4.2,36183,20.0,5000000,Free,0.0,Teen,Sports,2018-01-20,1.4,2.1 and up,15.424949,10.496372,Top rated,0.0,2018,1
10686,Armed Cam Gun Pack,GAME,4.2,1012,50.0,10000,Free,0.0,Teen,Action,2015-01-18,1.0.2,3.0 and up,9.210440,6.920672,Top rated,0.0,2015,1


In [134]:
# Calculating average rating and total review counts for each category
comp_T3 = (
    filtered_apps_T3.groupby('Category')[['Rating', 'Reviews']]
    .agg({'Rating': 'mean', 'Reviews': 'sum'})
    .reset_index()
)
comp_T3

Unnamed: 0,Category,Rating,Reviews
0,EDUCATION,4.4,57645
1,ENTERTAINMENT,4.3,869111
2,FAMILY,4.395455,4544623
3,GAME,4.313333,2397589
4,LIFESTYLE,4.38,42809
5,PERSONALIZATION,4.475,155996
6,PHOTOGRAPHY,4.15,563720
7,SPORTS,4.342857,1982017
8,TOOLS,4.2,8010
9,TRAVEL_AND_LOCAL,4.1,974


In [166]:
# Ensuring that the chart is displayed only between 3 PM and 5 PM IST
start_T3 = time(15, 0)  
end_T3 = time(17, 0)

current_time_T3 = datetime.utcnow()
current_time_T3 = (current_time_T3+ pd.Timedelta(hours=5, minutes=30)).time()


datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC).



In [167]:
#Figure 13
if start_T3  <= current_time_T3  <= end_T3:
    
    fig13 = go.Figure()

    # Adding bar trace for average rating
    fig13.add_trace(
        go.Bar(
            x=comp_T3['Category'],
            y=comp_T3['Rating'],
            name="Average Rating",
            text=comp_T3['Rating'],
            textposition='auto',
            marker_color='purple',
            opacity=0.7
        )
    )

    # Adding bar trace for total review counts
    fig13.add_trace(
        go.Bar(
            x=comp_T3['Category'],
            y=comp_T3['Reviews'],
            name="Total Review Count",
            text=comp_T3['Reviews'],
            textposition='auto',
            marker_color='orange',
            opacity=0.7
        )
    )

    fig13.update_layout(
        title="Comparison of Average Rating and Total Review Counts (Top 10 Categories by Installs)",
        xaxis=dict(title="App Category", tickangle=-45),
        yaxis=dict(title="Average Rating"),
        yaxis2=dict(
            title="Total Review Count",
            overlaying='y',
            side='right'
        ),
        legend=dict(x=0.1, y=1.1),
        barmode="group"
    )

    # Saving chart as HTML file
    save_plot_as_html(fig13,"grouped_bar_chart.html","")

    # Opening the chart in the browser
    webbrowser.open("grouped_bar_chart.html")
else:
    print("Chart is not available outside the time range (3 PM - 5 PM IST).")

Chart is not available outside the time range (3 PM - 5 PM IST).


**Task 4** Plot a time series line chart to show the trend of total installs over time, segmented by app category. Highlight periods of significant growth by shading the areas under the curve where the increase in installs exceeds 20% month-over-month and app name should not starts with x, y ,z and app category should start with letter " E " or " C " or " B " and reviews should be more than 500 as well as this graph should work only between 6 PM IST to 9 PM IST apart from that time we should not show this graph in dashboard itself.

In [137]:
filtered_data_T4 = apps_df[(apps_df['Content Rating'] == 'Teen')]
filtered_data_T4

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs,Log_Reviews,Rating_Group,Revenue,Year,Month
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,50000000,Free,0.0,Teen,Art & Design,2018-06-08,Varies with device,4.2 and up,17.727534,12.281389,Top rated,0.0,2018,6
12,Tattoo Name On My Photo Editor,ART_AND_DESIGN,4.2,44829,20.0,10000000,Free,0.0,Teen,Art & Design,2018-04-02,3.8,4.1 and up,16.118096,10.710633,Top rated,0.0,2018,4
34,I Creative Idea,ART_AND_DESIGN,4.7,353,4.2,10000,Free,0.0,Teen,Art & Design,2018-04-27,1.6,4.1 and up,9.210440,5.869297,Top rated,0.0,2018,4
72,"Android Auto - Maps, Media, Messaging & Voice",AUTO_AND_VEHICLES,4.2,271920,16.0,10000000,Free,0.0,Teen,Auto & Vehicles,2018-07-11,Varies with device,5.0 and up,16.118096,12.513267,Top rated,0.0,2018,7
112,Selfie Camera Photo Editor & Filter & Sticker,BEAUTY,4.1,187,30.0,50000,Free,0.0,Teen,Beauty,2018-07-24,3.0.1,4.0.3 and up,10.819798,5.236442,Top rated,0.0,2018,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10803,Fatal Raid - No.1 Mobile FPS,GAME,4.3,56496,81.0,1000000,Free,0.0,Teen,Action,2018-08-07,1.5.447,4.0 and up,13.815512,10.941943,Top rated,0.0,2018,8
10804,Poker Pro.Fr,GAME,4.2,5442,17.0,100000,Free,0.0,Teen,Card,2018-05-22,4.1.3,2.3 and up,11.512935,8.602086,Top rated,0.0,2018,5
10814,FR: My Secret Pets!,FAMILY,4.0,785,31.0,50000,Free,0.0,Teen,Entertainment,2015-06-03,1.3.1,3.0 and up,10.819798,6.666957,Top rated,0.0,2015,6
10819,Fanfic-FR,BOOKS_AND_REFERENCE,3.3,52,3.6,5000,Free,0.0,Teen,Books & Reference,2017-08-05,0.3.4,4.1 and up,8.517393,3.970292,Above average,0.0,2017,8


In [138]:
filtered_data_T4 =filtered_data_T4[(filtered_data_T4['App'].str.startswith('E'))]
filtered_data_T4

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs,Log_Reviews,Rating_Group,Revenue,Year,Month
1047,Endurance Lifestyle,EVENTS,4.6,7,3.8,500,Free,0.0,Teen,Events,2017-07-10,2.1,4.2 and up,6.216606,2.079442,Top rated,0.0,2017,7
1354,"Eve Period Tracker - Love, Sex & Relationships...",HEALTH_AND_FITNESS,4.6,20326,28.0,1000000,Free,0.0,Teen,Health & Fitness,2018-08-04,2.9.18,4.1 and up,13.815512,9.919705,Top rated,0.0,2018,8
1786,Episode - Choose Your Story,GAME,4.3,1841061,,50000000,Free,0.0,Teen,Simulation,2018-07-31,Varies with device,Varies with device,17.727534,14.425853,Top rated,0.0,2018,7
1927,Earn to Die 2,GAME,4.6,1327265,99.0,50000000,Free,0.0,Teen,Racing,2017-04-12,1.3,2.3.3 and up,17.727534,14.098632,Top rated,0.0,2017,4
1978,Earn to Die 2,GAME,4.6,1327269,99.0,50000000,Free,0.0,Teen,Racing,2017-04-12,1.3,2.3.3 and up,17.727534,14.098635,Top rated,0.0,2017,4
2677,EHS Dongsen Shopping,SHOPPING,3.6,3656,9.0,1000000,Free,0.0,Teen,Shopping,2018-08-03,4.16.2,4.1 and up,13.815512,8.204398,Above average,0.0,2018,8
2768,Etsy: Handmade & Vintage Goods,SHOPPING,4.3,95520,15.0,10000000,Free,0.0,Teen,Shopping,2018-08-03,5.3.1,4.1 and up,16.118096,11.467101,Top rated,0.0,2018,8
4066,E!,FAMILY,3.1,740,20.0,100000,Free,0.0,Teen,Entertainment,2018-05-24,3.3.1.41,4.4 and up,11.512935,6.608001,Above average,0.0,2018,5
4067,E! News,NEWS_AND_MAGAZINES,4.0,15443,25.0,1000000,Free,0.0,Teen,News & Magazines,2018-06-19,4.2.133,4.4 and up,13.815512,9.644976,Top rated,0.0,2018,6
4073,Eternium,FAMILY,4.8,1506783,89.0,10000000,Free,0.0,Teen,Role Playing,2018-07-18,1.2.115,4.0 and up,16.118096,14.225488,Top rated,0.0,2018,7


In [139]:
filtered_data_T4 = filtered_data_T4[(filtered_data_T4['Installs'] > 10000)]
filtered_data_T4

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs,Log_Reviews,Rating_Group,Revenue,Year,Month
1354,"Eve Period Tracker - Love, Sex & Relationships...",HEALTH_AND_FITNESS,4.6,20326,28.0,1000000,Free,0.0,Teen,Health & Fitness,2018-08-04,2.9.18,4.1 and up,13.815512,9.919705,Top rated,0.0,2018,8
1786,Episode - Choose Your Story,GAME,4.3,1841061,,50000000,Free,0.0,Teen,Simulation,2018-07-31,Varies with device,Varies with device,17.727534,14.425853,Top rated,0.0,2018,7
1927,Earn to Die 2,GAME,4.6,1327265,99.0,50000000,Free,0.0,Teen,Racing,2017-04-12,1.3,2.3.3 and up,17.727534,14.098632,Top rated,0.0,2017,4
1978,Earn to Die 2,GAME,4.6,1327269,99.0,50000000,Free,0.0,Teen,Racing,2017-04-12,1.3,2.3.3 and up,17.727534,14.098635,Top rated,0.0,2017,4
2677,EHS Dongsen Shopping,SHOPPING,3.6,3656,9.0,1000000,Free,0.0,Teen,Shopping,2018-08-03,4.16.2,4.1 and up,13.815512,8.204398,Above average,0.0,2018,8
2768,Etsy: Handmade & Vintage Goods,SHOPPING,4.3,95520,15.0,10000000,Free,0.0,Teen,Shopping,2018-08-03,5.3.1,4.1 and up,16.118096,11.467101,Top rated,0.0,2018,8
4066,E!,FAMILY,3.1,740,20.0,100000,Free,0.0,Teen,Entertainment,2018-05-24,3.3.1.41,4.4 and up,11.512935,6.608001,Above average,0.0,2018,5
4067,E! News,NEWS_AND_MAGAZINES,4.0,15443,25.0,1000000,Free,0.0,Teen,News & Magazines,2018-06-19,4.2.133,4.4 and up,13.815512,9.644976,Top rated,0.0,2018,6
4073,Eternium,FAMILY,4.8,1506783,89.0,10000000,Free,0.0,Teen,Role Playing,2018-07-18,1.2.115,4.0 and up,16.118096,14.225488,Top rated,0.0,2018,7
4332,EXO-L Amino for EXO Fans,SOCIAL,4.9,5677,67.0,50000,Free,0.0,Teen,Social,2018-07-13,1.8.19106,4.0.3 and up,10.819798,8.644354,Top rated,0.0,2018,7


In [140]:
# Extract year and month
filtered_data_T4['YearMonth'] = filtered_data_T4['Last Updated'].dt.to_period('M')
filtered_data_T4



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs,Log_Reviews,Rating_Group,Revenue,Year,Month,YearMonth
1354,"Eve Period Tracker - Love, Sex & Relationships...",HEALTH_AND_FITNESS,4.6,20326,28.0,1000000,Free,0.0,Teen,Health & Fitness,2018-08-04,2.9.18,4.1 and up,13.815512,9.919705,Top rated,0.0,2018,8,2018-08
1786,Episode - Choose Your Story,GAME,4.3,1841061,,50000000,Free,0.0,Teen,Simulation,2018-07-31,Varies with device,Varies with device,17.727534,14.425853,Top rated,0.0,2018,7,2018-07
1927,Earn to Die 2,GAME,4.6,1327265,99.0,50000000,Free,0.0,Teen,Racing,2017-04-12,1.3,2.3.3 and up,17.727534,14.098632,Top rated,0.0,2017,4,2017-04
1978,Earn to Die 2,GAME,4.6,1327269,99.0,50000000,Free,0.0,Teen,Racing,2017-04-12,1.3,2.3.3 and up,17.727534,14.098635,Top rated,0.0,2017,4,2017-04
2677,EHS Dongsen Shopping,SHOPPING,3.6,3656,9.0,1000000,Free,0.0,Teen,Shopping,2018-08-03,4.16.2,4.1 and up,13.815512,8.204398,Above average,0.0,2018,8,2018-08
2768,Etsy: Handmade & Vintage Goods,SHOPPING,4.3,95520,15.0,10000000,Free,0.0,Teen,Shopping,2018-08-03,5.3.1,4.1 and up,16.118096,11.467101,Top rated,0.0,2018,8,2018-08
4066,E!,FAMILY,3.1,740,20.0,100000,Free,0.0,Teen,Entertainment,2018-05-24,3.3.1.41,4.4 and up,11.512935,6.608001,Above average,0.0,2018,5,2018-05
4067,E! News,NEWS_AND_MAGAZINES,4.0,15443,25.0,1000000,Free,0.0,Teen,News & Magazines,2018-06-19,4.2.133,4.4 and up,13.815512,9.644976,Top rated,0.0,2018,6,2018-06
4073,Eternium,FAMILY,4.8,1506783,89.0,10000000,Free,0.0,Teen,Role Playing,2018-07-18,1.2.115,4.0 and up,16.118096,14.225488,Top rated,0.0,2018,7,2018-07
4332,EXO-L Amino for EXO Fans,SOCIAL,4.9,5677,67.0,50000,Free,0.0,Teen,Social,2018-07-13,1.8.19106,4.0.3 and up,10.819798,8.644354,Top rated,0.0,2018,7,2018-07


In [141]:
# Step 3: Aggregating the installs by category and month
installs = (
    filtered_data_T4.groupby(['YearMonth', 'Category'])['Installs']
    .sum()
    .reset_index()
    .sort_values('YearMonth')
)

In [142]:
# Step 4: Calculating the Month-over-Month Growth
installs['Pct_Change'] = (
    installs.groupby('Category')['Installs']
    .pct_change() * 100
)

In [143]:
# Step 5: Adding a 'Significant Growth' flag
installs['Significant Growth'] = installs['Pct_Change'] > 20
installs

Unnamed: 0,YearMonth,Category,Installs,Pct_Change,Significant Growth
0,2014-07,GAME,5000000,,False
1,2015-08,FAMILY,1000000,,False
2,2017-03,SOCIAL,1000000,,False
3,2017-04,FAMILY,100000,-90.0,False
4,2017-04,GAME,100000000,1900.0,True
5,2017-07,SPORTS,50000000,,False
6,2018-01,PHOTOGRAPHY,1000000,,False
7,2018-05,FAMILY,2100000,2000.0,True
8,2018-06,FAMILY,650000,-69.047619,False
9,2018-06,NEWS_AND_MAGAZINES,1000000,,False


**Base on the above output we can see that -**
1. The 'YearMonth' column offers a clear chronological sequence for monitoring app install trends and is well-structured.
2. The monthly percentage change in installs is appropriately shown in the Pct_Change column. Changes, both positive and bad, are accurately documented.
3. The Significant Growth column correctly flags periods where the percentage increase in installs exceeds 20%.
4. Instances (such as first entries for categories) where the Pct_Change cannot be computed because of missing prior data are suitably denoted as NaN.
5. The output accurately depicts key growth periods, including:


a. Growth rate for 2017-04 (GAME): 1900%. b. FAMILY 2018-05: 2000% growth. c. Family: 9923.08% increase in 2018–07.

In [144]:
installs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype    
---  ------              --------------  -----    
 0   YearMonth           17 non-null     period[M]
 1   Category            17 non-null     object   
 2   Installs            17 non-null     int64    
 3   Pct_Change          9 non-null      float64  
 4   Significant Growth  17 non-null     bool     
dtypes: bool(1), float64(1), int64(1), object(1), period[M](1)
memory usage: 693.0+ bytes


In [145]:
#Before creating the chart, converting the YearMonth column to a string format first
installs['YearMonth'] = installs['YearMonth'].astype(str)
installs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   YearMonth           17 non-null     object 
 1   Category            17 non-null     object 
 2   Installs            17 non-null     int64  
 3   Pct_Change          9 non-null      float64
 4   Significant Growth  17 non-null     bool   
dtypes: bool(1), float64(1), int64(1), object(2)
memory usage: 693.0+ bytes


In [146]:
#There are values in the 'YearMonth' column that are not in the correct YYYY-MM format. In particular the string may have extra text at the end.
#Checking the actual values in the YearMonth column to understand the format.
print(installs['YearMonth'].unique())

['2014-07' '2015-08' '2017-03' '2017-04' '2017-07' '2018-01' '2018-05'
 '2018-06' '2018-07' '2018-08']


In [147]:
# After cleaning, converting the 'YearMonth' to the datetime format finally.

installs['YearMonth'] = pd.to_datetime(installs['YearMonth'], format='%Y-%m', errors='coerce')
installs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   YearMonth           17 non-null     datetime64[ns]
 1   Category            17 non-null     object        
 2   Installs            17 non-null     int64         
 3   Pct_Change          9 non-null      float64       
 4   Significant Growth  17 non-null     bool          
dtypes: bool(1), datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 693.0+ bytes


In [172]:
# Step 5: Plot the Time Series Chart
current_time = datetime.now().time()
start_time = datetime.strptime("18:00", "%H:%M").time()
end_time = datetime.strptime("21:00", "%H:%M").time()

In [174]:
#Figure 14
if start_time <= current_time <= end_time:
    # Creating the line chart with Plotly
    fig14 = px.line(
        installs,
        x='YearMonth',
        y='Installs',
        color='Category',
        line_group='Category',
        title="Trend of Total Installs Over Time (Teen, Apps Starting with 'E')",
        labels={'YearMonth': 'Month-Year', 'Installs': 'Total Installs', 'Category': 'App Category'},
    )

    # Highlighting significant growth areas by adding filled areas
    for category in installs['Category'].unique():
        category_data = installs[(installs['Category'] == category) & (installs['Significant Growth'])]
        fig14.add_scatter(
            x=category_data['YearMonth'],
            y=category_data['Installs'],
            fill='tozeroy',
            mode='lines',
            name=f"Significant Growth: {category}",
            opacity=0.3
        )

    # Updating the layout for better visualization
    fig14.update_layout(
        xaxis_title="Month-Year",
        yaxis_title="Total Installs",
        template="plotly_white",
        legend_title="App Categories",
    )

    # Saving the chart as an HTML file
    html_file = "time_series.html"
    fig14.write_html(html_file)

    # Opening the chart in a web browser
    webbrowser.open(html_file)
else:
    print("Time Series Chart is not available outside the time range (6 PM - 9 PM IST).")

Time Series Chart is not available outside the time range (6 PM - 9 PM IST).


**Task 5.** Plot a bubble chart to analyze the relationship between app size (in MB) and average rating, with the bubble size representing the number of installs. Include a filter to show only apps with a rating higher than 3.5 and that belong to the Game, Beauty ,business , commics , commication , Dating , Entertainment , social and event categories. Reviews should be greater than 500 and sentiment subjectivity should be more than 0.5 and Installs should be more than 50k as well as this graph should work only between 5 PM IST to 7 PM IST apart from that time we should not show this graph in dashboard itself.

In [150]:
apps_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8892 entries, 0 to 10840
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   App             8892 non-null   object        
 1   Category        8892 non-null   object        
 2   Rating          8892 non-null   float64       
 3   Reviews         8892 non-null   int64         
 4   Size            7424 non-null   float64       
 5   Installs        8892 non-null   int64         
 6   Type            8892 non-null   object        
 7   Price           8892 non-null   float64       
 8   Content Rating  8892 non-null   object        
 9   Genres          8892 non-null   object        
 10  Last Updated    8892 non-null   datetime64[ns]
 11  Current Ver     8892 non-null   object        
 12  Android Ver     8892 non-null   object        
 13  Log_Installs    8892 non-null   float64       
 14  Log_Reviews     8892 non-null   float64       
 15  Rating_G

In [151]:
# Filter 1: Rating > 3.5
filtered_data_T5 = apps_df[apps_df['Rating'] > 3.5]
filtered_data_T5

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs,Log_Reviews,Rating_Group,Revenue,Year,Month
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,10000,Free,0.0,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up,9.210440,5.075174,Top rated,0.0,2018,1
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up,13.122365,6.875232,Above average,0.0,2018,1
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,5000000,Free,0.0,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up,15.424949,11.379520,Top rated,0.0,2018,8
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,50000000,Free,0.0,Teen,Art & Design,2018-06-08,Varies with device,4.2 and up,17.727534,12.281389,Top rated,0.0,2018,6
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,100000,Free,0.0,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up,11.512935,6.875232,Top rated,0.0,2018,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10834,FR Calculator,FAMILY,4.0,7,2.6,500,Free,0.0,Everyone,Education,2017-06-18,1.0.0,4.1 and up,6.216606,2.079442,Top rated,0.0,2017,6
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53.0,5000,Free,0.0,Everyone,Education,2017-07-25,1.48,4.1 and up,8.517393,3.663562,Top rated,0.0,2017,7
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6,100,Free,0.0,Everyone,Education,2018-07-06,1.0,4.1 and up,4.615121,1.609438,Top rated,0.0,2018,7
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,,1000,Free,0.0,Mature 17+,Books & Reference,2015-01-19,Varies with device,Varies with device,6.908755,4.744932,Top rated,0.0,2015,1


In [152]:
#Listing all the castegories present along with their counts before applying filter-2

# Count of apps in each category (original apps dataset)
category_counts_original = apps_df['Category'].value_counts()
print("App counts per category in the original dataset:")
print(category_counts_original)

# Count of apps in each category (after applying filter-1)
category_counts_filtered = filtered_data_T5['Category'].value_counts()
print("\nApp counts per category in the filtered dataset after Filter 1:")
print(category_counts_filtered)

App counts per category in the original dataset:
Category
FAMILY                 1718
GAME                   1074
TOOLS                   734
PRODUCTIVITY            334
FINANCE                 317
PERSONALIZATION         310
COMMUNICATION           307
LIFESTYLE               305
PHOTOGRAPHY             304
MEDICAL                 302
SPORTS                  286
BUSINESS                270
HEALTH_AND_FITNESS      262
SOCIAL                  244
NEWS_AND_MAGAZINES      214
TRAVEL_AND_LOCAL        205
SHOPPING                202
BOOKS_AND_REFERENCE     177
VIDEO_PLAYERS           160
DATING                  159
EDUCATION               129
MAPS_AND_NAVIGATION     124
ENTERTAINMENT           111
FOOD_AND_DRINK          106
WEATHER                  75
AUTO_AND_VEHICLES        73
HOUSE_AND_HOME           68
LIBRARIES_AND_DEMO       65
ART_AND_DESIGN           62
COMICS                   58
PARENTING                50
EVENTS                   45
BEAUTY                   42
Name: count, dtype

In [153]:
# Filter 2: Category is "GAME"
filtered_data_T5 = filtered_data_T5[filtered_data_T5['Category'] == 'GAME']
filtered_data_T5 

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs,Log_Reviews,Rating_Group,Revenue,Year,Month
1653,ROBLOX,GAME,4.5,4447388,67.0,100000000,Free,0.0,Everyone 10+,Adventure;Action & Adventure,2018-07-31,2.347.225742,4.1 and up,18.420681,15.307828,Top rated,0.0,2018,7
1654,Subway Surfers,GAME,4.5,27722264,76.0,1000000000,Free,0.0,Everyone 10+,Arcade,2018-07-12,1.90.0,4.1 and up,20.723266,17.137746,Top rated,0.0,2018,7
1655,Candy Crush Saga,GAME,4.4,22426677,74.0,500000000,Free,0.0,Everyone,Casual,2018-07-05,1.129.0.2,4.1 and up,20.030119,16.925762,Top rated,0.0,2018,7
1656,Solitaire,GAME,4.7,254258,23.0,10000000,Free,0.0,Everyone,Card,2018-08-01,2.137.0,4.1 and up,16.118096,12.446109,Top rated,0.0,2018,8
1657,Bubble Shooter,GAME,4.5,148897,46.0,10000000,Free,0.0,Everyone,Casual,2018-07-17,1.20.1,4.0.3 and up,16.118096,11.911017,Top rated,0.0,2018,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10791,Winter Wonderland,GAME,4.0,1287,38.0,50000,Free,0.0,Everyone,Word,2013-12-18,1.0,2.2 and up,10.819798,7.160846,Top rated,0.0,2013,12
10792,Soccer Clubs Logo Quiz,GAME,4.2,21661,16.0,1000000,Free,0.0,Everyone,Trivia,2018-05-24,1.3.81,4.0 and up,13.815512,9.983315,Top rated,0.0,2018,5
10793,Sid Story,GAME,4.4,28510,78.0,500000,Free,0.0,Teen,Card,2018-08-01,2.6.6,4.0.3 and up,13.122365,10.258045,Top rated,0.0,2018,8
10803,Fatal Raid - No.1 Mobile FPS,GAME,4.3,56496,81.0,1000000,Free,0.0,Teen,Action,2018-08-07,1.5.447,4.0 and up,13.815512,10.941943,Top rated,0.0,2018,8


In [154]:
# Filter 3: Installs > 50,000
filtered_data_T5 = filtered_data_T5 [filtered_data_T5 ['Installs'] > 50000]
filtered_data_T5

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs,Log_Reviews,Rating_Group,Revenue,Year,Month
1653,ROBLOX,GAME,4.5,4447388,67.0,100000000,Free,0.0,Everyone 10+,Adventure;Action & Adventure,2018-07-31,2.347.225742,4.1 and up,18.420681,15.307828,Top rated,0.0,2018,7
1654,Subway Surfers,GAME,4.5,27722264,76.0,1000000000,Free,0.0,Everyone 10+,Arcade,2018-07-12,1.90.0,4.1 and up,20.723266,17.137746,Top rated,0.0,2018,7
1655,Candy Crush Saga,GAME,4.4,22426677,74.0,500000000,Free,0.0,Everyone,Casual,2018-07-05,1.129.0.2,4.1 and up,20.030119,16.925762,Top rated,0.0,2018,7
1656,Solitaire,GAME,4.7,254258,23.0,10000000,Free,0.0,Everyone,Card,2018-08-01,2.137.0,4.1 and up,16.118096,12.446109,Top rated,0.0,2018,8
1657,Bubble Shooter,GAME,4.5,148897,46.0,10000000,Free,0.0,Everyone,Casual,2018-07-17,1.20.1,4.0.3 and up,16.118096,11.911017,Top rated,0.0,2018,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10784,Big Hunter,GAME,4.3,245455,84.0,10000000,Free,0.0,Everyone 10+,Action,2018-05-31,2.8.6,4.0 and up,16.118096,12.410873,Top rated,0.0,2018,5
10792,Soccer Clubs Logo Quiz,GAME,4.2,21661,16.0,1000000,Free,0.0,Everyone,Trivia,2018-05-24,1.3.81,4.0 and up,13.815512,9.983315,Top rated,0.0,2018,5
10793,Sid Story,GAME,4.4,28510,78.0,500000,Free,0.0,Teen,Card,2018-08-01,2.6.6,4.0.3 and up,13.122365,10.258045,Top rated,0.0,2018,8
10803,Fatal Raid - No.1 Mobile FPS,GAME,4.3,56496,81.0,1000000,Free,0.0,Teen,Action,2018-08-07,1.5.447,4.0 and up,13.815512,10.941943,Top rated,0.0,2018,8


In [None]:
# Time-based Display
start_time = time(17, 0) 
end_time = time(19, 0)   
current_time = datetime.now().time()

In [176]:
#Figure 15
# Displaying Bubble Chart only within that above time range
if start_time <= current_time <= end_time:
    #Creating the Bubble Chart
    fig15 = px.scatter(
        filtered_data_T5,
        x='Size',
        y='Rating',
        size='Installs',
        color='Installs',
        hover_name='App',
        title="Bubble Chart: Relationship between App Size and Ratings (Games Category)",
        labels={'Size_MB': 'App Size (MB)', 'Rating': 'Average Rating', 'Installs': 'Number of Installs'},
    )

    # Updating the layout for better visualization
    fig15.update_layout(
        xaxis_title="App Size (in MB)",
        yaxis_title="Average Rating",
        coloraxis_colorbar=dict(title="Installs"),
        template="plotly_white",
    )
    
    # Saving the chart as an HTML file
    html_file = "bubble_chart.html"
    fig15.write_html(html_file)
    
    # Opening the chart in a web browser
    webbrowser.open(html_file)
else:
    print("Bubble chart is not available outside the time range (5 PM - 7 PM IST).")

**Creating Final Dashboard for 5 Tasks**

In [None]:
import webbrowser

# Saving the dashboard HTML content
dashboard_filename = "Final-Dashboard.html"

dashboard_html = """ 
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Google Play Store Analytics Dashboard</title>
    <style>
        /* General Styles */
        body {
            font-family: 'Arial', sans-serif;
            margin: 0;
            padding: 0;
            background-color: #121212;
            color: white;
            transition: background-color 0.5s, color 0.5s;
            overflow-x: hidden;
        }

        /* Light Mode */
        body.light-mode {
            background-color: white;
            color: black;
        }

        /* Header */
        .header {
            text-align: center;
            padding: 20px;
            background: linear-gradient(90deg, #34A853, #0F9D58);
            color: white;
            font-size: 24px;
            font-weight: bold;
            display: flex;
            justify-content: center;
            align-items: center;
            gap: 15px;
            position: relative;
        }

        .header img {
            height: 50px;
            cursor: pointer;
            transition: transform 0.3s ease-in-out;
        }

        .light-mode .header {
            background: linear-gradient(90deg, #ffcc00, #ff9900);
        }

        /* Toggle Mode Button */
        .toggle-container {
            position: absolute;
            right: 20px;
            top: 20px;
            cursor: pointer;
            display: flex;
            align-items: center;
            gap: 10px;
        }

        .toggle-icon {
            width: 40px;
            height: 40px;
            transition: transform 0.3s ease-in-out;
        }

        .light-mode .toggle-icon {
            transform: rotate(180deg);
        }

        /* Container */
        .container {
            display: flex;
            flex-direction: column;
            align-items: center;
            padding: 20px;
            width: 100%;
        }

        /* Plot Cards */
        .plot-card {
            width: 90%;
            height: 600px;
            background: #1E1E1E;
            border-radius: 10px;
            overflow: hidden;
            position: relative;
            transition: transform 0.3s ease, box-shadow 0.3s ease;
            cursor: pointer;
            margin-bottom: 30px;
            box-shadow: 0px 5px 15px rgba(0, 0, 0, 0.3);
        }

        .light-mode .plot-card {
            background: #f9f9f9;
            color: black;
        }

        .plot-card:hover {
            transform: scale(1.02);
            box-shadow: 0px 10px 20px rgba(0, 0, 0, 0.4);
        }
        
        /* Disabled Graphs */
        .disabled {
            background: #333 !important;
            cursor: not-allowed;
            color: #bbb;
            text-align: centr;
            font-size: 20px;
            padding: 50px;
        }

        .light-mode .disabled {
            background: #e0e0e0;
            color: #666;
        }

        /* Plot Titles */
        .plot-title {
            text-align: center;
            font-size: 18px;
            font-weight: bold;
            padding: 10px;
        }

        /* Embed Graphs */
        .plot-card embed {
            width: 100%;
            height: 100%;
            border: none;
        }

        /* Responsive Design */
        @media (max-width: 768px) {
            .plot-card {
                width: 100%;
                height: 500px;
            }
        }
    </style>
    <script>
        function openPlot(filename) {
            window.open(filename, '_blank');
        }

        // Toggle Light/Dark Mode
        function toggleMode() {
            document.body.classList.toggle("light-mode");
            let modeIcon = document.getElementById("modeIcon");

            if (document.body.classList.contains("light-mode")) {
                localStorage.setItem("theme", "light");
                modeIcon.src = "https://cdn-icons-png.flaticon.com/512/1164/1164954.png"; // Light mode icon
            } else {
                localStorage.setItem("theme", "dark");
                modeIcon.src = "https://cdn-icons-png.flaticon.com/512/747/747374.png"; // Dark mode icon
            }
        }

        // Load the theme from localStorage
        window.onload = function () {
            if (localStorage.getItem("theme") === "light") {
                document.body.classList.add("light-mode");
                document.getElementById("modeIcon").src = "https://cdn-icons-png.flaticon.com/512/1164/1164954.png";
            }
        };
    </script>
</head>
<body>
    <div class="header">
        <img src="https://upload.wikimedia.org/wikipedia/commons/7/78/Google_Play_Store_badge_EN.svg" alt="Google Play Store Logo">
        Google Play Store Review Analytics
        <div class="toggle-container" onclick="toggleMode()">
            <img id="modeIcon" class="toggle-icon" src="https://cdn-icons-png.flaticon.com/512/747/747374.png" alt="Toggle Theme">
        </div>
    </div>
    <div class="container">
        <!-- Available Graphs -->
                
        <div class="plot-card" onclick="openPlot('revenue_installs_paid_apps.html')">
            <embed src="revenue_installs_paid_apps.html">
            <p class="plot-title">Revenue vs Installs (Paid Apps)</p>
        </div>

        <!-- Time-Restricted Graphs -->
        <div class="plot-card disabled">
            <p class="plot-title">Dual Axis Chart (Available 1 PM - 2 PM)</p>
        </div>
        <div class="plot-card disabled">
            <p class="plot-title">Grouped Bar Chart (Available 3 PM - 5 PM)</p>
        </div>
        <div class="plot-card disabled">
            <p class="plot-title">Time Series Chart (Available 6 PM - 9 PM)</p>
        </div>
        <div class="plot-card disabled">
            <p class="plot-title">Bubble Chart (Available 5 PM - 7 PM)</p>
        </div>
       
    </div>
</body>
</html>
"""

# Saving the dashboard as an HTML file
with open(dashboard_filename, "w", encoding="utf-8") as file:
    file.write(dashboard_html)

# Opening the dashboard in the web browser
webbrowser.open(dashboard_filename)

print("Dashboard has been successfully opened in the browser.")

Dashboard has been successfully opened in the browser.


In [None]:
import webbrowser
import http.server
import socketserver
import threading

# Defining the port
PORT = 8000

# Function to start the server in a separate thread
def start_server():
    handler = http.server.SimpleHTTPRequestHandler
    with socketserver.TCPServer(("", PORT), handler) as httpd:
        print(f"Serving at http://localhost:{PORT}")
        httpd.serve_forever()

# Starting the server in the background
threading.Thread(target=start_server, daemon=True).start()

# Opening the dashboard in the browser
webbrowser.open(f"http://localhost:{PORT}/Final-Dashboard.html")
print("Dashboard is now accessible at http://localhost:8000/Final-Dashboard.html")

Exception in thread Thread-4 (start_server):


Traceback (most recent call last):
  File "c:\Users\itzzb\AppData\Local\Programs\Python\Python313\Lib\threading.py", line 1041, in _bootstrap_inner
    self.run()
    ~~~~~~~~^^
  File "C:\Users\itzzb\AppData\Roaming\Python\Python313\site-packages\ipykernel\ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
    ~~~~~~~~~~~~~~~~~~~~~^^^^^^
  File "c:\Users\itzzb\AppData\Local\Programs\Python\Python313\Lib\threading.py", line 992, in run
    self._target(*self._args, **self._kwargs)
    ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\itzzb\AppData\Local\Temp\ipykernel_13804\2072188347.py", line 12, in start_server
    with socketserver.TCPServer(("", PORT), handler) as httpd:
         ~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\itzzb\AppData\Local\Programs\Python\Python313\Lib\socketserver.py", line 457, in __init__
    self.server_bind()
    ~~~~~~~~~~~~~~~~^^
  File "c:\Users\itzzb\AppData\Local\Programs\Python\Python313\Lib\socketserver.