# Google Play Store (User Reviews)

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd

First, we'll load the dataset into memory. Here, we're using pandas to read a CSV file.

In [2]:
# Read the raw review export (relative path, resolved against the current
# working directory) and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='googleplaystore_user_reviews.csv')
df.head(n=5)

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


In [3]:
# Column dtypes and non-null counts: a first look at how much data is missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64295 entries, 0 to 64294
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   App                     64295 non-null  object 
 1   Translated_Review       37427 non-null  object 
 2   Sentiment               37432 non-null  object 
 3   Sentiment_Polarity      37432 non-null  float64
 4   Sentiment_Subjectivity  37432 non-null  float64
dtypes: float64(2), object(3)
memory usage: 2.5+ MB


In [4]:
# Summary statistics for the two numeric sentiment columns.
df.describe()

Unnamed: 0,Sentiment_Polarity,Sentiment_Subjectivity
count,37432.0,37432.0
mean,0.182146,0.492704
std,0.351301,0.259949
min,-1.0,0.0
25%,0.0,0.357143
50%,0.15,0.514286
75%,0.4,0.65
max,1.0,1.0


## Cleaning data

In [5]:
# Count rows that repeat an earlier row across every column.
df.duplicated().sum()

33616

In [6]:
# Count rows whose review text repeats an earlier row's text.
# Missing reviews compare equal to one another here, so rows without any
# review text are included in this count.
df.duplicated(subset='Translated_Review').sum()

36300

In [7]:
# Flag every occurrence of a repeated review text (keep=False marks the first
# occurrence too), then display the flagged rows.
repeated_review_mask = df.duplicated(subset='Translated_Review', keep=False)
df.loc[repeated_review_mask]

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.00,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.40,0.875000
4,10 Best Foods for You,Best idea us,Positive,1.00,0.300000
...,...,...,...,...,...
64290,Houzz Interior Design Ideas,,,,
64291,Houzz Interior Design Ideas,,,,
64292,Houzz Interior Design Ideas,,,,
64293,Houzz Interior Design Ideas,,,,


In [8]:
# Missing values per column, largest first.
df.isnull().sum().sort_values(ascending=False)

Translated_Review         26868
Sentiment                 26863
Sentiment_Polarity        26863
Sentiment_Subjectivity    26863
App                           0
dtype: int64

In [9]:
# The same missing-value counts expressed as a percentage of all rows.
(df.isnull().sum() * 100 / len(df)).sort_values(ascending=False)

Translated_Review         41.788631
Sentiment                 41.780854
Sentiment_Polarity        41.780854
Sentiment_Subjectivity    41.780854
App                        0.000000
dtype: float64

In [10]:
# Distinct app names present in the raw data.
df['App'].unique()

array(['10 Best Foods for You', '104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室',
       '11st', ..., 'Hotwire Hotel & Car Rental App',
       'Housing-Real Estate & Property', 'Houzz Interior Design Ideas'],
      dtype=object)

In [11]:
# Number of distinct apps in the raw data.
df['App'].nunique()

1074

In [12]:
# Per-app count of missing values in each column. The null mask still carries
# an 'App' column of its own, so that column shows up in the result as well.
missing_by_app = df.isnull().groupby(df['App']).sum()

# Row-wise total across all columns, then rank apps from most to least missing.
missing_by_app = missing_by_app.assign(Total_Missing=missing_by_app.sum(axis=1))
missing_data_sorted = missing_by_app.sort_values(by='Total_Missing', ascending=False)
missing_data_sorted

Unnamed: 0_level_0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Total_Missing
App,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ESPN,0,240,240,240,240,960
Calorie Counter by FatSecret,0,200,200,200,200,800
"Bleacher Report: sports news, scores, & highlights",0,200,200,200,200,800
Granny,0,191,191,191,191,764
ClassDojo,0,180,180,180,180,720
...,...,...,...,...,...,...
Epocrates Plus,0,0,0,0,0,0
ConvertPad - Unit Converter,0,0,0,0,0,0
Candy Smash,0,0,0,0,0,0
Google PDF Viewer,0,0,0,0,0,0


In [13]:
# Drop only the rows in which all four review-related fields are missing;
# rows with at least one of them filled in are kept.
review_fields = [
    'Translated_Review',
    'Sentiment',
    'Sentiment_Polarity',
    'Sentiment_Subjectivity',
]
df = df.dropna(how='all', subset=review_fields)

In [14]:
# Re-count fully duplicated rows now that the empty-review rows are gone.
df.duplicated().sum()

7735

In [15]:
# Remove rows that are identical in every column, keeping the first occurrence.
df = df.drop_duplicates(keep='first')

In [16]:
# Sanity check: no fully duplicated rows should remain.
df.duplicated().sum()

0

In [17]:
# Missing values per column after removing empty and duplicated rows.
df.isnull().sum().sort_values(ascending=False)

Translated_Review         5
App                       0
Sentiment                 0
Sentiment_Polarity        0
Sentiment_Subjectivity    0
dtype: int64

In [18]:
# Pull out the rows that still have no review text so they can be inspected.
review_missing = df['Translated_Review'].isnull()
null_translated_review = df.loc[review_missing]
null_translated_review

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
268,11st,,Neutral,0.0,0.0
15048,Birds Sounds Ringtones & Wallpapers,,Neutral,0.0,0.0
22092,Calorie Counter - MyFitnessPal,,Neutral,0.0,0.0
31623,DC Comics,,Neutral,0.0,0.0
52500,Garden Photo Frames - Garden Photo Editor,,Neutral,0.0,0.0


In [19]:
# Keep only the rows that actually have review text.
df = df[df['Translated_Review'].notna()]

In [20]:
# Confirm that no missing values remain in any column.
df.isnull().sum().sort_values(ascending=False)

App                       0
Translated_Review         0
Sentiment                 0
Sentiment_Polarity        0
Sentiment_Subjectivity    0
dtype: int64

In [21]:
# Review texts that still repeat an earlier row. Fully identical rows were
# already dropped, so these rows differ in at least one other column.
df.duplicated(subset='Translated_Review').sum()

1698

In [28]:
# Every row whose review text occurs more than once. keep=False flags the first
# occurrence as well as the later repeats, so this frame already holds both the
# "original" and the "duplicated" lines.
duplicated_instances = df[df.duplicated(subset=['Translated_Review'], keep=False)]

# An isin() lookup of df against duplicated_instances would select exactly the
# same rows in the same order, so an independent copy is used instead of
# filtering df a second time.
original_and_duplicates = duplicated_instances.copy()

# Show the original and duplicated lines
original_and_duplicates

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
6,10 Best Foods for You,Amazing,Positive,0.6,0.90
36,10 Best Foods for You,Best,Positive,1.0,0.30
40,10 Best Foods for You,Good,Positive,0.7,0.60
51,10 Best Foods for You,I like,Neutral,0.0,0.00
56,10 Best Foods for You,This helpful,Neutral,0.0,0.00
...,...,...,...,...,...
64116,Hotstar,running,Neutral,0.0,0.00
64117,Hotstar,Best,Positive,1.0,0.30
64171,Hotwire Hotel & Car Rental App,Great,Positive,0.8,0.75
64192,Hotwire Hotel & Car Rental App,Great deals,Positive,0.8,0.75


In [27]:
# Row count, dtypes and non-null counts of the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29692 entries, 0 to 64230
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   App                     29692 non-null  object 
 1   Translated_Review       29692 non-null  object 
 2   Sentiment               29692 non-null  object 
 3   Sentiment_Polarity      29692 non-null  float64
 4   Sentiment_Subjectivity  29692 non-null  float64
dtypes: float64(2), object(3)
memory usage: 1.4+ MB


In [29]:
# Number of distinct apps that still have at least one row after cleaning.
df['App'].nunique()

865

In [32]:
# Distinct app names left after cleaning.
# NOTE(review): the array is now short enough that NumPy prints every element
# instead of abbreviating with '...'; consider slicing if the output is too long.
df['App'].unique()

array(['10 Best Foods for You', '104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室',
       '11st', '1800 Contacts - Lens Store',
       '1LINE – One Line with One Touch',
       '2018Emoji Keyboard 😂 Emoticons Lite -sticker&gif',
       '21-Day Meditation Experience',
       '2Date Dating App, Love and matching',
       '2GIS: directory & navigator', '2RedBeans',
       '2ndLine - Second Phone Number',
       '30 Day Fitness Challenge - Workout at Home',
       '365Scores - Live Scores', '3D Live Neon Weed Launcher',
       '4 in a Row', '4K Wallpapers and Ultra HD Backgrounds',
       '591房屋交易-租屋、中古屋、新建案、實價登錄、別墅透天、公寓套房、捷運、買房賣房行情、房價房貸查詢', '591房屋交易-香港',
       '7 Cups: Anxiety & Stress Chat', '7 Day Food Journal Challenge',
       '7 Minute Workout', '7 Weeks - Habit & Goal Tracker',
       '8 Ball Pool', '850 Sports News Digest',
       '8fit Workouts & Meal Planner', '95Live -SG#1 Live Streaming App',
       'A Call From Santa Claus!', 'A Word A Day',
       'A&E - Watch Full Episodes of TV Shows',
 

In [35]:
# Distinct polarity scores present in the cleaned data.
df['Sentiment_Polarity'].unique()

array([ 1.        ,  0.25      ,  0.4       , ..., -0.52857143,
       -0.37777778,  0.17333333])

In [36]:
# Distinct subjectivity scores present in the cleaned data.
df['Sentiment_Subjectivity'].unique()

array([0.53333333, 0.28846154, 0.875     , ..., 0.51145833, 0.7172619 ,
       0.2594697 ])

In [34]:
# Distinct sentiment labels; the analysis cells filter on these exact strings.
df['Sentiment'].unique()

array(['Positive', 'Neutral', 'Negative'], dtype=object)

## Analysis

In [37]:
# Rows labelled 'Positive'
is_positive = df['Sentiment'] == 'Positive'
positive_reviews = df.loc[is_positive]

# Number of positive reviews each app received, most-reviewed first
positive_reviews_count = positive_reviews['App'].value_counts()

# Keep the ten apps at the top of that ranking (change the slice to show more or fewer)
top_apps_positive_reviews = positive_reviews_count.iloc[:10]
print("Top apps with the most positive reviews:")
print(top_apps_positive_reviews)

Top apps with the most positive reviews:
ColorNote Notepad Notes           91
Calorie Counter - Macros          87
Family Locator - GPS Tracker      85
8fit Workouts & Meal Planner      82
Calorie Counter - MyNetDiary      82
Bible                             80
10 Best Foods for You             79
Google Photos                     76
Calorie Counter & Diet Tracker    75
Episode - Choose Your Story       74
Name: App, dtype: int64


In [38]:
# Rows labelled 'Negative'
is_negative = df['Sentiment'] == 'Negative'
negative_reviews = df.loc[is_negative]

# Number of negative reviews each app received, most-reviewed first
negative_reviews_count = negative_reviews['App'].value_counts()

# Keep the ten apps at the top of that ranking (change the slice to show more or fewer)
top_apps_negative_reviews = negative_reviews_count.iloc[:10]
print("Top apps with the most negative reviews:")
print(top_apps_negative_reviews)

Top apps with the most negative reviews:
Facebook                 59
Be A Legend: Soccer      59
Angry Birds Classic      59
Cooking Fever            57
Candy Crush Soda Saga    50
Gardenscapes             50
8 Ball Pool              46
Agar.io                  44
Basketball Stars         41
Block Puzzle             40
Name: App, dtype: int64


In [39]:
# Rows labelled 'Neutral'
is_neutral = df['Sentiment'] == 'Neutral'
neutral_reviews = df.loc[is_neutral]

# Number of neutral reviews each app received, most-reviewed first
neutral_reviews_count = neutral_reviews['App'].value_counts()

# Keep the ten apps at the top of that ranking (change the slice to show more or fewer)
top_apps_neutral_reviews = neutral_reviews_count.iloc[:10]
print("Top apps with the most neutral reviews:")
print(top_apps_neutral_reviews)

Top apps with the most neutral reviews:
BestCam Selfie-selfie, beauty camera, photo editor    35
Azar                                                  29
Fast Secure VPN                                       23
Easy Installer - Apps On SD                           23
Color by Number - Draw Sandbox Pixel Art              20
Facebook                                              18
A+ Mobile                                             18
AdWords Express                                       18
Blood Pressure(BP) Diary                              18
Calorie Counter - MyFitnessPal                        17
Name: App, dtype: int64


## Saving the cleaned data

In [26]:
# Write the cleaned reviews to a new CSV; index=False leaves the DataFrame
# index out of the file.
# NOTE(review): this cell's execution count is lower than that of the analysis
# cells above it; re-run the notebook top to bottom before relying on the file.
df.to_csv('googleplaystore_user_reviews_clean.csv', index=False)