In [199]:
import pandas as pd
# Load the cleaned playstore table; the first CSV column is used as the row index.
playstore_csv = '../week2/cleaned_playstore.csv'
data = pd.read_csv(playstore_csv, index_col=0)
data.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19000000.0,10000,Free,0.0,Everyone
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14000000.0,500000,Free,0.0,Everyone
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8700000.0,5000000,Free,0.0,Everyone
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25000000.0,50000000,Free,0.0,Teen
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2800000.0,100000,Free,0.0,Everyone


### Task 1: 
Find out whether there is a correlation between the price of the apps and their Content Rating (Teen, Everyone, Mature, etc.).

In [200]:
# `data` is already a DataFrame, and wrapping it in pd.DataFrame() does not copy
# it by default, so `df` could share its underlying data with `data`.
# An explicit copy makes `df` an independent working frame.
df = data.copy()

In [201]:
# Build one 0/1 indicator column per Content Rating level, then correlate each
# indicator with Price.
rating_indicators = df['Content Rating'].str.get_dummies()
corr = rating_indicators.corrwith(df['Price'])

In [202]:
# Every coefficient lies within about +/-0.02 of zero, so no Content Rating
# level shows a meaningful linear correlation with Price.
corr

Adults only 18+   -0.001150
Everyone           0.016444
Everyone 10+      -0.007333
Mature 17+        -0.010259
Teen              -0.009582
Unrated           -0.000939
dtype: float64

### Task 2:
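Find the sentiment of every app review in two ways: with the NP word-list files (`n.xlsx` for negative words, `p.xlsx` for positive words) and with the `afinn` library.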

#### NP files

In [203]:
# Word lists come from the first column of each header-less sheet.
# Further down, words found in `p` add to a review's score and words in `n` subtract.
negative_sheet = pd.read_excel('n.xlsx', header=None)
positive_sheet = pd.read_excel('p.xlsx', header=None)
n = list(negative_sheet[0])
p = list(positive_sheet[0])

# Reviews table; the first CSV column is used as the row index.
review = pd.read_csv('../week2/cleaned_playstore_reviews.csv', index_col=0)

In [204]:
# Remove the sentiment columns that ship with the dataset.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell
# re-runnable: a second run no longer raises KeyError for columns already gone.
review = review.drop(
    columns=['Sentiment', 'Sentiment_Polarity', 'Sentiment_Subjectivity'],
    errors='ignore',
)

In [205]:
import re

# `word in <list>` scans the whole list for every word of every review.
# Building sets once makes each lookup constant-time; the scores are unchanged.
positive_words = set(p)
negative_words = set(n)


def np_sentiment(text):
    """Return (# positive words) - (# negative words) for one lower-cased review.

    Every character other than an ASCII letter, digit or space is removed before
    the text is split on whitespace, so punctuation never blocks a match
    (as a consequence "don't" is looked up as "dont").
    """
    cleaned = re.sub('[^A-Za-z0-9 ]+', '', text)
    score = 0
    for word in cleaned.split():
        # The positive list is checked first, so a word present in both lists counts as +1.
        if word in positive_words:
            score += 1
        elif word in negative_words:
            score -= 1
    return score


Sentiment_score = [np_sentiment(text) for text in review['Translated_Review'].str.lower()]
review['Sentiment NP'] = Sentiment_score

In [206]:
# Preview the first rows to check the new 'Sentiment NP' column.
review.head()

Unnamed: 0,App,Translated_Review,Sentiment NP
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,4
1,104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室,Great,1
2,11st,Horrible ID verification,-1
3,1800 Contacts - Lens Store,Great hassle free way order contacts. Got call...,1
4,1LINE – One Line with One Touch,"gets 1* there's ad every single level restart,...",-1


#### Afinn lib

In [207]:
from afinn import Afinn

In [208]:
# Score every review text with Afinn and store the result as a new column.
afn = Afinn()
scores = list(map(afn.score, review['Translated_Review']))
review['Sentiment Afinn'] = scores

In [209]:
# Preview the first rows to compare 'Sentiment NP' with 'Sentiment Afinn'.
review.head()

Unnamed: 0,App,Translated_Review,Sentiment NP,Sentiment Afinn
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,4,13.0
1,104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室,Great,1,3.0
2,11st,Horrible ID verification,-1,-3.0
3,1800 Contacts - Lens Store,Great hassle free way order contacts. Got call...,1,4.0
4,1LINE – One Line with One Touch,"gets 1* there's ad every single level restart,...",-1,-2.0


### Task 3
For Free apps only, list the 5 highest and the 5 lowest values of the Afinn sentiment column, together with the app name and the app category. (This task was originally meant for Paid apps only, but it was changed to Free apps only.)

In [210]:
# Attach the playstore columns to each review. This is an inner join on 'App',
# so only apps present in both tables are kept.
merged_df = review.merge(df, how='inner', on='App')

In [211]:
# (rows, columns) of the reviews table, for comparison with the merged table.
review.shape

(641, 4)

In [212]:
# The inner join leaves fewer rows (609) than `review` has (641), so some
# reviewed apps found no match in the playstore table.
merged_df.shape

(609, 12)

In [213]:
# Strip the playstore-only columns so the merged rows have the same columns as `review`.
playstore_only_columns = [
    'Category', 'Rating', 'Reviews', 'Size',
    'Installs', 'Type', 'Price', 'Content Rating',
]
review_test = merged_df.drop(columns=playstore_only_columns)

In [214]:
# Reviews whose App has no match in the playstore table ("cleaned_playstore.csv").
# A direct membership test replaces concat + drop_duplicates(keep=False): that
# approach compared whole rows and would also discard unmatched reviews that
# happen to be exact duplicates of each other.
review[~review['App'].isin(df['App'])]

Unnamed: 0,App,Translated_Review,Sentiment NP,Sentiment Afinn
1,104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室,Great,1,3.0
14,591房屋交易-租屋、中古屋、新建案、實價登錄、別墅透天、公寓套房、捷運、買房賣房行情、房價...,This good searching house.,1,3.0
15,591房屋交易-香港,1. Look filtering criteria 2. Better store fil...,1,2.0
90,Aprender inglés con Wlingua,"A very good application, that's just the words...",1,3.0
118,BaBe Lite - Baca Berita Hemat Kuota,Great information,1,3.0
131,Bagan - Myanmar Keyboard,"Then updated hands-room, though, it hit the ke...",-1,-2.0
132,Banco Itaú,The new Itaú has gotten better on the one hand...,2,5.0
133,Banco do Brasil,"Lol, from Brazil bank are a joke. To what appl...",-3,0.0
134,Bancomer móvil,"I thought it was a mistake, but it turns out t...",-2,4.0
136,Bangla Newspaper – Prothom Alo,Develop app. It's slow. It takes much time vie...,-2,-2.0


In [215]:
# Drop the columns this ranking does not use.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell
# re-runnable: a second run no longer raises KeyError for columns already gone.
merged_df = merged_df.drop(
    columns=['Rating', 'Reviews', 'Size', 'Installs', 'Price',
             'Content Rating', 'Sentiment NP', 'Translated_Review'],
    errors='ignore',
)

In [216]:
# Task 3 covers Free apps only. Selecting Type == 'Free' (rather than
# Type != 'Paid') also excludes rows whose Type is missing or holds any other
# unexpected value. 'Type' is dropped afterwards because it is then constant.
merged_df = merged_df[merged_df['Type'] == 'Free'].drop(columns='Type')

In [221]:
# Keep the three reporting columns in display order and renumber the rows from 0.
# reset_index(drop=True) discards the old index directly, instead of turning it
# into an 'index' column and dropping that column again.
merged_df = merged_df[['App', 'Category', 'Sentiment Afinn']].reset_index(drop=True)

In [222]:
# Show the table (App, Category, Sentiment Afinn) before ranking.
merged_df

Unnamed: 0,App,Category,Sentiment Afinn
0,10 Best Foods for You,HEALTH_AND_FITNESS,13.0
1,11st,SHOPPING,-3.0
2,1800 Contacts - Lens Store,MEDICAL,4.0
3,1LINE – One Line with One Touch,GAME,-2.0
4,2018Emoji Keyboard 😂 Emoticons Lite -sticker&gif,PERSONALIZATION,4.0
...,...,...,...
596,Hotels Combined - Cheap deals,TRAVEL_AND_LOCAL,-3.0
597,Hotels.com: Book Hotel Rooms & Find Vacation D...,TRAVEL_AND_LOCAL,-2.0
598,Hotspot Shield Free VPN Proxy & Wi-Fi Security,TOOLS,-1.0
599,Hotstar,ENTERTAINMENT,0.0


In [223]:
# Rank the apps from the highest to the lowest Afinn score.
merged_df = merged_df.sort_values('Sentiment Afinn', ascending=False)

In [224]:
# TOP 5: the first rows are the highest Afinn scores only because merged_df
# was sorted in descending order by the preceding cell.
merged_df.head()

Unnamed: 0,App,Category,Sentiment Afinn
557,Hamilton — The Official App,ENTERTAINMENT,36.0
76,Angry Birds 2,GAME,32.0
538,HBO GO: Stream with TV Package,ENTERTAINMENT,30.0
256,Clash of Clans,GAME,27.0
315,Digit Save Money Automatically,FINANCE,26.0


In [226]:
# LOW 5: the last five rows of the descending-sorted table, re-sorted so the
# most negative score comes first.
bottom_rows = merged_df.tail(5)
bottom_rows.sort_values('Sentiment Afinn', ascending=True)

Unnamed: 0,App,Category,Sentiment Afinn
148,"BestCam Selfie-selfie, beauty camera, photo ed...",BEAUTY,-24.0
162,Black People Meet Singles Date,DATING,-15.0
203,CBS Sports Fantasy,SPORTS,-14.0
401,FamilySearch Tree,BOOKS_AND_REFERENCE,-13.0
176,Booking.com Travel Deals,TRAVEL_AND_LOCAL,-11.0


In [228]:
# TOP 5 and LOW 5 stacked in a single table (top rows first, then the lowest
# scores in ascending order).
top_rows = merged_df.head(5)
low_rows = merged_df.tail(5).sort_values('Sentiment Afinn')
pd.concat([top_rows, low_rows])

Unnamed: 0,App,Category,Sentiment Afinn
557,Hamilton — The Official App,ENTERTAINMENT,36.0
76,Angry Birds 2,GAME,32.0
538,HBO GO: Stream with TV Package,ENTERTAINMENT,30.0
256,Clash of Clans,GAME,27.0
315,Digit Save Money Automatically,FINANCE,26.0
148,"BestCam Selfie-selfie, beauty camera, photo ed...",BEAUTY,-24.0
162,Black People Meet Singles Date,DATING,-15.0
203,CBS Sports Fantasy,SPORTS,-14.0
401,FamilySearch Tree,BOOKS_AND_REFERENCE,-13.0
176,Booking.com Travel Deals,TRAVEL_AND_LOCAL,-11.0


### Task 4
What is the best category according to the Afinn sentiment values?

In [244]:
# Total Afinn score per category; the category with the largest total is shown.
# NOTE(review): at this point merged_df has already been filtered by app Type in
# an earlier cell, so this ranking does not cover paid apps - confirm that is intended.
# NOTE(review): a sum favours categories that simply have more reviews; consider
# comparing the mean score per category as well.
category_totals = merged_df.groupby('Category')[['Sentiment Afinn']].sum()
category_totals.sort_values(by='Sentiment Afinn', ascending=False).head(1)

Unnamed: 0_level_0,Sentiment Afinn
Category,Unnamed: 1_level_1
GAME,490.0
