In [19]:
import json
import os

import numpy as np
import pandas as pd
from google_play_scraper import Sort, reviews_all
from openai import OpenAI

Scraping Amazon app reviews from the Google Play Store (Extraction)

In [20]:
def scrape_reviews(url):
    """Download the reviews of a Google Play app into a DataFrame.

    Args:
        url: Identifier passed straight to ``reviews_all``. Despite the
            parameter name, the notebook calls this with a Play Store package
            name (e.g. ``'com.amazon.mShop.android.shopping'``), not a URL.

    Returns:
        pandas.DataFrame: One row per review, one column per field of the
        review dicts returned by ``reviews_all``. Empty if no reviews came back.
    """
    teams_reviews = reviews_all(
        url,
        sleep_milliseconds=0,  # defaults to 0
        lang='en',  # defaults to 'en'
        country='us',  # defaults to 'us'
        sort=Sort.NEWEST,  # defaults to Sort.MOST_RELEVANT
    )

    # A list of dicts maps directly onto rows/columns; no NumPy round-trip
    # or join/pop dance is needed.
    review_df = pd.DataFrame(teams_reviews)
    print("Number of Reviews: ", len(review_df))
    return review_df

Cleaning and Preprocessing the dataframe (Transformation)

Storing Reviews in MongoDB (Load)

ETL Pipeline

In [21]:
def review_etl_pipeline(app_url):
    """Run the review ETL pipeline for one app and return the reviews.

    Only the extraction stage exists so far: the value produced by
    ``scrape_reviews(app_url)`` is returned unchanged.
    """
    # TODO: the transform() and load() stages are not implemented yet.
    extracted_reviews = scrape_reviews(app_url)
    return extracted_reviews

In [22]:
# Run the extraction pipeline for the Amazon app, identified by its package name.
review_df = review_etl_pipeline('com.amazon.mShop.android.shopping')
# Render the scraped reviews as this cell's output.
display(review_df)

Number of Reviews:  199


Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,dbf8b778-320b-468e-ad5c-a9fc9aae4d5f,Cynthia Nelson,https://play-lh.googleusercontent.com/a/ACg8oc...,thank you,5,0,28.6.0.100,2024-04-01 22:58:04,,,28.6.0.100
1,d0a09411-cb27-4c1a-8f89-4117478d811b,Joseph Edington,https://play-lh.googleusercontent.com/a-/ALV-U...,GOOD APP,5,0,,2024-04-01 22:46:12,,,
2,c9cc7f79-dba9-4539-a025-9ed36fe71d02,Zachary Limbert,https://play-lh.googleusercontent.com/a-/ALV-U...,My home page is now filled with Amazon influen...,2,0,28.6.0.100,2024-04-01 22:42:38,,,28.6.0.100
3,0e0408c4-4c5e-49ea-85f8-1a2bd5555ef3,Jerry,https://play-lh.googleusercontent.com/a/ACg8oc...,Very easy to connect using my Samsung phone.,5,0,,2024-04-01 22:38:40,,,
4,6679adf3-4abe-4419-9ae7-24f05c05713c,Steve Guerra,https://play-lh.googleusercontent.com/a-/ALV-U...,Like shopping,5,0,28.5.2.100,2024-04-01 22:32:58,,,28.5.2.100
...,...,...,...,...,...,...,...,...,...,...,...
194,454c85d3-6ee4-4051-b899-a11311e78a04,Hailee Mellor,https://play-lh.googleusercontent.com/a-/ALV-U...,"I love Amazon soo much, it makes things so eas...",5,0,28.6.0.100,2024-03-30 22:12:49,,,28.6.0.100
195,60f9cd15-5753-4191-b05c-7a93a8761a44,Rory Fortney jr,https://play-lh.googleusercontent.com/a-/ALV-U...,I Like Shopping,5,0,,2024-03-30 21:57:07,,,
196,96329942-0f0c-4a72-893e-c037cbc1ce76,Cora Wang,https://play-lh.googleusercontent.com/a-/ALV-U...,it's not possible to say any thing negative ab...,5,0,28.6.0.100,2024-03-30 21:55:56,,,28.6.0.100
197,9f48c1db-fc1f-4a1a-b8a2-d04889eea984,cliff pool,https://play-lh.googleusercontent.com/a/ACg8oc...,"How do I uninstall this, I don't use Amazon. I...",1,1,20.14.0.100,2024-03-30 21:48:44,,,20.14.0.100


Getting insights from the reviews using OpenAI's GPT-3.5 Turbo model (via the API and prompt engineering)

In [23]:
# Never hard-code credentials in a notebook: read the key from the environment.
# Fails fast with a KeyError if OPENAI_API_KEY is not set.
# NOTE(review): the key that was previously committed here must be treated as
# leaked and revoked.
client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
)

In [24]:
def get_completion_from_messages(system_prompt, user_prompt, interaction, model="gpt-3.5-turbo", temperature=0.4):
    """Send one interaction to the chat-completions endpoint and return the reply text.

    Args:
        system_prompt: Content of the ``system`` message.
        user_prompt: Instruction placed at the start of the ``user`` message.
        interaction: Text to analyse; appended to the user message after an
            ``Interaction:`` label.
        model: Model name forwarded to the API.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The ``content`` of the first choice's message.
    """
    # Put the labelled interaction on its own line; plain concatenation glued it
    # to the end of the instruction ("...'en'.Interaction: ...").
    user_content = f"{user_prompt}\nInteraction: {interaction}"

    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ],
        model=model,
        temperature=temperature,
    )

    return chat_completion.choices[0].message.content

In [25]:
def get_prompt():
    """Build the prompts used to extract insights from one Play Store review.

    Returns:
        tuple[str, str]: ``(system_prompt, user_prompt)``. The system prompt
        lists the insights to extract and the JSON structure of the reply; the
        user prompt asks for an answer in English.
    """
    # The output-format template below is itself valid JSON (no comments), so a
    # model that mirrors it produces text that json.loads can parse.
    system_prompt = """Analyze the provided Google Play Store review and extract the following insights:
    1) Sentiment: Determine the overall sentiment of the review, classifying as "Positive", "Negative", or "Neutral".
    2) Rating: Rate the review provided by the user, ranging from 1 to 5 stars.
    3) Issue Existence: Identify if there is any issue mentioned in the review. Classify as "Yes" or "No".
    4) Issue Type: If an issue is identified, categorize it into types such as "App issue", "Product issue", "Service issue", etc.
    5) Issue Details: Provide additional details regarding the identified issue, if available.
    6) Praise Existence: Identify if there is any praise mentioned in the review. Classify as "Yes" or "No".
    7) Praise Type: If a praise is identified, categorize it into types such as "App praise", "Product praise", "Service praise", etc.
    8) Praise Details: Provide additional details regarding the identified praise, if available.
    9) Update Mention: Determine if the review mentions an app update. Answer true or false.
    10) Feature Request: Determine if the review includes any feature requests. Answer true or false.
    11) User Loyalty: Assess the user's likelihood to continue using the app based on their review. Classify as "Likely", "Unlikely", or "Neutral".
    12) User Profile: Gather demographic information about the user if available, such as age, gender, location, etc.
    13) User Activity Level: Determine the user's activity level within the app based on their review. Classify as "High", "Medium", or "Low".
    14) User Experience: Evaluate the overall user experience mentioned in the review.
    15) User Suggestions: Identify any suggestions provided by the user for improvement.
    16) User Satisfaction: Gauge the overall satisfaction of the user based on their review. Classify as "Satisfied", "Neutral", or "Not Satisfied".
    17) Promoter Intent: Determine whether the user is likely to recommend the app to others. Classify as "Promoter", "Passive", or "Detractor".
    18) User Status: Identify if the user is a new user, old user, or somewhere in between.
    Respond with a single valid JSON object only, without comments or any text outside the object.
    "update_mention" and "feature_request" must be JSON booleans (true or false).
    For "age", "gender" and "location", use "Not Mentioned" when the review does not state the value.
    Output format:
    {
        "sentiment": "Positive / Negative / Neutral",
        "rating": "1-5",
        "issue_existence": "Yes / No",
        "issue": [
            {
                "issue_type": "string",
                "issue_details": "string"
            }
        ],
        "praise_existence": "Yes / No",
        "praise": [
            {
                "praise_type": "string",
                "praise_details": "string"
            }
        ],
        "update_mention": false,
        "feature_request": false,
        "user_loyalty": "Likely / Unlikely / Neutral",
        "user_profile": {
            "age": "string",
            "gender": "string",
            "location": "string"
        },
        "user_activity_level": "High / Medium / Low",
        "user_experience": "string",
        "user_suggestions": "string",
        "user_satisfaction": "Satisfied / Neutral / Not Satisfied",
        "promoter_intent": "Promoter / Passive / Detractor",
        "user_status": "New User / Old User / In Between"
    }
    """

    user_prompt = """Please respond in English: 'en'."""

    return system_prompt, user_prompt


In [26]:
def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence, if it has one."""
    stripped = text.strip()
    if len(stripped) >= 6 and stripped.startswith("```") and stripped.endswith("```"):
        body = stripped[3:-3]
        # Drop an optional language tag (e.g. "json") on the opening fence line.
        first_line, newline, rest = body.partition("\n")
        if newline and not first_line.strip().startswith(("{", "[")):
            body = rest
        return body.strip()
    return stripped


def _collect_entries(raw_entries, type_key, details_key):
    """Normalize issue/praise entries into a list of two-key dicts."""
    if isinstance(raw_entries, dict):
        # A single entry that was not wrapped in a list.
        raw_entries = [raw_entries]
    if not isinstance(raw_entries, list):
        # Missing, null, or otherwise unusable value.
        return []
    return [
        {type_key: entry.get(type_key, ''), details_key: entry.get(details_key, '')}
        for entry in raw_entries
        if isinstance(entry, dict)
    ]


def parse_google_play_review(response):
    """Parse the model's JSON reply for one review into a flat tuple.

    Missing or null fields fall back to defaults instead of raising, so a
    partially filled reply still yields a row: ``''`` for text fields, ``[]``
    for issue/praise lists, ``False`` for the two boolean flags and
    ``'Not Mentioned'`` for user-profile fields.

    Args:
        response: JSON text of the reply; a surrounding Markdown code fence
            (``` or ```json) is tolerated.

    Returns:
        tuple: ``(sentiment, rating, issue_existence, issue_list,
        praise_existence, praise_list, update_mention, feature_request,
        user_loyalty, user_profile, user_activity_level, user_experience,
        user_suggestions, user_satisfaction, promoter_intent, user_status)``.

    Raises:
        json.JSONDecodeError: If ``response`` is not valid JSON.
        TypeError: If the decoded JSON is not an object.
    """
    if isinstance(response, str):
        response = _strip_code_fence(response)
    response_json = json.loads(response)
    if not isinstance(response_json, dict):
        raise TypeError(
            f"Expected a JSON object, got {type(response_json).__name__}"
        )

    sentiment = response_json.get('sentiment', '')
    rating = response_json.get('rating', '')
    issue_existence = response_json.get('issue_existence', '')

    # Extracting issue details
    issue_list = _collect_entries(response_json.get('issue'), 'issue_type', 'issue_details')

    praise_existence = response_json.get('praise_existence', '')

    # Extracting praise details
    praise_list = _collect_entries(response_json.get('praise'), 'praise_type', 'praise_details')

    update_mention = response_json.get('update_mention', False)
    feature_request = response_json.get('feature_request', False)
    user_loyalty = response_json.get('user_loyalty', '')

    # Extracting user profile (tolerate a missing or null "user_profile")
    raw_profile = response_json.get('user_profile')
    if not isinstance(raw_profile, dict):
        raw_profile = {}
    user_profile = {
        "age": raw_profile.get('age', 'Not Mentioned'),
        "gender": raw_profile.get('gender', 'Not Mentioned'),
        "location": raw_profile.get('location', 'Not Mentioned')
    }

    user_activity_level = response_json.get('user_activity_level', '')
    user_experience = response_json.get('user_experience', '')
    user_suggestions = response_json.get('user_suggestions', '')
    user_satisfaction = response_json.get('user_satisfaction', '')
    promoter_intent = response_json.get('promoter_intent', '')
    user_status = response_json.get('user_status', '')

    return (
        sentiment, rating, issue_existence, issue_list,
        praise_existence, praise_list,
        update_mention, feature_request, user_loyalty,
        user_profile, user_activity_level, user_experience,
        user_suggestions, user_satisfaction, promoter_intent, user_status
    )

In [27]:
def insight_for_one(temp_df, review):
    """Analyse one review and return ``temp_df`` with its insight row appended.

    The review is sent to the model with the prompts from ``get_prompt``, the
    reply is parsed by ``parse_google_play_review``, and the parsed values are
    concatenated onto ``temp_df`` as a single new row. ``temp_df`` itself is
    not modified; the returned frame has a fresh 0..n-1 index.
    """
    # Column names in the same order as the tuple returned by the parser.
    insight_columns = (
        'Sentiment', 'Rating', 'Issue_Existence', 'Issue(s)',
        'Praise_Existence', 'Praise(s)', 'Update_Mention', 'Feature_Request',
        'User_Loyalty', 'User_Profile', 'User_Activity_Level',
        'User_Experience', 'User_Suggestions', 'User_Satisfaction',
        'Promoter_Intent', 'User_Status',
    )

    system_prompt, user_prompt = get_prompt()
    parsed_insights = parse_google_play_review(
        get_completion_from_messages(system_prompt, user_prompt, review)
    )

    # Wrap every value in a one-element list so list/dict values land in a
    # single cell instead of being expanded into several rows.
    row_data = {'Review': [review]}
    for column_name, value in zip(insight_columns, parsed_insights):
        row_data[column_name] = [value]

    # Append the new row to the DataFrame
    return pd.concat([temp_df, pd.DataFrame(row_data)], ignore_index=True)

In [28]:
def review_insights_for_all(review_df, max_reviews=3):
    """Generate insights for the first reviews in ``review_df``.

    Each review costs one model call, so the number of processed reviews is
    capped. The default of 3 keeps the previous hard-coded limit.

    Args:
        review_df: Object whose ``'content'`` entry is the iterable of review
            texts (in the notebook, the scraped reviews DataFrame).
        max_reviews: Maximum number of reviews to process; ``None`` processes
            all of them.

    Returns:
        pandas.DataFrame: One insight row per processed review, built by
        repeatedly calling ``insight_for_one``.

    Raises:
        ValueError: If ``max_reviews`` is negative (raised by ``islice``).
    """
    from itertools import islice

    review_insights_df = pd.DataFrame(columns=[
        'Review', 'Sentiment', 'Rating', 'Issue_Existence', 'Issue(s)', 'Praise_Existence',
        'Praise(s)', 'Update_Mention', 'Feature_Request', 'User_Loyalty', 'User_Profile',
        'User_Activity_Level', 'User_Experience', 'User_Suggestions', 'User_Satisfaction',
        'Promoter_Intent', 'User_Status',
    ])

    # islice(..., None) iterates over everything, so None means "no cap".
    selected_reviews = islice(review_df['content'], max_reviews)
    for processed_count, review in enumerate(selected_reviews, start=1):
        review_insights_df = insight_for_one(review_insights_df, review)
        # Progress indicator: number of reviews processed so far.
        print(processed_count)

    display(review_insights_df)
    return review_insights_df

Store Review Insights to the Database

Insights Pipeline

In [29]:
def review_insights_pipeline(review_df):
    """Generate review insights and save them as ``Review_Insights.csv``.

    The insights come from ``review_insights_for_all(review_df)`` and are
    written to the current working directory without the index column.
    Nothing is returned.
    """
    insights = review_insights_for_all(review_df)
    # TODO: store in the database; for now the result is only written to CSV.
    insights.to_csv("Review_Insights.csv", index=False)

    

In [30]:
# Generate insights for the scraped reviews and write them to Review_Insights.csv.
review_insights_pipeline(review_df)

1
2
3


Unnamed: 0,Review,Sentiment,Rating,Issue_Existence,Issue(s),Praise_Existence,Praise(s),Update_Mention,Feature_Request,User_Loyalty,User_Profile,User_Activity_Level,User_Experience,User_Suggestions,User_Satisfaction,Promoter_Intent,User_Status
0,thank you,Positive,5,No,[],Yes,"[{'praise_type': 'Interaction', 'praise_detail...",False,False,Likely,"{'age': 'Not Mentioned', 'gender': 'Not Mentio...",High,The user had a positive interaction and expres...,,Satisfied,Promoter,New User
1,GOOD APP,Positive,Not Mentioned,No,[],Yes,"[{'praise_type': 'App praise', 'praise_details...",False,False,Neutral,"{'age': 'Not Mentioned', 'gender': 'Not Mentio...",Not Mentioned,Positive,Not Mentioned,Satisfied,Passive,Not Mentioned
2,My home page is now filled with Amazon influen...,Negative,1,Yes,"[{'issue_type': 'App functionality', 'issue_de...",No,[],True,False,Unlikely,"{'age': 'Not Mentioned', 'gender': 'Not Mentio...",Medium,Filled with Amazon influencers instead of item...,Provide a way to remove or block influencers,Not Satisfied,Detractor,In Between
