In [1]:
# Dependencies
from bs4 import BeautifulSoup
import requests
from splinter import Browser
import pandas as pd
import time
import re
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, Float
Base = declarative_base()
import pymysql
pymysql.install_as_MySQLdb()
import numpy as np

# Set path 
import sys
sys.path.append('../')

# Config variables
from config import remote_db_endpoint, remote_db_port
from config import remote_gwsis_dbname, remote_gwsis_dbuser, remote_gwsis_dbpwd
from config import local_gwsis_dbname, local_gwsis_dbuser, local_gwsis_dbpwd 

Create local and remote engines and pass in MySQL connection

In [2]:
# local Connection Backup 
lengine = create_engine(f"mysql://{local_gwsis_dbuser}:{local_gwsis_dbpwd}@localhost:3306/{local_gwsis_dbname}?charset=utf8mb4")

# AWS Database Connection
engine = create_engine(f"mysql://{remote_gwsis_dbuser}:{remote_gwsis_dbpwd}@{remote_db_endpoint}:{remote_db_port}/{remote_gwsis_dbname}")

In [3]:
# Create a local database engine connection
lconn = lengine.connect()

# Create a remote database engine connection
conn = engine.connect()

In [4]:
# Use Splinter to initiate browser
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

Looking at the following brands/types of birth control
* Shot: Depo-provera
* Hormonal IUD: Mirena
* Non-hormonal IUD: ParaGard (reviews not available)
* Ring: Nuvaring
* Combination pill: Ortho Tri-Cyclen
* Progestin pill: Norethindrone
* Implant: Nexplanon (formerly implanon)
* Patch: Xulane

In [5]:
#These are the birth controls we will look at
types_list = ['depo-provera','mirena','nuvaring','ortho-tri-cyclen','norethindrone','nexplanon','xulane']

Scrape all the reiviews from [Everyday Health](https://www.everydayhealth.com/drugs/) for each of the birth control types. This may take several minutes!

In [6]:
bc_type = []
review_text = []
stars = []
use_case = []
publish_date = []

for t in types_list:
    # URL of page to be scraped
    url = f'https://www.everydayhealth.com/drugs/{t}/reviews'
    browser.visit(url)
    
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')  

    # Use Splinter to dynamically find the number of pages of reviews
    results = soup.find('div', class_='review-pagination')

    pages = results.text
    max_pages = re.search(r'1 of (.*?) Next', pages).group(1)
    max_pages = int(max_pages)
    i = max_pages + 1
    
    # Use Splinter to click through each page of reviews and scrape the data
    for x in range(1, i):

        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')

        # Retrieve the parent divs for all reviews
        results = soup.find_all('div', class_='review-container')

        # Loop through results to retrieve review details
        for result in results:
            body = result.p.text
            rating = result.find('div', class_="star-rating-print").text[0]
            bc = result.h3.span.text
            date = result.find('span', class_="time")['content']
            my_use = result.h3.text
            my_use_normalized = re.search(r'for (.*?)Report', my_use).group(1)

            bc_type.append(bc)
            review_text.append(body)
            stars.append(rating)
            use_case.append(my_use_normalized)
            publish_date.append(date)    

        try:
            browser.click_link_by_partial_text('Next')

            time.sleep(1)

        except:
            print("Scraping Complete")

Scraping Complete
Scraping Complete
Scraping Complete
Scraping Complete
Scraping Complete
Scraping Complete
Scraping Complete


In [7]:
# Number of reviews collected
len(stars)

2848

In [8]:
bc_df = pd.DataFrame({
    'Birth Control': bc_type,
    'Star Rating': stars,
    'Review': review_text,
    'Use': use_case,
    'Publish Date': publish_date
    })

Use VADER sentiment analysis to analyze the review text for each review. How do women feel about their birth control?

In [9]:
# Import and Initialize Sentiment Analyzer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

In [10]:
# Variables for holding sentiments
compound_list = []
positive_list = []
negative_list = []
neutral_list = []

for review in bc_df['Review']:

    # Run Vader Analysis on each review
    results = analyzer.polarity_scores(review)
    compound = results["compound"]
    pos = results["pos"]
    neu = results["neu"]
    neg = results["neg"]

    # Add each value to the appropriate list
    compound_list.append(compound)
    positive_list.append(pos)
    negative_list.append(neg)
    neutral_list.append(neu)

In [39]:
# Store it all in a DataFrame
bc_df = pd.DataFrame({
    'Birth Control': bc_type,
    'Star Rating': stars,
    'Review': review_text,
    'Use': use_case,
    'Publish Date': publish_date,
    'Compound': compound_list,
    'Positive': positive_list,
    'Negative': negative_list,
    'Neutral': neutral_list, 
    'Source' : 'Everyday Health'
    })


In [12]:
# Clean up date
bc_df['Publish Date'] = [date[:-11] for date in bc_df['Publish Date']]

In [42]:
# Create a dataframe showing averages for each type of birth control
depo_df = bc_df.loc[bc_df['Birth Control'] == 'Depo-Provera']
mirena_df = bc_df.loc[bc_df['Birth Control'] == 'Mirena']
implanon_df = bc_df.loc[bc_df['Birth Control'] == 'Implanon']
norethindrone_df = bc_df.loc[bc_df['Birth Control'] == 'Norethindrone']
nuvaring_df = bc_df.loc[bc_df['Birth Control'] == 'NuvaRing']
xulane_df = bc_df.loc[bc_df['Birth Control'] == 'Xulane']
ortho_df = bc_df.loc[bc_df['Birth Control'] == 'Ortho Tri-Cyclen']

In [59]:
# Classify each type of birth control
for index, row in bc_df.iterrows():
    if row['Birth Control'] == 'Depo-Provera':
        bc_df.loc[index,"Type"] = 'Shot'
    elif row['Birth Control'] == 'Mirena':
        bc_df.loc[index,"Type"] = 'Hormonal IUD'
    elif row['Birth Control'] == 'Implanon':
        bc_df.loc[index,"Type"] = 'Implant'
    elif row['Birth Control'] == 'Norethindrone':
        bc_df.loc[index,"Type"] = 'Progestin Pill'
    elif row['Birth Control'] == 'NuvaRing':
        bc_df.loc[index,"Type"] = 'Ring'
    elif row['Birth Control'] == 'Xulane':
        bc_df.loc[index,"Type"] = 'Patch'
    elif row['Birth Control'] == 'Ortho Tri-Cyclen':
        bc_df.loc[index,"Type"] = 'Combination Pill'

In [62]:
bc_df.head()

Unnamed: 0,Birth Control,Star Rating,Review,Use,Publish Date,Compound,Positive,Negative,Neutral,Source,Type
0,Depo-Provera,1,I started Depo when I was 13 years old. Im now...,Birth Control,2/22/2019 12:42:43 PM,-0.9851,0.067,0.23,0.703,Everyday Health,Shot
1,Depo-Provera,4,Have been on depo for 5 years first 4 were gre...,Birth Control,2/22/2019 7:09:42 AM,0.8489,0.149,0.022,0.828,Everyday Health,Shot
2,Depo-Provera,3,"I started depo when I was 17, I weighed about ...",Birth Control,2/21/2019 7:28:32 AM,-0.5764,0.08,0.112,0.808,Everyday Health,Shot
3,Depo-Provera,1,To much side effects don’t buy,Depression,2/20/2019 9:00:25 AM,0.0,0.0,0.0,1.0,Everyday Health,Shot
4,Depo-Provera,4,I was on depo from Sept 2009- Jan 2017. I didn...,Birth Control,2/20/2019 6:48:48 AM,-0.9095,0.033,0.246,0.721,Everyday Health,Shot


In [43]:
bc_df['Star Rating'] = bc_df['Star Rating'].astype('int64')
avg_rating = bc_df.groupby('Birth Control').mean()
avg_rating

Unnamed: 0_level_0,Star Rating,Compound,Positive,Negative,Neutral
Birth Control,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Depo-Provera,2.19697,-0.302126,0.092523,0.142263,0.750054
Implanon,2.62987,-0.13671,0.101545,0.126403,0.752532
Mirena,3.328929,-0.094028,0.103779,0.118938,0.768959
Norethindrone,2.285714,-0.344439,0.074704,0.139556,0.764566
NuvaRing,2.983333,-0.042537,0.10375,0.117683,0.770217
Ortho Tri-Cyclen,3.108696,-0.076774,0.104022,0.111804,0.762424
Xulane,3.255814,-0.013038,0.104093,0.10755,0.757333


In [63]:
# Why is this row misbehaving?
bc_df['Review'][2271]

"I wanted an easy birth control method. Definitely easy to use. Never fell out when it wasn't suppose to, and A LOT LESS acne. The CONS for me was gaining 50lbs, low libido, extra discharge (way extra), feeling like a huge balloon! I've been on it for about a year now and I'm stopping it today! I look like I'm 9 months pregnant no joke! It made my belly balloon out like I have beer gut and I don't drink beer. I feel like my whole body is swollen. The reason I've been on it so long was because I thought it was just me needing to work out more. NOPE! I've been killing myself and no change, still gaining about a pound a week! See ya Nuva Ring! ✌"

In [64]:
# Drop review containing emoji
bc_df_clean = bc_df.drop([2271])

In [65]:
# Save dataframe to AWS
bc_df_clean.to_sql(name='birth_control_db', if_exists='replace', con=conn, index=False)

In [66]:
# Check remote database to make sure it migrated correctly
remote_bc_data = pd.read_sql("SELECT * FROM birth_control_db", conn)
print(len(remote_bc_data))
remote_bc_data.head()

2847


Unnamed: 0,Birth Control,Star Rating,Review,Use,Publish Date,Compound,Positive,Negative,Neutral,Source,Type
0,Depo-Provera,1,I started Depo when I was 13 years old. Im now...,Birth Control,2/22/2019 12:42:43 PM,-0.9851,0.067,0.23,0.703,Everyday Health,Shot
1,Depo-Provera,4,Have been on depo for 5 years first 4 were gre...,Birth Control,2/22/2019 7:09:42 AM,0.8489,0.149,0.022,0.828,Everyday Health,Shot
2,Depo-Provera,3,"I started depo when I was 17, I weighed about ...",Birth Control,2/21/2019 7:28:32 AM,-0.5764,0.08,0.112,0.808,Everyday Health,Shot
3,Depo-Provera,1,To much side effects don’t buy,Depression,2/20/2019 9:00:25 AM,0.0,0.0,0.0,1.0,Everyday Health,Shot
4,Depo-Provera,4,I was on depo from Sept 2009- Jan 2017. I didn...,Birth Control,2/20/2019 6:48:48 AM,-0.9095,0.033,0.246,0.721,Everyday Health,Shot


In [67]:
# Check that long string was not truncated during transfer
print(remote_bc_data['Review'][0])

I started Depo when I was 13 years old. Im now 19 and just stopped the shots in September 2018. I was diagnosed with depression prior to starting the shot. Im also lactose intolerant and was prescribed calcium supplements. And my family has lots of history of breadt cancer and osteoporosis. During the shots i had no weight gain however, i actually lost about 20 pounds a year after. I lost my boobs too unfortunately lol. I was lazy and had no desire to do anything most of the time. i was also extremely moody and had bad ups and downs. Since getting off the shots my boobs hurt all the time, i’m constantly hungry, and have horrible fatigue. I get annoyed easily. My period is all over the place. I bleed probably once a week right now and its honestly horrible. It makes me worried that i’ll never be normal again. I wouldnt ever recommend this form of birth control to someone for an irregular period, especially as young as i was. If i had known about all the side effects i would have never g

In [68]:
# Save dataframe to MySQL
bc_df_clean.to_sql(name='birth_control_db', if_exists='replace', con=lconn, index=False)

In [69]:
# Check local database to make sure it migrated correctly
local_bc_data = pd.read_sql("SELECT * FROM birth_control_db", lconn)
print(len(local_bc_data))
local_bc_data.head()

2847


Unnamed: 0,Birth Control,Star Rating,Review,Use,Publish Date,Compound,Positive,Negative,Neutral,Source,Type
0,Depo-Provera,1,I started Depo when I was 13 years old. Im now...,Birth Control,2/22/2019 12:42:43 PM,-0.9851,0.067,0.23,0.703,Everyday Health,Shot
1,Depo-Provera,4,Have been on depo for 5 years first 4 were gre...,Birth Control,2/22/2019 7:09:42 AM,0.8489,0.149,0.022,0.828,Everyday Health,Shot
2,Depo-Provera,3,"I started depo when I was 17, I weighed about ...",Birth Control,2/21/2019 7:28:32 AM,-0.5764,0.08,0.112,0.808,Everyday Health,Shot
3,Depo-Provera,1,To much side effects don’t buy,Depression,2/20/2019 9:00:25 AM,0.0,0.0,0.0,1.0,Everyday Health,Shot
4,Depo-Provera,4,I was on depo from Sept 2009- Jan 2017. I didn...,Birth Control,2/20/2019 6:48:48 AM,-0.9095,0.033,0.246,0.721,Everyday Health,Shot


In [70]:
# Check that long string was not truncated during transfer
print(local_bc_data['Review'][0])

I started Depo when I was 13 years old. Im now 19 and just stopped the shots in September 2018. I was diagnosed with depression prior to starting the shot. Im also lactose intolerant and was prescribed calcium supplements. And my family has lots of history of breadt cancer and osteoporosis. During the shots i had no weight gain however, i actually lost about 20 pounds a year after. I lost my boobs too unfortunately lol. I was lazy and had no desire to do anything most of the time. i was also extremely moody and had bad ups and downs. Since getting off the shots my boobs hurt all the time, i’m constantly hungry, and have horrible fatigue. I get annoyed easily. My period is all over the place. I bleed probably once a week right now and its honestly horrible. It makes me worried that i’ll never be normal again. I wouldnt ever recommend this form of birth control to someone for an irregular period, especially as young as i was. If i had known about all the side effects i would have never g