# Web scraping

## Introduction
Scrapping twitter handles from the [social bakers site.](https://www.socialbakers.com/statistics/twitter/profiles/kenya)
The site has classified twitter users into categories like:
* Brands.
* Celebrities.
* Community.
* Entertainment.
* Media
* Places
* Society
* Sports

The handles listed are for some of the most popular users.
Each category has its unique URL. 

## Table of Contents
1. [Scraper](#Scraper)
2. Categories:
   * [Sports](#sports)
   * [Society](#Society)
   * [Place](#Place)
   * [Restaurants and cafes](#Restaurants-and-cafes)
   * [Media](#Media)
   * [Entertainment](#Entertainment)
   * [Community](#Community)
   * [Celebrities](#Celebrities)
   * [Brands](#Brands)
   
3. [Creating Final DF](#Concating-the-dataframes-to-one-mega-df.)
   



### Scrapper

In [1]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import pandas as pd
import os, sys

import fire

import sys
import os
import json
import matplotlib.pyplot as plt
import re
import string

import matplotlib.dates as mdates
import seaborn as sns

# to view all columns
pd.set_option("display.max.columns", None)

import preprocessor as p

Function: [credits](https://github.com/yabebalFantaye)

In [16]:
def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content  #.encode(BeautifulSoup.original_encoding)
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)
    
def get_elements(url, tag='',search={}, fname=None):
    """
    Downloads a page specified by the url parameter
    and returns a list of strings, one per tag element
    """
    
    if isinstance(url,str):
        response = simple_get(url)
    else:
        #if already it is a loaded html page
        response = url

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        
        res = []
        if tag:    
            for li in html.select(tag):
                for name in li.text.split('\n'):
                    if len(name) > 0:
                        res.append(name.strip())
                       
                
        if search:
            soup = html            
            
            
            r = ''
            if 'find' in search.keys():
                print('findaing',search['find'])
                soup = soup.find(**search['find'])
                r = soup

                
            if 'find_all' in search.keys():
                print('findaing all of',search['find_all'])
                r = soup.find_all(**search['find_all'])
   
            if r:
                for x in list(r):
                    if len(x) > 0:
                        res.extend(x)
            
        return res
    
def get_tag_elements(url, tag='h2'):
    """
    Downloads a page specified by the url parameter
    and returns a list of strings, one per tag element
    """
    
    response = simple_get(url)

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        names = set()
        for li in html.select(tag):
            for name in li.text.split('\n'):
                if len(name) > 0:
                    names.add(name.strip())
        return list(names)

    # Raise an exception if we failed to get any data from the url
    raise Exception('Error retrieving contents at {}'.format(url)) 
    
    
if get_ipython().__class__.__name__ == '__main__':
    fire(get_tag_elements)

<a id="sports"></a>
### sports

In [74]:
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/sport'
sports = get_elements(url, tag = 'a', search = {})

In [130]:
sport = []
for element in sports:
    for string in element.split(' '):
        if '@' in string:
            sport.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
sporting = []
for element in sport:
    element = trim_word(element, 1,1)
    sporting.append(element)
    
sporting_df = pd.DataFrame({'handles':sporting})
sporting_df.to_csv('sporting_df.csv')
sporting_df.head()

Unnamed: 0,handles
0,@OfficialKRU
1,@Nondies
2,@ingweleopards
3,@Gor_MahiaFC
4,@Monks_Rugby


### Society

In [132]:
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/society/education'
edu = get_elements(url, tag = 'a', search = {})

education = []
for element in edu:
    for string in element.split(' '):
        if '@' in string:
            education.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
educ = []
for element in education:
    element = trim_word(element, 1,1)
    educ.append(element)
    
edu_df = pd.DataFrame({'handles':educ})
edu_df.to_csv('edu_df.csv')
edu_df.head()

Unnamed: 0,handles
0,@StrathU
1,@CUEA_OFFICIAL
2,@DaystarUni
3,@moringaschool
4,@Maseno_Uni


In [133]:
# gov
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/society/governmental/page-1-3'
gov = get_elements(url, tag = 'a', search = {})

g = []
for element in gov:
    for string in element.split(' '):
        if '@' in string:
            g.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
gv = []
for element in g:
    element = trim_word(element, 1,1)
    gv.append(element)
    
gov_df = pd.DataFrame({'handles':gv})
gov_df.to_csv('gov_df.csv')
gov_df.head()

Unnamed: 0,handles
0,@StateHouseKenya
1,@IEBCKenya
2,@NPSOfficial_KE
3,@InteriorKE
4,@USEmbassyKenya


In [93]:
#conferences
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/society/conference'
conf = get_elements(url, tag = 'a', search = {})

c = []
for element in conf:
    for string in element.split(' '):
        if '@' in string:
            c.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
cf = []
for element in c:
    element = trim_word(element, 1,1)
    cf.append(element)
    
conf_df = pd.DataFrame({'handles':cf})
conf_df.to_csv('conf_df.csv')
conf_df.head()

Unnamed: 0,handles
0,@TheAGRF
1,@KenyaHomesExpo
2,@SMWNairobi


In [92]:
# politics
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/society/politics'
politics = get_elements(url, tag = 'a', search = {})

p = []
for element in politics:
    for string in element.split(' '):
        if '@' in string:
            p.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
pl = []
for element in p:
    element = trim_word(element, 1,1)
    pl.append(element)
    
politics_df = pd.DataFrame({'handles':pl})
politics_df.to_csv('politics_df.csv')
politics_df.head()

Unnamed: 0,handles
0,@WilliamsRuto
1,@RailaOdinga
2,@MikeSonko
3,@MarthaKarua
4,@KideroEvans


In [94]:
# ngo
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/society/ngo'
ngo = get_elements(url, tag = 'a', search = {})

p = []
for element in ngo:
    for string in element.split(' '):
        if '@' in string:
            p.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
pl = []
for element in p:
    element = trim_word(element, 1,1)
    pl.append(element)
    
ngo_df = pd.DataFrame({'handles':pl})
ngo_df.to_csv('ngo_df.csv')
ngo_df.head()

Unnamed: 0,handles
0,@KenyaRedCross
1,@SheldrickTrust
2,@MediaCouncilK
3,@UNHABITAT
4,@thekhrc


In [95]:
# prof
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/society/professional-association'
prof = get_elements(url, tag = 'a', search = {})

p = []
for element in prof:
    for string in element.split(' '):
        if '@' in string:
            p.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
pl = []
for element in p:
    element = trim_word(element, 1,1)
    pl.append(element)
    
prof_df = pd.DataFrame({'handles':pl})
prof_df.to_csv('ngo_df.csv')
prof_df.head()

Unnamed: 0,handles
0,@PROSAK_Kenya


### Place

In [98]:
# country
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/place/country'
country = get_elements(url, tag = 'a', search = {})

p = []
for element in country:
    for string in element.split(' '):
        if '@' in string:
            p.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
pl = []
for element in p:
    element = trim_word(element, 1,1)
    pl.append(element)
    
country_df = pd.DataFrame({'handles':pl})
country_df.to_csv('country_df.csv')
country_df.head()

Unnamed: 0,handles
0,@KResearcher
1,@magicalkenya
2,@kot
3,@MakeItKenya
4,@KenyaBuzz


In [110]:
# medical centre
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/place/medical-center'
med = get_elements(url, tag = 'a', search = {})


p = []
for element in med:
    for string in element.split(' '):
        if '@' in string:
            p.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
pl = []
for element in p:
    element = trim_word(element, 1,1)
    pl.append(element)
    
med_df = pd.DataFrame({'handles':pl})
med_df.to_csv('med_df.csv')
med_df.head()

Unnamed: 0,handles
0,@nhifkenya
1,@mmtckenya
2,@thenairobihosp
3,@AAR_Healthcare
4,@GertrudesHosp


### Restaurants and cafes

In [111]:
#cafe
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/place/restaurant-cafe'
cafe = get_elements(url, tag = 'a', search = {})
# cafe

p = []
for element in cafe:
    for string in element.split(' '):
        if '@' in string:
            p.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
pl = []
for element in p:
    element = trim_word(element, 1,1)
    pl.append(element)
    
cafe_df = pd.DataFrame({'handles':pl})
cafe_df.to_csv('cafe_df.csv')
cafe_df.head()

Unnamed: 0,handles
0,@urban_kichen
1,@Quepasakaren


### Media

In [112]:
# daily news
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/media/daily-news'
daily = get_elements(url, tag = 'a', search = {})
# daily


p = []
for element in daily:
    for string in element.split(' '):
        if '@' in string:
            p.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
pl = []
for element in p:
    element = trim_word(element, 1,1)
    pl.append(element)
    
daily_df = pd.DataFrame({'handles':pl})
daily_df.to_csv('daily_df.csv')
daily_df.head()

Unnamed: 0,handles
0,@dailynation
1,@StandardKenya
2,@TheStarKenya
3,@BD_Africa
4,@Kenyans


In [116]:
# media house
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/media/media-house'
media = get_elements(url, tag = 'a', search = {})

p = []
for element in media:
    for string in element.split(' '):
        if '@' in string:
            p.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
pl = []
for element in p:
    element = trim_word(element, 1,1)
    pl.append(element)
    
media_df = pd.DataFrame({'handles':pl})
media_df.to_csv('media_df.csv')
media_df.head()

Unnamed: 0,handles
0,@NationMediaGrp


In [118]:
# radio media
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/media/radio-media'
radio = get_elements(url, tag = 'a', search = {})

p = []
for element in radio:
    for string in element.split(' '):
        if '@' in string:
            p.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
pl = []
for element in p:
    element = trim_word(element, 1,1)
    pl.append(element)
    
radio_df = pd.DataFrame({'handles':pl})
radio_df.to_csv('radio_df.csv')
radio_df.head()

Unnamed: 0,handles
0,@CapitalFMKenya
1,@radiomaisha
2,@Kiss100kenya
3,@HomeboyzRadio
4,@Lit360KE_


In [119]:
# sportsMedia
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/media/sports-media'
sportsM = get_elements(url, tag = 'a', search = {})

p = []
for element in sportsM:
    for string in element.split(' '):
        if '@' in string:
            p.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
pl = []
for element in p:
    element = trim_word(element, 1,1)
    pl.append(element)
    
sportsM_df = pd.DataFrame({'handles':pl})
sportsM_df.to_csv('sportsM_df.csv')
sportsM_df.head()

Unnamed: 0,handles
0,@GameYetu
1,@SportpesaNews
2,@DN_Football
3,@Nation_Sport


### Entertainment

In [120]:
# apps
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/entertainment/apps'
apps = get_elements(url, tag = 'a', search = {})

p = []
for element in apps:
    for string in element.split(' '):
        if '@' in string:
            p.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
pl = []
for element in p:
    element = trim_word(element, 1,1)
    pl.append(element)
    
apps_df = pd.DataFrame({'handles':pl})
apps_df.to_csv('apps_df.csv')
apps_df.head()

Unnamed: 0,handles
0,@Ma3Route
1,@EasyTaxiKE
2,@OChargeKE
3,@olizaKE


In [121]:
# shows
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/entertainment/broadcast-show'
shows = get_elements(url, tag = 'a', search = {})

p = []
for element in shows:
    for string in element.split(' '):
        if '@' in string:
            p.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
pl = []
for element in p:
    element = trim_word(element, 1,1)
    pl.append(element)
    
shows_df = pd.DataFrame({'handles':pl})
shows_df.to_csv('shows_df.csv')
shows_df.head()

Unnamed: 0,handles
0,@KBCChannel1
1,@theTrendLive
2,@AMLiveNTV
3,@NTVSasa


### Community

In [123]:
# fun
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/community/fun'
fun = get_elements(url, tag = 'a', search = {})

p = []
for element in fun:
    for string in element.split(' '):
        if '@' in string:
            p.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
pl = []
for element in p:
    element = trim_word(element, 1,1)
    pl.append(element)
    
fun_df = pd.DataFrame({'handles':pl})
fun_df.to_csv('fun_df.csv')
fun_df.head()

Unnamed: 0,handles
0,@tafaqari


In [124]:
# hobbies
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/community/hobbies'
hobbies = get_elements(url, tag = 'a', search = {})

p = []
for element in hobbies:
    for string in element.split(' '):
        if '@' in string:
            p.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
pl = []
for element in p:
    element = trim_word(element, 1,1)
    pl.append(element)
    
hobbies_df = pd.DataFrame({'handles':pl})
hobbies_df.to_csv('hobbies_df.csv')
hobbies_df.head()

Unnamed: 0,handles
0,Zuku_WeCar
1,Zuku_WeCar
2,Zuku_WeCar
3,Zuku_WeCar
4,@AfricanProverbs


In [125]:
# life style
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/community/life-style'
life = get_elements(url, tag = 'a', search = {})

p = []
for element in life:
    for string in element.split(' '):
        if '@' in string:
            p.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
pl = []
for element in p:
    element = trim_word(element, 1,1)
    pl.append(element)
    
life_df = pd.DataFrame({'handles':pl})
life_df.to_csv('life_df.csv')
life_df.head()

Unnamed: 0,handles
0,@xtiandela
1,@ItsJacksonKE
2,@EveWomanKenya
3,@6lvcsupreme
4,@lizmarami


### Celebrities

In [126]:
# actors
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/celebrities/actor'
actor = get_elements(url, tag = 'a', search = {})

p = []
for element in actor:
    for string in element.split(' '):
        if '@' in string:
            p.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
pl = []
for element in p:
    element = trim_word(element, 1,1)
    pl.append(element)
    
actor_df = pd.DataFrame({'handles':pl})
actor_df.to_csv('actor_df.csv')
actor_df.head()

Unnamed: 0,handles
0,@MwalimChurchill
1,@iamedigathegi


In [109]:
# broadcasts
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/celebrities/broadcast-star'
broadcast = get_elements(url, tag = 'a', search = {})

p = []
for element in med:
    for string in element.split(' '):
        if '@' in string:
            p.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
pl = []
for element in p:
    element = trim_word(element, 1,1)
    pl.append(element)
    
med_df = pd.DataFrame({'handles':pl})
med_df.to_csv('med_df.csv')
med_df.head()

Unnamed: 0,handles
0,@nhifkenya
1,@mmtckenya
2,@thenairobihosp
3,@AAR_Healthcare
4,@GertrudesHosp


In [134]:
# dj
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/celebrities/disc-jockey'
dj = get_elements(url, tag = 'a', search = {})

p = []
for element in dj:
    for string in element.split(' '):
        if '@' in string:
            p.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
pl = []
for element in p:
    element = trim_word(element, 1,1)
    pl.append(element)
    
dj_df = pd.DataFrame({'handles':pl})
dj_df.to_csv('dj_df.csv')
dj_df.head()

Unnamed: 0,handles
0,@DJSADIC
1,@DeejayCeleb


In [108]:
# musician
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/celebrities/musician'
musician = get_elements(url, tag = 'a', search = {})

p = []
for element in musician:
    for string in element.split(' '):
        if '@' in string:
            p.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
pl = []
for element in p:
    element = trim_word(element, 1,1)
    pl.append(element)
    
musician_df = pd.DataFrame({'handles':pl})
musician_df.to_csv('musician_df.csv')
musician_df.head()

Unnamed: 0,handles
0,@sautisol
1,@BahatiKenya
2,@H_ARTTHEBAND
3,Sailors25
4,@sailors254


In [107]:
# singer
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/celebrities/singer'
singer = get_elements(url, tag = 'a', search = {})

p = []
for element in singer:
    for string in element.split(' '):
        if '@' in string:
            p.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
pl = []
for element in p:
    element = trim_word(element, 1,1)
    pl.append(element)
    
singer_df = pd.DataFrame({'handles':pl})
singer_df.to_csv('singer_df.csv')
singer_df.head()

Unnamed: 0,handles
0,@RabbitTheKing
1,@Sanaipei_Tande
2,@KarunMusic
3,@DynoxxPetrah
4,@Kamnao


### Brands

In [105]:
# accom
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/brands/accommodation'
accom = get_elements(url, tag = 'a', search = {})

p = []
for element in accom:
    for string in element.split(' '):
        if '@' in string:
            p.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
pl = []
for element in p:
    element = trim_word(element, 1,1)
    pl.append(element)
    
accom_df = pd.DataFrame({'handles':pl})
accom_df.to_csv('accom_df.csv')
accom_df.head()

Unnamed: 0,handles
0,@serenahotels
1,@SarovaHotelsKen
2,@SarovaStanley
3,@HeritageKenya
4,@VillaRosaKempin


In [104]:
# airlines
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/brands/airlines'
airlines = get_elements(url, tag = 'a', search = {})

p = []
for element in airlines:
    for string in element.split(' '):
        if '@' in string:
            p.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
pl = []
for element in p:
    element = trim_word(element, 1,1)
    pl.append(element)
    
air_df = pd.DataFrame({'handles':pl})
air_df.to_csv('air_df.csv')
air_df.head()

Unnamed: 0,handles
0,@KenyaAirways
1,@FlyJambojet
2,@Flysafarilink
3,@silverstoneair
4,@KLM_KE


In [103]:
# alcohol
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/brands/alcohol'
alcohol = get_elements(url, tag = 'a', search = {})

p = []
for element in alcohol:
    for string in element.split(' '):
        if '@' in string:
            p.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
pl = []
for element in p:
    element = trim_word(element, 1,1)
    pl.append(element)
    
alcohol_df = pd.DataFrame({'handles':pl})
alcohol_df.to_csv('alcohol_df.csv')
alcohol_df.head()

Unnamed: 0,handles
0,@TuskerCider
1,@TuskerLager
2,@GuinnessKE
3,@KenyaCaneSpirit
4,@TuskerMaltLager


In [102]:
# auto
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/brands/auto'
auto = get_elements(url, tag = 'a', search = {})

p = []
for element in auto:
    for string in element.split(' '):
        if '@' in string:
            p.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
pl = []
for element in p:
    element = trim_word(element, 1,1)
    pl.append(element)
    
auto_df = pd.DataFrame({'handles':pl})
auto_df.to_csv('auto_df.csv')
auto_df.head()

Unnamed: 0,handles
0,@FordinKenya
1,@IsuzuKenya
2,@SubaruKenya
3,@chlorideexide
4,@BMW_Kenya


In [101]:
# beauty
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/brands/beauty'
beauty = get_elements(url, tag = 'a', search = {})

p = []
for element in beauty:
    for string in element.split(' '):
        if '@' in string:
            p.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
pl = []
for element in p:
    element = trim_word(element, 1,1)
    pl.append(element)
    
beauty_df = pd.DataFrame({'handles':pl})
beauty_df.to_csv('beauty_df.csv')
beauty_df.head()

Unnamed: 0,handles
0,@RexonaEA
1,@NyweleCreative


In [100]:
# bev
url = 'https://www.socialbakers.com/statistics/twitter/profiles/kenya/brands/beverages'
bev = get_elements(url, tag = 'a', search = {})

p = []
for element in bev:
    for string in element.split(' '):
        if '@' in string:
            p.append(string)
        
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
pl = []
for element in p:
    element = trim_word(element, 1,1)
    pl.append(element)
    
bev_df = pd.DataFrame({'handles':pl})
bev_df.to_csv('bev_df.csv')
bev_df.head()

Unnamed: 0,handles
0,@CocaColaAfrica
1,@NescafeKe
2,@KerichoGold
3,@KetepaLtd
4,@SpriteKenya


## Concating the dataframes to one mega df.

In [137]:
mega_dff = pd.concat([accom_df, actor_df, air_df, alcohol_df, apps_df, auto_df, beauty_df, bev_df, cafe_df, conf_df, country_df,
                     daily_df, edu_df, events_df, fun_df, gov_df, hobbies_df, life_df, mags_df, med_df, media_df, musician_df, ngo_df,
                     politics_df, radio_df, shows_df, singer_df, sporting_df, sportsM_df, dj_df, prof_df])
mega_dff = mega_dff.drop_duplicates(subset = ['handles'])
mega_dff.to_csv('mega1.csv')
mega_dff.shape

(172, 1)

#### [Back to top](#Web-scraping)

In [None]:
# from selenium import webdriver
# from selenium import webdriver
# from selenium.webdriver.firefox.options import Options as FirefoxOptions

# driver = webdriver.Firefox("/home/ada/10academy/training /week2/monday/kernels/geckodriver")

# import time

# driver.get("https://www.socialbakers.com/statistics/twitter/profiles/kenya/society")
# more_buttons = driver.find_elements_by_class_name("show-more-button btn-nw btn-nw--lg btn-nw--invisible ")
# for x in range(len(more_buttons)):
#     if more_buttons[x].is_displayed():
#         driver.execute_script("arguments[0].click();", more_buttons[x])
#         time.sleep(1)
        
# page_source = driver.page_source

# from bs4 import BeautifulSoup

# soup = BeautifulSoup(page_source, 'lxml')
# reviews = []
# reviews_selector = soup.find_all('a')
# # for review_selector in reviews_selector:
# #     review_div = review_selector.find('div', class_='dyn_full_review')
# #     if review_div is None:
# #         review_div = review_selector.find('div', class_='basic_review')
# #     review = review_div.find('div', class_='entry').find('p').get_text()
# #     review = review.strip()
# #     reviews.append(review)
# reviews



# def configure_firefox_driver():
#     # Add additional Options to the webdriver
#     firefox_options = FirefoxOptions()
#     # add the argument and make the browser Headless.
#     firefox_options.add_argument("--headless")

#     # Instantiate the Webdriver: Mention the executable path of the webdriver you have downloaded
#     # if driver is in PATH, no need to provide executable_path
#     driver = webdriver.Firefox(executable_path = "./geckodriver", options = firefox_options)
#     return driver


# dr = configure_firefox_driver()



# from selenium.common.exceptions import TimeoutException, NoSuchElementException
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC


# def get_field_text_if_exists(item, selector):
#     """Extracts a field by a CSS selector if exists."""
#     try:
#         return item.find_element_by_css_selector(selector).text
#     except NoSuchElementException:
#         return ""


# def get_link_if_exists(item, selector):
#     """Extracts an href attribute value by a CSS selector if exists."""
#     try:
#         return item.find_element_by_css_selector(selector).get_attribute("href")
#     except NoSuchElementException:
#         return ""


# wait = WebDriverWait(dr, 10)

# dr.get("https://www.socialbakers.com/statistics/twitter/profiles/kenya/society")


# soup = BeautifulSoup(dr.page_source,'html.parser')


#### [Back to top](#Web-scraping)