## Scrapy VS Beautiful Soup


### Step1-Create selector

In [10]:
import scrapy
from scrapy import Selector
import requests

In [12]:
url = 'https://kylecavan.com/pages/schools'
html = requests.get(url).content

# .content!

In [29]:
# create selector from url or html

selu = Selector(text=url)
print(type(selu))

selh = Selector(text=html)
print(type(selh))

# ?Selector # for more information about text

<class 'scrapy.selector.unified.Selector'>
<class 'scrapy.selector.unified.Selector'>


### Step2-Extract data

In [15]:
# define path for all data with "school-menu" class

xpath1 = '//*[@class = "school-menu"]'
css1 = '.school-menu'

In [22]:
# extract with xpath
reh = selh.xpath(xpath1).extract_first()

# alternative with css
rec = selh.css(css1).extract_first()

### Notes
 - `selector.path` returns a SelectorList of Selector objects
 - `selector.path.extract` return data in that those Selector objects list
 - `selector.path.extract_first` return first element in that list

temp = selh.css(css1)  
temp

## Stop and think: "reh" above looks messy. What's next?
Inspect website data and find that target collections names are in links under "school-menu" class

In [50]:
# get all links in all decents of class "school-menu"

col = selh.css('.school-menu a::attr(href)').extract()

# validate result: get a list of 165 strings
print(len(col))
print(type(col))
print(type(col[0]))

# preview
col[0:5]

165
<class 'list'>
<class 'str'>


['/collections/alabama',
 '/collections/auburn',
 '/collections/baylor',
 '/collections/uc-berkeley',
 '/collections/boston-college']

### Tips: alternative ways with xpath and append

__xpath__  
`col_xpath = selh.xpath('//*[@class ="school-menu"]//a/@href').extract()`  

__xpath.css__  
`col_xadd = selh.xpath(xpath1).css(' a::attr(href)').extract()`  


## Next, it's just to clean text!


In [60]:
import pandas as pd 

def clean_col(x):
    # First, get rid of special characters
    x = x.strip("'")
    x = x.replace("/collections/",'')
    x = x.replace('-',' ')

    # Then, prettify text with each word in school name capitalized
    x = [x.capitalize() for x in x.split()]
    x = ' '.join(x)
    return x


In [61]:
# clean list
col_cleaned = [clean_col(x) for x in col]

# preview head and tail
print(col_cleaned[0:5])
print(col_cleaned[-5:-1])

['Alabama', 'Auburn', 'Baylor', 'Uc Berkeley', 'Boston College']
['Washington Lee', 'West Virginia', 'Williams College', 'William Mary']


## Looks great! 
## Finally, let's write result to a dataframe for numerous later uses

In [62]:
# List to dataframe

df = pd.DataFrame(col_cleaned)
df.head()

Unnamed: 0,0
0,Alabama
1,Auburn
2,Baylor
3,Uc Berkeley
4,Boston College


In [63]:
df.to_csv('All_Collections.csv', index = False)

## Well done!

## A fasterrr way
tst = selh.css('.school-menu a::text').extract()  #::text extract content??
tst