<a href="https://colab.research.google.com/github/ahmedlila/Web-Scraping-Notebooks/blob/main/CS230%20-%20DL%20Projects%20Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import Libraries Needed 

In [None]:
pip install validators

Collecting validators
  Downloading validators-0.19.0.tar.gz (30 kB)
Building wheels for collected packages: validators
  Building wheel for validators (setup.py) ... [?25l[?25hdone
  Created wheel for validators: filename=validators-0.19.0-py3-none-any.whl size=19553 sha256=063f36507b4e0dbc15c5a85f28b6ba61f225abb118153df85067a7b811c218d5
  Stored in directory: /root/.cache/pip/wheels/fe/5d/69/ff53a908b9f14fb7730a58fdede0fac4cdc99ef3624ec76d05
Successfully built validators
Installing collected packages: validators
Successfully installed validators-0.19.0


In [None]:
from collections import Counter
from urllib.parse import urljoin
from urllib.request import urlopen

import pandas as pd
import validators
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer
from tabulate import tabulate
from validators import ValidationFailure

### Helpers

In [None]:
# Function source code: https://miguendes.me/how-to-check-if-a-string-is-a-valid-url-in-python
def is_string_an_url(url_string: str) -> bool:
    """Report whether `url_string` passes `validators.url`.

    Returns False when the validator yields a `ValidationFailure`; otherwise
    the validator's own result is handed back unchanged.
    """
    outcome = validators.url(url_string)
    return False if isinstance(outcome, ValidationFailure) else outcome

In [None]:
base_directory = "https://cs230.stanford.edu/past-projects/"

# Fetch the listing page; the context manager closes the HTTP response as soon
# as BeautifulSoup has consumed it.
with urlopen(base_directory) as response:
    bsObj = BeautifulSoup(response, features="html.parser")

project_links, project_names = [], []

# Project names are taken from <strong> tags; the candidate link is the first
# <a> tag that follows the name as a sibling.
for title_tag in bsObj.find_all("strong"):
    first_link = title_tag.find_next_sibling("a")
    if first_link is None:  # name with no link after it
        continue

    # Only the first link is inspected, and it is kept only if it is the report.
    if first_link.get_text() != 'report':
        continue

    path = first_link.get('href')
    if not path:  # anchor without a usable target
        continue

    # Absolute URLs are kept untouched. Relative ones are resolved against the
    # listing page so that "../" segments are collapsed rather than being
    # concatenated verbatim into the link.
    if not is_string_an_url(path):
        path = urljoin(base_directory, path)

    project_links.append(path)
    project_names.append(title_tag.get_text())

In [None]:
# TEST: spot-check one scraped report link (the entry at index 3)
project_links[3]

'http://cs230.stanford.edu/projects_fall_2021/reports/102730335.pdf'

In [None]:
# Create the DataFrame: one row per scraped report (name + link)
df = pd.DataFrame(
    data={
        'Project Name': project_names,
        'Project Link': project_links,
    }
)

In [None]:
# Drop rows that are exact duplicates (same name and same link)
df = df.drop_duplicates()
# TEST: displays the rows still flagged as duplicates (expected: none)
df[df.duplicated()]

Unnamed: 0,Project Name,Project Link


In [None]:
# Render the first rows as a psql-style text table
preview_table = tabulate(df.head(), headers='keys', tablefmt='psql')
print(preview_table)

+----+--------------------------------------------------------------------------------------------------+--------------------------------------------------------------------+
|    | Project Name                                                                                     | Project Link                                                       |
|----+--------------------------------------------------------------------------------------------------+--------------------------------------------------------------------|
|  0 | Classification of Medical Imagery using DL (?)                                                   | http://cs230.stanford.edu/projects_fall_2021/reports/102543599.pdf |
|  1 | In Learning we Truss: Structural Design Optimization Using Deep Learning                         | http://cs230.stanford.edu/projects_fall_2021/reports/102699970.pdf |
|  2 | Predicting Regional US COVID Risk Using Publicly Available Satellite Images                      | http://cs230.stanfo

In [None]:
# The 100 most frequent whitespace-separated words across all project names
# (counting is case-sensitive)
all_title_words = " ".join(df["Project Name"]).split()
Counter(all_title_words).most_common(100)

[('Deep', 251),
 ('for', 250),
 ('Learning', 213),
 ('of', 206),
 ('with', 155),
 ('and', 143),
 ('in', 142),
 ('using', 128),
 ('Neural', 120),
 ('to', 117),
 ('Using', 93),
 ('Predicting', 93),
 ('from', 90),
 ('the', 88),
 ('Networks', 85),
 ('Classification', 81),
 ('on', 77),
 ('Detection', 73),
 ('Image', 69),
 ('Prediction', 56),
 ('Recognition', 53),
 ('Images', 50),
 ('A', 49),
 ('a', 47),
 ('Network', 45),
 ('Convolutional', 37),
 ('Generation', 35),
 ('Segmentation', 32),
 ('Data', 30),
 ('Transfer', 30),
 ('Music', 29),
 ('Adversarial', 28),
 ('Approach', 27),
 ('Stock', 26),
 ('Generative', 26),
 ('Satellite', 25),
 ('CNN', 25),
 ('Object', 24),
 ('Automated', 23),
 ('Style', 23),
 ('Generating', 22),
 ('based', 22),
 ('Language', 22),
 ('learning', 21),
 ('detection', 21),
 ('Identifying', 20),
 ('Analysis', 20),
 ('Text', 20),
 ('data', 20),
 ('3D', 20),
 ('Detecting', 20),
 ('Imagery', 19),
 ('deep', 19),
 ('Human', 19),
 ('neural', 19),
 ('Video', 19),
 ('-', 18),
 ('v

### Filter and Randomizer 

**Most repeated words you can search with:**
 - Classification
 - Recognition 
 - Prediction
 - Classification
 - Identification
 - GANs
 - CNNs
 - RNNs 
 - LSTM
 - Image
 - Video
 - Text
 - Sentiment
 - Stock
 - Facial 
 - Medical 
 - MRI
 - Automatic 





#### <font color='green'>I- Project Filter</font>

In [None]:
def filter_projects(word: str):
    """
    Print every project whose name contains the stem of the word given by user.

    The word is lower-cased and reduced to its Porter stem, then matched as a
    substring of each lower-cased entry in the 'Project Name' column of the
    module-level DataFrame `df`.

    Arguments:
    word -- string, we search about e.g.(detection).

    Returns:
    None -- the matching rows are printed as a psql-style table.
    """
    # Stem once up front: the result does not depend on the row being tested.
    stem = PorterStemmer().stem(word.lower())
    matches = df['Project Name'].apply(lambda name: stem in name.lower())
    print(tabulate(df[matches], headers='keys', tablefmt='psql'))

In [None]:
# TEST: print all projects whose name matches the search word '3D'
filter_projects(word='3D')

+------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+
|      | Project Name                                                                                                                    | Project Link                                                                         |
|------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------|
|   42 | Mesh: Generating 3D Renderings from 2D Images                                                                                   | http://cs230.stanford.edu/projects_fall_2021/reports/103136976.pdf                   |
|  138 | Improving Generalization Results for 3D Point Cloud Data Reconstruction From 2D Images 

#### <font color='green'>II- Project Randomizer</font>

In [None]:
def random_project(word: str, k: int=3):
    """
    Print up to k randomly chosen projects whose name contains the stem of the
    word given by user.

    The word is lower-cased and reduced to its Porter stem, then matched as a
    substring of each lower-cased entry in the 'Project Name' column of the
    module-level DataFrame `df`.

    Arguments:
    word -- string, we search about e.g.(detection).
    k -- number of projects to show; if fewer than k projects match, all
         matching projects are shown.

    Returns:
    None -- the sampled rows are printed as a psql-style table.
    """
    # Stem once up front: the result does not depend on the row being tested.
    stem = PorterStemmer().stem(word.lower())
    matches = df['Project Name'].apply(lambda name: stem in name.lower())
    candidates = df[matches]

    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (sampling without replacement), so cap the request at the match count.
    sample_size = min(k, len(candidates))
    print(tabulate(candidates.sample(n=sample_size), headers='keys', tablefmt='psql'))

In [None]:
# TEST: print 2 randomly sampled projects matching the search word 'MRI'
random_project(word='MRI', k=2)

+-----+-------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|     | Project Name                                                                        | Project Link                                                                        |
|-----+-------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------|
| 909 | Through Thick and Thin: MRI Super-Resolution Using a Generative Adversarial Network | https://cs230.stanford.edu/past-projects/../projects_fall_2018/reports/12449264.pdf |
| 126 | Brain Aneurysm Classification/Segmentation from MRI Images using MONAI Framework    | http://cs230.stanford.edu/projects_winter_2021/reports/70747002.pdf                 |
+-----+-------------------------------------------------------------------------------------+-------