Skip to content

This repo shows how to scrape different types of data

Notifications You must be signed in to change notification settings

ayoubzulfiqar/go-scraper

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

7 Commits
 
 
 
 
 
 
 
 
 
 
 
 

Repository files navigation

Scraping with Go

Basic of HTML Elements

1. Search for tags

To select elements by tag, pass the tag name as the selector string. In Go, single quotes denote rune literals, so the selector must be written as a string in double quotes " ".

for anchor tag
("a")

for paragraph tag
("p")

same for other tags....
2. Search for all div attributes

id is an attribute associated with the div tag; the selector below matches every div that has an id attribute

- example
("div[id]")

general
("html-tag[html-attribute]")
3. Search for all name attributes

For example, in ("div[id=comment]") the value of the id attribute is comment, so to search for all elements with that id we have to write it like this

("#comment")
4. Search for all elements based on class

For example, to match elements such as <div class="writer">

(".writer")
5. Search for all elements that have the same attribute set
("*[html-attribute]")

Scraping in Golang Using Colly and exporting into CSV File

Example - 1

package main

import (
	"encoding/csv"
	"fmt"
	"log"
	"os"
	"time"

	"github.com/gocolly/colly"
)

// Quotes is the data model for one scraped quotation: it lists the pieces
// of each page element that the scraper extracts.
type Quotes struct {
	AUTHOR, QUOTE string
}

// QuoteScrapper visits the BrainyQuote "top 100" page and writes one
// Author/Quote row to quote.csv for every element matching ".quoteContent".
//
// Failing to create the output file terminates the program. Request, row
// write and flush errors are reported on the log instead of being dropped.
func QuoteScrapper() {
	const (
		// url of the website that we want to scrape
		url = "https://www.brainyquote.com/top_100_quotes"
		// name of the CSV file to create - it can be anything you want
		fileName = "quote.csv"
	)
	fmt.Println("Starting Scraping....")

	// create (or truncate) the CSV file in the working directory
	file, err := os.Create(fileName)
	if err != nil {
		// log.Fatalf exits the process, so no return is needed afterwards.
		log.Fatalf("could not create file %q: %v", fileName, err)
	}
	defer file.Close()

	writer := csv.NewWriter(file)
	// Deferred calls run last-in-first-out, so the flush happens before
	// file.Close. csv.Writer reports buffered write failures only through
	// Error, hence the explicit check after Flush.
	defer func() {
		writer.Flush()
		if err := writer.Error(); err != nil {
			log.Printf("writing %q: %v", fileName, err)
		}
	}()

	// heading row of the CSV file
	if err := writer.Write([]string{"Author", "Quote"}); err != nil {
		log.Printf("writing header to %q: %v", fileName, err)
	}

	// Colly - initializing our collector
	c := colly.NewCollector()
	c.SetRequestTimeout(120 * time.Second)

	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting:", r.URL)
	})

	c.OnResponse(func(r *colly.Response) {
		fmt.Println("Got a response from", r.Request.URL)
	})

	c.OnError(func(r *colly.Response, e error) {
		fmt.Println("Got this error:", e)
	})

	c.OnHTML(".quoteContent", func(h *colly.HTMLElement) {
		quote := Quotes{
			AUTHOR: h.ChildText(".bq_fq_a"),
			QUOTE:  h.ChildText(".b-qt-qt"),
		}
		if err := writer.Write([]string{quote.AUTHOR, quote.QUOTE}); err != nil {
			log.Printf("writing row to %q: %v", fileName, err)
		}
	})

	if err := c.Visit(url); err != nil {
		log.Printf("visiting %s: %v", url, err)
	}
	fmt.Println("End of Era: ", url)
}

Example - 2

package main

import (
	"encoding/csv"
	"fmt"
	"log"
	"os"
	"time"

	"github.com/gocolly/colly"
)

// PRODUCTS is the data model for one scraped store item. Every field is
// kept as the raw text taken from the page.
type PRODUCTS struct {
	Name, Image, Price, Url, Discount string
}

// StoreScrapper visits the Jumia flash-sales page and writes one
// Name/Price/Url row to products.csv for every ".name" element found inside
// an element matching ".core".
//
// Failing to create the output file terminates the program. Request, row
// write and flush errors are reported on the log.
func StoreScrapper() {
	const (
		pageURL  = "https://www.jumia.com.ng/flash-sales/"
		fileName = "products.csv"
	)

	c := colly.NewCollector()
	c.SetRequestTimeout(120 * time.Second)

	fmt.Println("Starting Scraping....")

	file, err := os.Create(fileName)
	if err != nil {
		// log.Fatalf exits the process, so no return is needed afterwards.
		log.Fatalf("could not create file %q: %v", fileName, err)
	}
	defer file.Close()

	writer := csv.NewWriter(file)
	// Flush runs before file.Close (defers are LIFO); csv.Writer surfaces
	// buffered write failures only through Error.
	defer func() {
		writer.Flush()
		if err := writer.Error(); err != nil {
			log.Printf("writing %q: %v", fileName, err)
		}
	}()

	// The header must name the same three columns that each row carries.
	if err := writer.Write([]string{"Name", "Price", "Url"}); err != nil {
		log.Printf("writing header to %q: %v", fileName, err)
	}

	// Callbacks
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL)
	})

	c.OnResponse(func(r *colly.Response) {
		fmt.Println("Got a response from", r.Request.URL)
	})

	c.OnError(func(r *colly.Response, e error) {
		fmt.Println("Got this error:", e, r.StatusCode)
	})

	c.OnHTML(".core", func(e *colly.HTMLElement) {
		// Attribute names are passed bare: "href", not the CSS-like ".href",
		// which never matches and always yields an empty string.
		link := e.Attr("href")
		if link != "" {
			// Resolve against the page being scraped so relative and
			// absolute hrefs both produce a valid URL on the visited host.
			link = e.Request.AbsoluteURL(link)
		}

		e.ForEach(".name", func(_ int, h *colly.HTMLElement) {
			item := PRODUCTS{
				Name: h.Text,
				// NOTE(review): ".data-price" selects a CSS class named
				// data-price; confirm against the page markup.
				Price: e.ChildText(".data-price"),
				Url:   link,
			}
			// item.Image = e.ChildAttr(".img-C", ".data-src")
			// item.Discount = e.ChildText(".div.tag._dsct")

			if err := writer.Write([]string{item.Name, item.Price, item.Url}); err != nil {
				log.Printf("writing row to %q: %v", fileName, err)
			}
		})
	})

	if err := c.Visit(pageURL); err != nil {
		log.Printf("visiting %s: %v", pageURL, err)
	}
}

Scraping in Golang Using Colly and exporting into JSON File

package main

import (
	"encoding/json"
	"fmt"
	"os"

	"github.com/gocolly/colly"
)



// NEWS is the data model for one scraped news story. The struct tags set
// the lowercase keys used when a value is encoded as JSON.
type NEWS struct {
	TITLE string `json:"title"`
	LINKS string `json:"links"`
	DATE  string `json:"date"`
}

// NewsCrawlerServer visits the latest-stories page, collects the title, link
// and date of every ".latest-right" element found inside a
// ".writter-list-item-story" element, and writes the result to news.json.
//
// Errors are reported on standard error. If the collected data cannot be
// encoded or the file cannot be written, the function returns without
// printing the final story count.
func NewsCrawlerServer() {
	const (
		url        = "https://www.thenews.com.pk/latest-stories"
		outputFile = "news.json"
	)
	fmt.Println("Starting Scraping....")

	collector := colly.NewCollector()
	var data []NEWS

	collector.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting:", r.URL)
	})
	collector.OnResponse(func(r *colly.Response) {
		fmt.Println("Got a response from", r.Request.URL)
	})
	collector.OnError(func(r *colly.Response, e error) {
		fmt.Println("Got this error:", e)
	})

	collector.OnHTML(".writter-list-item-story", func(element *colly.HTMLElement) {
		element.ForEach(".latest-right", func(_ int, h *colly.HTMLElement) {
			data = append(data, NEWS{
				TITLE: h.ChildText(".open-section"),
				LINKS: h.ChildAttr(".open-section", "href"),
				DATE:  h.ChildText(".latestDate"),
			})
		})
	})

	// A failed visit is reported, but whatever was collected is still saved.
	if err := collector.Visit(url); err != nil {
		fmt.Fprintln(os.Stderr, "Could not visit", url, "-", err)
	}

	content, err := json.Marshal(data)
	if err != nil {
		fmt.Fprintln(os.Stderr, err.Error())
		// Without encoded content there is nothing meaningful to write.
		return
	}

	if err := os.WriteFile(outputFile, content, 0644); err != nil {
		fmt.Fprintln(os.Stderr, "Could not write", outputFile, "-", err)
		return
	}
	fmt.Println("NEWS ", len(data))
}

Scraping Table Data in Golang Using colly and Exporting into CSV

package main

import (
	"encoding/csv"
	"fmt"
	"log"
	"os"

	"github.com/gocolly/colly"
)


// PSX is the data model for one scraped table row. Every column is kept as
// the raw cell text.
type PSX struct {
	LDCP, SCRIP, OPEN, HIGH, LOW, CURRENT, VOLUME, CHANGE string
}

// StockTableCrawler visits the page at pageURL and, for every "tr" inside an
// element matching ".table-responsive", writes the text of the row's first
// eight "td" cells as one record to data.csv.
//
// Failing to create the output file terminates the program. Request, row
// write and flush errors are reported on the log. The completion message is
// printed only after the visit has returned without error.
func StockTableCrawler() {
	const (
		fName   = "data.csv"
		pageURL = "https://www.urdupoint.com/english/"
	)

	file, err := os.Create(fName)
	if err != nil {
		// log.Fatalf exits the process, so no return is needed afterwards.
		log.Fatalf("Could not create file, err: %q", err)
	}
	defer file.Close()

	writer := csv.NewWriter(file)
	// Flush runs before file.Close (defers are LIFO); csv.Writer surfaces
	// buffered write failures only through Error.
	defer func() {
		writer.Flush()
		if err := writer.Error(); err != nil {
			log.Printf("writing %s: %v", fName, err)
		}
	}()

	fmt.Println("Service Started....")

	collector := colly.NewCollector()

	collector.OnRequest(onRequest)
	collector.OnResponse(onResponse)
	collector.OnError(onError)
	collector.OnHTML(".table-responsive", func(e *colly.HTMLElement) {
		e.ForEach("tr", func(_ int, eh *colly.HTMLElement) {
			psxData := PSX{
				SCRIP:   eh.ChildText("td:nth-child(1)"),
				LDCP:    eh.ChildText("td:nth-child(2)"),
				OPEN:    eh.ChildText("td:nth-child(3)"),
				HIGH:    eh.ChildText("td:nth-child(4)"),
				LOW:     eh.ChildText("td:nth-child(5)"),
				CURRENT: eh.ChildText("td:nth-child(6)"),
				CHANGE:  eh.ChildText("td:nth-child(7)"),
				VOLUME:  eh.ChildText("td:nth-child(8)"),
			}
			record := []string{
				psxData.SCRIP,
				psxData.LDCP,
				psxData.OPEN,
				psxData.HIGH,
				psxData.LOW,
				psxData.CURRENT,
				psxData.CHANGE,
				psxData.VOLUME,
			}
			if err := writer.Write(record); err != nil {
				log.Printf("writing row to %s: %v", fName, err)
			}
		})
	})

	// The callbacks above only run during Visit, so completion can be
	// announced only once it has returned.
	if err := collector.Visit(pageURL); err != nil {
		log.Printf("visiting %s: %v", pageURL, err)
		return
	}
	fmt.Println("Scrapping Completed")
}

// onRequest is the request callback: it prints the URL of the request that
// is about to be made.
func onRequest(r *colly.Request) {
	target := r.URL
	fmt.Printf("Scraping: %v\n", target)
}

// onResponse is the response callback: it prints the status code of the
// response that was received.
func onResponse(r *colly.Response) {
	code := r.StatusCode
	fmt.Printf("Status: %v\n", code)
}

// onError is the error callback: it prints the URL of the failed request,
// the response value itself, and the error on a second line.
func onError(r *colly.Response, err error) {
	failedURL := r.Request.URL
	// The space before the newline mirrors the separator that a
	// Println-style call places between operands.
	fmt.Printf("Request URL: %v failed with response: %v \nError: %v\n", failedURL, r, err)
}

Scraping Data of Multiple Pages in Golang Using colly and Exporting into CSV

package main

import (
	"encoding/csv"
	"fmt"
	"log"
	"os"

	"github.com/gocolly/colly"
)

// Book is the data model for one scraped book; both fields hold raw text
// taken from the page.
type Book struct {
	Title, Price string
}

// Crawling is the entry point of the multi-page example. It registers the
// request, response and HTML callbacks on the shared collector and then
// starts the crawl.
func Crawling() {
	// Every callback is registered before Visiting starts the first request.
	Request()
	Response()
	HTML()
	NextPageHTML()
	Visiting()
}

// exportStarted records whether Data has already written to export.csv
// during this run of the program.
var exportStarted bool

// Data writes one record to export.csv.
//
// The first call in a process truncates the file and writes the
// TITLE/PRICE header before the record; later calls append, so every
// record passed during a run is kept rather than overwriting the previous
// one. Failing to open the file terminates the program; write errors are
// logged.
//
// NOTE(review): exportStarted is not synchronized; this assumes Data is
// never called concurrently - confirm if the collector is switched to
// asynchronous mode.
func Data(data []string) {
	const exportPath = "export.csv"

	// Start from an empty file on the first call, then append.
	flags := os.O_WRONLY | os.O_CREATE | os.O_TRUNC
	if exportStarted {
		flags = os.O_WRONLY | os.O_CREATE | os.O_APPEND
	}

	file, err := os.OpenFile(exportPath, flags, 0666)
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	writer := csv.NewWriter(file)
	if !exportStarted {
		headers := []string{"TITLE", "PRICE"}
		if err := writer.Write(headers); err != nil {
			log.Printf("writing header to %s: %v", exportPath, err)
			return
		}
	}
	if err := writer.Write(data); err != nil {
		log.Printf("writing record to %s: %v", exportPath, err)
		return
	}

	// csv.Writer buffers its output; Error reports any failure from the
	// writes or from the flush itself.
	writer.Flush()
	if err := writer.Error(); err != nil {
		log.Printf("flushing %s: %v", exportPath, err)
		return
	}
	exportStarted = true
}

// collector is the package-level colly collector shared by the callback
// registration functions and the visit calls in this file. It is created
// with the AllowedDomains option set to "books.toscrape.com".
var collector *colly.Collector = colly.NewCollector(
	colly.AllowedDomains("books.toscrape.com"),
)

// requesting is the request callback: it prints the URL that is about to
// be visited.
func requesting(r *colly.Request) {
	target := r.URL
	// Two spaces after the colon reproduce the Println-style separator that
	// follows the "Visiting: " label.
	fmt.Printf("Visiting:  %v\n", target)
}

// Request registers requesting as the OnRequest callback of the shared
// collector.
func Request() {
	collector.OnRequest(requesting)
}

// responding is the response callback: it prints the status code of the
// response that was received.
func responding(r *colly.Response) {
	code := r.StatusCode
	// Two spaces after the colon reproduce the Println-style separator that
	// follows the "Response: " label.
	fmt.Printf("Response:  %v\n", code)
}

// Response registers responding as the OnResponse callback of the shared
// collector.
func Response() {
	collector.OnResponse(responding)
}

// htmlElement is the HTML callback for a single product element. It reads
// the title from the "alt" attribute of the cover image and the price from
// the ".price_color" child, then hands both to Data as one record.
func htmlElement(e *colly.HTMLElement) {
	book := Book{
		Title: e.ChildAttr(".image_container img", "alt"),
		Price: e.ChildText(".price_color"),
	}
	Data([]string{book.Title, book.Price})
}

// HTML registers htmlElement as the callback for every element matching
// ".product_pod" on the shared collector.
func HTML() {
	collector.OnHTML(".product_pod", htmlElement)
	// The ".next > a" pagination callback is registered by NextPageHTML.
}

// pagination is the HTML callback for the "next page" link. It resolves the
// link's href against the current request URL and visits the result with
// the shared collector.
//
// An href that resolves to an empty URL is skipped, and a failed visit is
// logged instead of being silently dropped.
func pagination(e *colly.HTMLElement) {
	nextPage := e.Request.AbsoluteURL(e.Attr("href"))
	if nextPage == "" {
		// Nothing usable to follow.
		return
	}
	if err := collector.Visit(nextPage); err != nil {
		log.Printf("visiting next page %s: %v", nextPage, err)
	}
}

// NextPageHTML registers pagination as the callback for every element
// matching ".next > a" on the shared collector.
func NextPageHTML() {
	collector.OnHTML(".next > a", pagination)
}

// Visiting starts the crawl at the books.toscrape.com front page using the
// shared collector. A failed visit is logged instead of being silently
// dropped.
func Visiting() {
	const startURL = "https://books.toscrape.com/"
	if err := collector.Visit(startURL); err != nil {
		log.Printf("visiting %s: %v", startURL, err)
	}
}