# An example of a Python crawler
## Crawling and storing data from r/dailyprogrammer 👻

### Import libraries

In [1]:
import pandas as pd
import numpy as np
from bs4 import  BeautifulSoup
from fake_useragent import UserAgent
import requests
import csv
import time
from IPython.core.display import HTML
import xml
import mysql.connector
from datetime import datetime
from django.db import DataError

### Creating a Database

#### Setup a connection

In [2]:
import mysql.connector

mydb = mysql.connector.connect(
  host="localhost",
  user="root",
  passwd="",
  database='DailyProgrammerDB',
)

mycursor = mydb.cursor()


#### Creating the DB

In [133]:
mycursor = mydb.cursor()

#mycursor.execute("CREATE DATABASE DailyProgrammerDB")
mydb.connect(database='DailyProgrammerDB')
mycursor = mydb.cursor()
mycursor.execute("""
CREATE TABLE Exercises
(id int NOT NULL ,
 url             text NOT NULL ,
 popularity_index decimal(18,0) NOT NULL ,
 post_date        date NOT NULL ,
 description     text NOT NULL ,
 title           varchar(255) NOT NULL ,
 difficulty       char(10) NULL 
)""")

### Creating a temporary dataframe

In [3]:
exerciseDF = pd.DataFrame(columns=['id','url','title','description','popularity_index','post_date','difficulty'])
exerciseDF

Unnamed: 0,id,url,title,description,popularity_index,post_date,difficulty


### Getting the page

In [5]:
url = 'https://old.reddit.com/r/dailyprogrammer/'
# Headers to mimic a browser visit
ua = UserAgent()
header = {'User-Agent':str(ua.chrome)}
# Getting html content from url
page = requests.get(url, headers=header)


### Visualizing the html content in the response

In [16]:
HTML(page.text)

### Getting Started with BeautifulSoup

In [6]:
pageCount = 10
url = 'https://old.reddit.com/r/dailyprogrammer/'
ua = UserAgent()
header = {'User-Agent':str(ua.chrome)}
validURLs = []
for i in range(pageCount):
    page = requests.get(url, headers=header)
    soup = BeautifulSoup(page.text, 'html.parser')
    '''
    Get posts' urls
    '''
    
    for link in soup.find_all('a'):
        href = link.get('href')
        try:
            if '/r/dailyprogrammer/comments/' in href:
                if (href[:5] == 'https'):
                    validURLs.append(href)
                else:
                    href = 'https://www.reddit.com%s'%href
                    validURLs.append(href)
        except TypeError:
            pass
    '''
    Get next page's link
    '''
    next_span = soup.find_all('span',class_='next-button')[0]
    next_link = [j for j in next_span.children][0]
    next_url = next_link.get('href')
    url = next_url
print(len(validURLs))

547


In [7]:
for index,url in enumerate(validURLs[200:300]):
    print(url)
    tinypage = requests.get(url,headers=header)
    #print(HTML(tinypage.text))
    tinysoup = BeautifulSoup(tinypage.text,'html.parser')
    for i in tinysoup.find_all('div',attrs={'data-test-id' : True}):
        if i['data-test-id']=='post-content':
            children = [j for j in i.children]
            title = children[2].find_all('h2')[0].text
            print(title)
            post_date = title[1:11]
            problem_title = title[(title.find(']',20))+1:].strip()
            difficulty = title[(title.find('[',12))+1:(title.find(']',20))]
            #print(post_date)
            #print(difficulty)
            #print(problem_title)
            description = children[3].getText(separator=u'\n')
            total_content= children[3]
            popularity = [i for i in children[4].children][1].findAll('span')[0].text
            popularity_index = int(popularity.split(' ')[0][:-1])/100
            data = {
                "id" : index,
                "popularity_index":popularity_index,
                "description":description,
                "title":problem_title,
                "post_date":post_date,
                "url":url,
                "difficulty":difficulty
            }
            exerciseDF = exerciseDF.append(data, ignore_index=True)
    


https://www.reddit.com/r/dailyprogrammer/comments/7b5u96/20171106_challenge_339_easy_fixedlength_file/
[2017-11-06] Challenge #339 [Easy] Fixed-length file processing
https://old.reddit.com/r/dailyprogrammer/comments/7b5u96/20171106_challenge_339_easy_fixedlength_file/
https://www.reddit.com/r/dailyprogrammer/comments/7aae56/20171102_challenge_338_intermediate_maze_turner/
[2017-11-02] Challenge #338 [Intermediate] Maze turner
https://old.reddit.com/r/dailyprogrammer/comments/7aae56/20171102_challenge_338_intermediate_maze_turner/
https://www.reddit.com/r/dailyprogrammer/comments/79npf9/20171030_challenge_338_easy_what_day_was_it_again/
[2017-10-30] Challenge #338 [Easy] What day was it again?
https://old.reddit.com/r/dailyprogrammer/comments/79npf9/20171030_challenge_338_easy_what_day_was_it_again/
https://www.reddit.com/r/dailyprogrammer/comments/78twyd/20171026_challenge_337_intermediate_scrambled/
[2017-10-26] Challenge #337 [Intermediate] Scrambled images
https://old.reddit.com/r/

[17-08-23] Challenge #328 [Intermediate] Pyramid sliding
https://old.reddit.com/r/dailyprogrammer/comments/6vi9ro/170823_challenge_328_intermediate_pyramid_sliding/
https://www.reddit.com/r/dailyprogrammer/comments/6v29zk/170821_challenge_328_easy_latin_squares/
[17-08-21] Challenge #328 [Easy] Latin Squares
https://old.reddit.com/r/dailyprogrammer/comments/6v29zk/170821_challenge_328_easy_latin_squares/
https://www.reddit.com/r/dailyprogrammer/comments/6uifb0/20170818_challenge_327_hard_calculating_costas/
[2017-08-18] Challenge #327 [Hard] Calculating Costas Arrays
https://old.reddit.com/r/dailyprogrammer/comments/6uifb0/20170818_challenge_327_hard_calculating_costas/
https://www.reddit.com/r/dailyprogrammer/comments/6t0zua/20170811_challenge_326_hard_multifaceted_alphabet/
[2017-08-11] Challenge #326 [Hard] Multifaceted alphabet blocks
https://old.reddit.com/r/dailyprogrammer/comments/6t0zua/20170811_challenge_326_hard_multifaceted_alphabet/
https://www.reddit.com/r/dailyprogrammer/

In [22]:
len(exerciseDF)

110

### Insert to Table 🤖

In [30]:
for index in exerciseDF.index[:]:
    exercise = exerciseDF.iloc[index]
    sql = "INSERT INTO exercises (id,title, description, popularity_index, difficulty, post_date, url) VALUES (%s,%s, %s, %s, %s, %s, %s)"
    val = (exercise.id,exercise.title,exercise.description,str(exercise.popularity_index),exercise.difficulty,exercise.post_date,exercise.url)
    try:
        mycursor.execute(sql, val)
        mydb.commit()
    except :
        pass
        #exercise.post_date = datetime.today().strftime('%Y-%m-%d')
        #val = (exercise.id,exercise.title,exercise.description,str(exercise.popularity_index),exercise.post_date,exercise.url)
        #mycursor.execute(sql, val)
        #mydb.commit()

<class 'django.db.utils.DataError'>
