In [10]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import os.path
import sys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

In [11]:
# importing the necessary packages
import requests
from bs4 import BeautifulSoup
import re
import json
import os

def getReferee(soup1):
    """Record the referee text in the global ``match_data`` dict.

    Every ``<div class="referee">`` found in ``soup1`` is visited in document
    order and its stripped text is written to ``match_data['Referee']``, so
    the last such element wins. Nothing is written when none are found.
    """
    for node in soup1.find_all('div', class_="referee"):
        match_data['Referee'] = node.get_text().strip()

def getStadiumName(soup1):
    """Record the stadium text in the global ``match_data`` dict.

    Each ``<div class="stadium">`` in ``soup1`` overwrites
    ``match_data['Stadium']`` with its stripped text, so the value kept is
    that of the last matching element. No key is set when there is no match.
    """
    for node in soup1.find_all('div', class_="stadium"):
        match_data['Stadium'] = node.get_text().strip()

def getFullTimeScore(soup1):
    """Store the full-time score in the global ``match_data`` dict.

    For each ``<div class="score fullTime">`` the stripped text is split on
    ``'-'``; the part before the first dash goes to
    ``match_data['TotalHomeGoals']`` and the next part to
    ``match_data['TotalAwayGoals']``. Both values stay strings exactly as
    they appear in the page (no int conversion, no per-part stripping).
    """
    for node in soup1.find_all('div', class_="score fullTime"):
        parts = node.get_text().strip().split('-')
        match_data['TotalHomeGoals'] = parts[0]
        match_data['TotalAwayGoals'] = parts[1]

def getTeamNames(soup1):
    """Store the two team names in the global ``match_data`` dict.

    The text of every ``<span class="long">`` in ``soup1`` is collected in
    document order. The first entry is written to ``match_data['HomeTeam']``
    and the second to ``match_data['AwayTeam']``.

    Raises:
        IndexError: if fewer than two such spans are present.
    """
    names = [span.get_text() for span in soup1.find_all('span', class_='long')]
    match_data['HomeTeam'] = names[0]
    match_data['AwayTeam'] = names[1]

def getHomeTeamMatchEvents(soup1):
    """Classify the home side's match events and append them to ``match_data``.

    Walks every ``div.home`` inside each
    ``<div class="matchEvents matchEventsContainer">`` and inspects each
    ``<div aria-live="polite">`` event. The event text has all whitespace
    removed and is then routed by keyword, first match wins:

    * contains ``"Goal"``    -> ``getGoals``   -> ``match_data['HomeGoalScorer']``
    * contains ``"Card"``    -> ``getFouls``   -> ``match_data['HomeTeamFouls']``
    * contains ``"penalty"`` -> ``getPenalty`` -> ``match_data['HomeGoalScorer']``

    Anything else prints ``"No match"``. The target lists must already exist
    in the global ``match_data``.
    """
    for container in soup1.find_all('div', class_="matchEvents matchEventsContainer"):
        for side in container.find_all('div', class_="home"):
            for event in side.find_all('div', {'aria-live': 'polite'}):
                # Collapse all whitespace so the parsers see one compact string.
                txt = re.sub(r"\s+", "", event.text)
                # Plain substring tests. The earlier regexes ("^.*Goal*",
                # "^.*Card*", "^.*penalty*") made the final letter optional,
                # so any text containing "Car" (e.g. the surname "Carrick")
                # was wrongly filed as a card.
                if "Goal" in txt:
                    match_data['HomeGoalScorer'].append(getGoals(txt))
                elif "Card" in txt:
                    match_data['HomeTeamFouls'].append(getFouls(txt))
                elif "penalty" in txt:
                    match_data["HomeGoalScorer"].append(getPenalty(txt))
                else:
                    print("No match")

def getAwayTeamMatchEvents(soup1):
    """Classify the away side's match events and append them to ``match_data``.

    Walks every ``div.away`` inside each
    ``<div class="matchEvents matchEventsContainer">`` and inspects each
    ``<div aria-live="polite">`` event. The event text has all whitespace
    removed and is then routed by keyword, first match wins:

    * contains ``"Goal"``    -> ``getGoals``   -> ``match_data['AwayGoalScorer']``
    * contains ``"Card"``    -> ``getFouls``   -> ``match_data['AwayTeamFouls']``
    * contains ``"penalty"`` -> ``getPenalty`` -> ``match_data['AwayGoalScorer']``

    Anything else prints ``"No match"``. The target lists must already exist
    in the global ``match_data``.
    """
    for container in soup1.find_all('div', class_="matchEvents matchEventsContainer"):
        for side in container.find_all('div', class_="away"):
            for event in side.find_all('div', {'aria-live': 'polite'}):
                # Collapse all whitespace so the parsers see one compact string.
                txt = re.sub(r"\s+", "", event.text)
                # Plain substring tests. The earlier regexes ("^.*Goal*",
                # "^.*Card*", "^.*penalty*") made the final letter optional,
                # so any text containing "Car" (e.g. the surname "Carrick")
                # was wrongly filed as a card.
                if "Goal" in txt:
                    match_data['AwayGoalScorer'].append(getGoals(txt))
                elif "Card" in txt:
                    match_data['AwayTeamFouls'].append(getFouls(txt))
                elif "penalty" in txt:
                    match_data["AwayGoalScorer"].append(getPenalty(txt))
                else:
                    print("No match")

def getGoals(txt):
    """Parse a whitespace-free goal event string into a goal record.

    The player name is everything before the first digit, and the minute is
    the first run of digits. Splitting on the first digit (instead of taking
    fixed token positions) keeps names containing ``-`` or ``'`` intact:
    "SonHeung-Min45'Goal" previously produced Player "Son Heung" and
    Time "Min".

    Args:
        txt: Event text with all whitespace removed, e.g. "HarryKane45'Goal".

    Returns:
        dict with keys
        "Player": the name with a space re-inserted before every ASCII
                  capital that directly follows a letter,
        "Time":   the first run of digits as a string (for added time such
                  as "90+3" only "90" is kept, as before),
        "Type":   always "Goal".

    Raises:
        ValueError: if ``txt`` contains no digits, so no minute can be read.
    """
    minute = re.search(r"\d+", txt)
    if minute is None:
        raise ValueError("no minute found in goal event: %r" % (txt,))
    name = txt[:minute.start()]
    return {
        # Undo the whitespace removal at letter -> Capital boundaries.
        "Player": re.sub(r"(?<=[^\W\d_])(?=[A-Z])", " ", name),
        "Time": minute.group(),
        "Type": "Goal",
    }

def getFouls(txt):
    """Parse a whitespace-free card event string into a foul record.

    The player name is everything before the first digit, the minute is the
    first run of digits, and the card label is the first run of letters after
    that minute. The label was previously read from a fixed token position,
    so a card shown in added time ("90+2'YellowCard") picked up "2" and
    ended with an empty Card. Names containing ``-`` or ``'`` were also
    split in the wrong place.

    Args:
        txt: Event text with all whitespace removed,
             e.g. "EricDier33'YellowCard".

    Returns:
        dict with keys
        "Player": the name with a space re-inserted before every ASCII
                  capital that directly follows a letter,
        "Time":   the first run of digits as a string (added-time suffix
                  dropped, as before),
        "Card":   the card label split at capitals ("Yellow Card"), or ""
                  when no letters follow the minute.

    Raises:
        ValueError: if ``txt`` contains no digits, so no minute can be read.
    """
    minute = re.search(r"\d+", txt)
    if minute is None:
        raise ValueError("no minute found in card event: %r" % (txt,))
    name = txt[:minute.start()]
    # Skip any added-time digits and punctuation to reach the label itself.
    label = re.search(r"[^\W\d_]+", txt[minute.end():])
    card = label.group() if label else ""
    return {
        "Player": re.sub(r"(?<=[^\W\d_])(?=[A-Z])", " ", name),
        "Time": minute.group(),
        "Card": " ".join(re.findall('[A-Z][^A-Z]*', card)),
    }

def getPenalty(txt):
    """Parse a whitespace-free penalty event string into a penalty record.

    The player name is everything before the first digit, and the minute is
    the first run of digits. Splitting on the first digit (instead of taking
    fixed token positions) keeps names containing ``-`` or ``'`` intact:
    "SonHeung-Min45'penalty" previously produced Player "Son Heung" and
    Time "Min".

    Args:
        txt: Event text with all whitespace removed,
             e.g. "HarryKane45'penalty".

    Returns:
        dict with keys
        "Player": the name with a space re-inserted before every ASCII
                  capital that directly follows a letter,
        "Time":   the first run of digits as a string (for added time such
                  as "90+3" only "90" is kept, as before),
        "Type":   always "Penalty".

    Raises:
        ValueError: if ``txt`` contains no digits, so no minute can be read.
    """
    minute = re.search(r"\d+", txt)
    if minute is None:
        raise ValueError("no minute found in penalty event: %r" % (txt,))
    name = txt[:minute.start()]
    return {
        # Undo the whitespace removal at letter -> Capital boundaries.
        "Player": re.sub(r"(?<=[^\W\d_])(?=[A-Z])", " ", name),
        "Time": minute.group(),
        "Type": "Penalty",
    }

def getHomeAssists(soup1):
    """Append the home side's assists to ``match_data['HomeTeamAssist']``.

    Walks every ``div.home`` inside each ``<div class="matchAssistsContainer">``
    and, for each ``<div class="event">``, removes all whitespace from its
    text and passes the result to ``getAssists``. Events whose text is empty
    after that removal are skipped and reported with ``"No match"``.

    The list ``match_data['HomeTeamAssist']`` must already exist in the
    global ``match_data``.
    """
    for container in soup1.find_all('div', class_="matchAssistsContainer"):
        for side in container.find_all('div', class_="home"):
            for event in side.find_all('div', class_="event"):
                txt = re.sub(r"\s+", "", event.text)
                # re.sub always returns a str, so the former `is not None`
                # test could never fail; test for emptiness instead.
                if txt:
                    match_data['HomeTeamAssist'].append(getAssists(txt))
                else:
                    print("No match")
                    

def getAwayAssists(soup1):
    """Append the away side's assists to ``match_data['AwayTeamAssist']``.

    Walks every ``div.away`` inside each ``<div class="matchAssistsContainer">``
    and, for each ``<div class="event">``, removes all whitespace from its
    text and passes the result to ``getAssists``. Events whose text is empty
    after that removal are skipped and reported with ``"No match"``.

    The list ``match_data['AwayTeamAssist']`` must already exist in the
    global ``match_data``.
    """
    for container in soup1.find_all('div', class_="matchAssistsContainer"):
        for side in container.find_all('div', class_="away"):
            for event in side.find_all('div', class_="event"):
                txt = re.sub(r"\s+", "", event.text)
                # re.sub always returns a str, so the former `is not None`
                # test could never fail; test for emptiness instead.
                if txt:
                    match_data['AwayTeamAssist'].append(getAssists(txt))
                else:
                    print("No match")

                    
def getAssists(txt):
    """Parse a whitespace-free assist string into a player plus minute list.

    The player name is everything before the first digit, and the minutes
    are the digit runs that follow. Previously every token after the first
    was put into "Time", so a name containing ``-`` or ``'`` leaked name
    fragments into the list, any trailing word was recorded as a time, and
    the added-time part of "90+3" appeared as a separate minute "3".

    Args:
        txt: Assist text with all whitespace removed,
             e.g. "DeleAlli12',45'".

    Returns:
        dict with keys
        "Time":   list of minute strings in order of appearance; a digit run
                  directly after "+" (added time) is not listed on its own,
        "Player": the name with a space re-inserted before every ASCII
                  capital that directly follows a letter.
        Empty or digit-free input gives an empty "Time" list rather than an
        error.
    """
    first_digit = re.search(r"\d", txt)
    split_at = first_digit.start() if first_digit else len(txt)
    name, minutes = txt[:split_at], txt[split_at:]
    return {
        # The lookbehind rejects digits preceded by "+" or by another digit,
        # so "90+13" contributes "90" only (not "13" or "3").
        'Time': re.findall(r"(?<![\d+])\d+", minutes),
        'Player': re.sub(r"(?<=[^\W\d_])(?=[A-Z])", " ", name),
    }


def getDictionary(match_id,
                  output_dir="/Users/ankitmittal/Desktop/Thesis/Project/data/matchdata/",
                  timeout=30):
    """Scrape one match page and dump the collected data to ``<match_id>.json``.

    Downloads ``https://www.premierleague.com/match/<match_id>``, parses it
    with BeautifulSoup (``html5lib``) and runs every extractor on it. The
    extractors write into the module-level ``match_data`` dict, which the
    caller must have initialised (including the six event lists) before
    calling this function. The dict is then serialised as JSON.

    Args:
        match_id: Match identifier as a string; appended to the match URL
            and used as the output file stem.
        output_dir: Directory the JSON file is written to. Defaults to the
            path this function previously hard-coded. It is created if it
            does not exist.
        timeout: Seconds to wait for the HTTP response before giving up, so
            one stalled request cannot hang the whole scrape.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status,
            instead of parsing an error page as if it were a match.
        requests.Timeout: if the response does not arrive within ``timeout``.
    """
    url = "https://www.premierleague.com/match/" + match_id

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup1 = BeautifulSoup(response.content, 'html5lib')

    # Each extractor fills its own keys of the global match_data.
    for extract in (
        getReferee,
        getStadiumName,
        getFullTimeScore,
        getTeamNames,
        getHomeTeamMatchEvents,
        getAwayTeamMatchEvents,
        getHomeAssists,
        getAwayAssists,
    ):
        extract(soup1)

    os.makedirs(output_dir, exist_ok=True)
    filename = match_id + '.json'
    with open(os.path.join(output_dir, filename), "w") as outfile:
        json.dump(match_data, outfile)

In [3]:
# Start Chrome through the local chromedriver binary and open the results
# listing page whose fixtures are scraped in the following cells.
chromedriver_path = '/Users/ankitmittal/Downloads/chromedriver'
results_url = "https://www.premierleague.com/results?co=1&se=54&cl=-1&team=FIRST"

driver = webdriver.Chrome(chromedriver_path)
driver.get(results_url)

In [4]:
# Scroll to the bottom repeatedly until the page height stops changing.
# Each pass scrolls, waits 10 seconds, scrolls again, and compares the two
# reported heights.
scroll_to_bottom_js = "window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;"

lenOfPage = driver.execute_script(scroll_to_bottom_js)
while True:
    lastCount = lenOfPage
    time.sleep(10)
    lenOfPage = driver.execute_script(scroll_to_bottom_js)
    if lastCount == lenOfPage:
        break

In [5]:
# Collect every fixture container on the fully scrolled results page.
# find_elements(By.CLASS_NAME, ...) replaces find_elements_by_class_name,
# which was removed in Selenium 4.3; this form works on Selenium 3 and 4.
IPs = driver.find_elements(By.CLASS_NAME, 'matchFixtureContainer')

In [6]:
# Read the match id carried by each fixture container's
# "data-comp-match-item" attribute, in page order.
match_ids = [ip.get_attribute("data-comp-match-item") for ip in IPs]

In [7]:
# Sanity check: number of match ids collected (the recorded run showed 380).
len(match_ids)

380

In [8]:
# Settings for the article download loop.
# Base match URL; a match id is appended to it for each page load.
url = "https://www.premierleague.com/match/"
# Upper bound, in seconds, passed to WebDriverWait while waiting for the article.
delay = 3 # seconds
# Handle to the current stdout so it can be restored after being redirected.
orig_stdout = sys.stdout

In [10]:
# Save the article text of every match page to "<match id>.txt".
for i in match_ids:
    driver.get(url + i)
    try:
        # Wait (up to `delay` seconds) for the article container to appear.
        element_present = EC.presence_of_element_located((By.CLASS_NAME, 'standardArticle'))
        WebDriverWait(driver, delay).until(element_present)
    except TimeoutException:
        # Best effort: carry on and write whatever the page exposes, if anything.
        print("Timed out waiting for page to load")
    # A dedicated name is used so the fixture list in `IPs` is not clobbered.
    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which
    # was removed in Selenium 4.3.
    article_nodes = driver.find_elements(By.XPATH, "/html/body/main/div/section[2]/div[2]/div[2]/div[2]/section[1]/div/div[1]/div[3]/section/div/div/div/div/div")
    filename = i + ".txt"
    # As before, no file is created when nothing matched.
    if article_nodes:
        # Open the file once per match. It used to be reopened in 'w' mode
        # for every element, so each element overwrote the previous one and
        # only the last survived. Writing directly also avoids swapping
        # sys.stdout, which stayed redirected if an error occurred mid-write.
        with open(os.path.join("/Users/ankitmittal/Desktop/Thesis/Project/data/articles", filename), 'w', encoding='utf-8') as f:
            for node in article_nodes:
                f.write(node.text + "\n")

In [12]:
# Scrape every collected match. The extractor functions write into the
# module-level `match_data`, so it is rebuilt with fresh, empty event lists
# before each match is processed.
list_fields = (
    'HomeGoalScorer',
    'AwayGoalScorer',
    'HomeTeamFouls',
    'AwayTeamFouls',
    'HomeTeamAssist',
    'AwayTeamAssist',
)

for match_id in match_ids:
    match_data = {field: [] for field in list_fields}
    getDictionary(match_id)

In [9]:
# Scratch cell cleared: it held the incomplete expression `soup1.`, which is
# a SyntaxError, and `soup1` exists only as a local inside getDictionary.