In [1]:
import requests
from bs4 import BeautifulSoup 
import lxml
import csv
from datetime import datetime

import numpy as np


#### Taking the match date from the user

In [2]:
# Accept the match date from the user. Surrounding whitespace is stripped because
# strptime rejects it and the same string is later embedded in the request URL.
date_string = input("Enter a date in the format mm/dd/yyyy: ").strip()
print("You entered: ", date_string)

# Convert the date string to a date object
# (raises ValueError when the input does not match mm/dd/yyyy)
date_object = datetime.strptime(date_string, '%m/%d/%Y').date()
print("After changing it to data: ", date_object)

# Convert the date back to a string in mm-dd-yyyy form (used in the CSV file name)
new_date_string = date_object.strftime('%m-%d-%Y')
print("After converted back to string on the format %m-%d-%Y: ", new_date_string)


Enter a date in the format mm/dd/yyyy: 3/4/2023
You entered:  3/4/2023
After changing it to data:  2023-03-04
After converted back to string on the format %m-%d-%Y:  03-04-2023


In [3]:
# Output location, kept in module-level variables because several later cells use it.
# NOTE: file_path is an absolute, machine-specific directory; adjust it before running elsewhere.
file_path = "C:/Users/yusuf/Data Science/Projects/Web Scraping/Yallakor.com"
file_name = f"Match_Data_Scraped_{new_date_string}.csv"
path = f"{file_path}/{file_name}"

In [4]:
# Site root; it is prefixed to the match-detail hrefs collected by the scraping function
website_URL = "https://www.yallakora.com"

# Request the match-center page for the entered date.
# A timeout is passed because requests otherwise waits indefinitely for an unresponsive server.
page = requests.get(f"https://www.yallakora.com/match-center?date={date_string}", timeout=30)

# Display the response; <Response [200]> means the page was fetched successfully
page

<Response [200]>

### Function that extracts match details from a championship segment of the webpage

In [5]:
 """
 championship; championship block that conatains multiple matches 
 championship_title; the championship title
 match_details; 
 """
def scrape_single_match_details(championship, championship_title, base_url=None, match_date=None):
    """Build one dictionary of details for every match tag of a single championship.

    Parameters
    ----------
    championship : iterable
        The match tags of one championship (objects exposing ``find``; the
        tags returned expose ``find_next``, ``text`` and item access).
    championship_title : str
        Title stored on every returned row.
    base_url : str, optional
        Prefix added to each match-detail ``href``. When omitted, the
        notebook-level ``website_URL`` is used.
    match_date : optional
        Value stored under ``'Date'``. When omitted, the notebook-level
        ``date_object`` is used.

    Returns
    -------
    list of dict
        One dictionary per match with the keys ``Date``, ``Championship_Title``,
        ``Team_A``, ``Team_B``, ``Time``, ``Team_A_Score``, ``Team_B_Score``,
        ``Match_Status``, ``Chanel_Streaming``, ``Match_Detail_Link`` and
        ``Match_Week_Number``. An empty iterable gives an empty list.

    Notes
    -----
    Only the channel element is treated as optional. The other lookups are
    chained without ``None`` checks, so a match tag lacking one of them
    raises instead of producing a partial row.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if base_url is None:
        base_url = website_URL
    if match_date is None:
        match_date = date_object

    match_details = []

    # Loop through all matches to collect the details of the current championship
    for match_tag in championship:

        # Every team/score lookup starts from the same container, so resolve it once
        teams_data = match_tag.find("div", {"class": "allData"}
                                    ).find_next("div", {"class": "teamsData"})

        team_a = teams_data.find_next("div", {"class": "teams teamA"}).find_next("p").text
        team_b = teams_data.find_next("div", {"class": "teams teamB"}).find_next("p").text

        # The result block is read through four consecutive <span> lookups:
        # 1st -> team A score, 3rd -> team B score, 4th -> time (the 2nd is never read).
        first_span = teams_data.find_next("div", {"class": "MResult"}).find_next("span")
        second_span = first_span.find_next("span")
        third_span = second_span.find_next("span")
        fourth_span = third_span.find_next("span")

        team_a_score = first_span.text
        team_b_score = third_span.text
        kickoff_time = fourth_span.text

        match_status = match_tag.find("div", {"class": "matchStatus"}).find_next("span").text

        # getattr yields None when the page has no channel element for this match
        chanel_streaming = getattr(match_tag.find("div", {"class": "channel icon-channel"}), "text", None)

        match_detail_link = base_url + match_tag.find("div", {"class": "leftCol"}
                                                      ).find_next("a")["href"]

        match_week_number = match_tag.find("div", {"class": "date"}).text.strip()

        # Collect the values under the column names used by the CSV writer
        match_details.append({
            'Date': match_date,
            'Championship_Title': championship_title,
            'Team_A': team_a,
            'Team_B': team_b,
            'Time': kickoff_time,
            'Team_A_Score': team_a_score,
            'Team_B_Score': team_b_score,
            'Match_Status': match_status,
            'Chanel_Streaming': chanel_streaming,
            'Match_Detail_Link': match_detail_link,
            'Match_Week_Number': match_week_number,
        })

    return match_details

#### Function that collects championship details

In [6]:

def get_championship_details(championships, match_details=None):
    """Collect the match rows of every championship into one flat list.

    Parameters
    ----------
    championships : iterable
        Championship tags; each must expose ``find`` and ``find_all``.
    match_details : optional
        Unused. Kept (now optional) only so existing two-argument calls
        continue to work.

    Returns
    -------
    list
        The rows returned by ``scrape_single_match_details`` for each
        championship, concatenated in the order the championships were given.
        An empty ``championships`` gives an empty list.
    """
    big_match_details = []

    for championship in championships:
        championship_title = championship.find("a", {"class": "tourTitle"}).find_next("h2").text.strip()

        # Only list items whose class is "item finish" are collected
        # (presumably the finished matches - confirm against the site markup).
        championship_matches = championship.find_all("li", {"class": "item finish"})

        # extend() appends in place, so no special case is needed for the first championship
        big_match_details.extend(
            scrape_single_match_details(championship_matches, championship_title)
        )

    return big_match_details
            

#### Function that stores the championships' match data in a CSV file

In [7]:
# Function to store the scraped data in a CSV file

def store_in_csv(match_details, file_name, path):
    """Write the scraped match rows to a CSV file with a header row.

    Parameters
    ----------
    match_details : iterable of dict
        One dictionary per match, keyed by the column names listed in ``keys``.
    file_name : str
        Only used in the confirmation message printed after writing.
    path : str
        Full path of the CSV file to create; an existing file is overwritten.

    The file is written as UTF-8 explicitly. Without an ``encoding`` argument
    ``open`` uses the platform's locale encoding, which may be unable to encode
    the Arabic text and does not match the UTF-8 default of ``pd.read_csv``.
    """
    # Column order of the CSV; these are the keys of each match dictionary
    keys = ['Date', 'Championship_Title', 'Team_A', 'Team_B', 'Time', 'Team_A_Score', 'Team_B_Score',
            'Match_Status', 'Chanel_Streaming', 'Match_Detail_Link','Match_Week_Number']

    # newline="" lets the csv module control line endings itself
    with open(path, 'w', newline="", encoding="utf-8") as match_data:
        dict_writer = csv.DictWriter(match_data, fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(match_details)

    # Reached only when the file was written and closed without an exception
    print(f"Match data saved to file {file_name}")

#### The main function that scrapes the website content

In [8]:
# Function that runs the scraping steps end to end
def scraping(page):
    """Parse the fetched page, collect every championship's matches and save them.

    ``page`` is the response object whose ``content`` holds the page source.
    The rows are written through ``store_in_csv`` using the notebook-level
    ``file_name`` and ``path``; nothing is returned.
    """
    parsed_page = BeautifulSoup(page.content, "lxml")

    # Each "matchCard" div is one championship on the requested date
    championship_cards = parsed_page.find_all("div", {'class':'matchCard'})

    # Flat list with the match details of all championships
    scraped_rows = get_championship_details(championship_cards, [])

    # Persist the scraped rows to the CSV file
    store_in_csv(scraped_rows, file_name, path)
    

#### Calling the main function

In [9]:

# Run the whole pipeline on the page fetched earlier: parse it, scrape each championship, write the CSV
scraping(page)


Match data saved to file Match_Data_Scraped_03-04-2023.csv


## Working on the resulting data

### Opening the CSV file and showing the data

In [10]:
#Importing Libraries

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from six.moves import urllib
import os 

In [11]:
# Function that reads the scraped data back from disk

def load_data(file_path, file_name):
    """Return the CSV file ``file_name`` located in ``file_path`` as a pandas DataFrame."""
    return pd.read_csv(os.path.join(file_path, file_name))



In [12]:
# Read the CSV named by file_path/file_name back into a pandas DataFrame via load_data
data = load_data(file_path, file_name)

In [13]:
# Preview the first rows rather than rendering the entire DataFrame
data.head(10)

Unnamed: 0,Date,Championship_Title,Team_A,Team_B,Time,Team_A_Score,Team_B_Score,Match_Status,Chanel_Streaming,Match_Detail_Link,Match_Week_Number
0,2023-03-04,دوري أبطال إفريقيا,الأهلي,القطن الكاميروني,18:00,3,0,انتهت,بى ان سبورت 4 HD,https://www.yallakora.com/african-champions-le...,الجولة الاولي
1,2023-03-04,الدوري الإنجليزي,مانشستر سيتي,نيوكاسل,14:30,2,0,انتهت,بى ان سبورت بريميوم 1,https://www.yallakora.com/epl/2761/match/87025...,الأسبوع السادس والعشرون
2,2023-03-04,الدوري الإنجليزي,تشيلسي,ليدز يونايتد,17:00,1,0,انتهت,بى ان سبورت بريميوم 2,https://www.yallakora.com/epl/2761/match/87026...,الأسبوع السادس والعشرون
3,2023-03-04,الدوري الإنجليزي,استون فيلا,كريستال بالاس,17:00,1,0,انتهت,بى ان سبورت اكسترا 1,https://www.yallakora.com/epl/2761/match/87027...,الأسبوع السادس والعشرون
4,2023-03-04,الدوري الإنجليزي,وولفرهامبتون,توتنهام هوتسبر,17:00,1,0,انتهت,بى ان سبورت بريميوم 3,https://www.yallakora.com/epl/2761/match/87028...,الأسبوع السادس والعشرون
5,2023-03-04,الدوري الإنجليزي,برايتون,وست هام يونايتد,17:00,4,0,انتهت,بى ان سبورت اكسترا 2,https://www.yallakora.com/epl/2761/match/87029...,الأسبوع السادس والعشرون
6,2023-03-04,الدوري الإنجليزي,أرسنال,بورنموث,17:00,3,2,انتهت,بى ان سبورت بريميوم 1,https://www.yallakora.com/epl/2761/match/87030...,الأسبوع السادس والعشرون
7,2023-03-04,الدوري الإنجليزي,ساوثامبتون,ليستر سيتي,19:30,1,0,انتهت,,https://www.yallakora.com/epl/2761/match/87031...,الأسبوع السادس والعشرون
8,2023-03-04,الدوري الإسباني,خيتافي,جيرونا,15:00,3,2,انتهت,بى ان سبورت 1HD,https://www.yallakora.com/la-liga/2764/match/8...,الأسبوع الرابع والعشرون
9,2023-03-04,الدوري الإسباني,الميريا,فياريال,17:15,0,2,انتهت,بى ان سبورت 1HD,https://www.yallakora.com/la-liga/2764/match/8...,الأسبوع الرابع والعشرون
