In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import os
import re

In [2]:
def find_author(tweet):
    """Return the display name of a tweet's author.

    The name is rebuilt from the children of the author ``span``: text is kept
    as-is, and every child wrapping a ``div`` is replaced by that div's
    ``aria-label`` (placed in square brackets when the div also carries a
    ``title`` attribute).
    NOTE(review): those divs look like emoji images, judging by labels such as
    "[Drapeau des États-Unis]" in the parsed output -- confirm.

    Args:
        tweet (bs4.element.Tag): container element of a single tweet.

    Returns:
        str: the author's display name.

    Raises:
        AttributeError: if the author span is not found in ``tweet``.
        KeyError: if an embedded div has no ``aria-label`` attribute.
    """
    name_span = tweet.find("span", class_="css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0")
    parts = []
    for elem in name_span.contents:
        # Text nodes subclass ``str``: calling ``.find`` on them runs ``str.find``
        # and yields an int index (-1 is truthy), not a tag. Handle them first.
        if isinstance(elem, str):
            parts.append(str(elem))
            continue
        inner_div = elem.find("div")  # looked up once instead of three times
        if inner_div is None:
            parts.append(elem.getText())
        elif 'title' in inner_div.attrs:
            parts.append(f"[{inner_div.attrs['aria-label']}]")
        else:
            parts.append(inner_div.attrs['aria-label'])
    return "".join(parts)


def find_text(tweet):
    """Return the text of a tweet and the language declared on its text block.

    The text is rebuilt from the children of the text ``div``: text is kept
    as-is, and every child wrapping a ``div`` is replaced by that div's
    ``aria-label`` (placed in square brackets when the div also carries a
    ``title`` attribute).

    Args:
        tweet (bs4.element.Tag): container element of a single tweet.

    Returns:
        tuple[str, str]: the tweet text and the value of the ``lang`` attribute.

    Raises:
        AttributeError: if neither text block variant is found in ``tweet``.
        KeyError: if the text block has no ``lang`` attribute, or an embedded
            div has no ``aria-label`` attribute.
    """
    body = tweet.find("div", class_="css-901oao r-jwli3a r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0")

    if body is None:
        # Some tweets use a different class list for the text block (Japanese, for example).
        body = tweet.find("div", class_="css-901oao r-jwli3a r-1tl8opc r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0")

    lang = body.attrs['lang']
    parts = []
    for elem in body.contents:
        # Text nodes subclass ``str``: calling ``.find`` on them runs ``str.find``
        # and yields an int index (-1 is truthy), not a tag. Handle them first.
        if isinstance(elem, str):
            parts.append(str(elem))
            continue
        inner_div = elem.find("div")  # looked up once instead of three times
        if inner_div is None:
            parts.append(elem.getText())
        elif 'title' in inner_div.attrs:
            parts.append(f"[{inner_div.attrs['aria-label']}]")
        else:
            parts.append(inner_div.attrs['aria-label'])
    return "".join(parts), lang

In [3]:
# Folder holding the saved Twitter search pages (one HTML file per search term and date).
path = os.path.join('data', 'web', 'html')
# Sorted because os.listdir returns entries in arbitrary order; restricted to HTML
# files so stray entries (hidden files, checkpoint folders) are not parsed.
FILES = sorted(name for name in os.listdir(path) if name.lower().endswith('.html'))

# One dict per successfully parsed tweet; turned into a DataFrame in the next cell.
data = []

for file in FILES:
    print(file)
    # File names look like "<search>-<date>_test.html": the search term is whatever
    # precedes the first dash (the whole stem when the name contains no dash).
    search = os.path.splitext(file)[0].split('-', 1)[0]
    # Explicit encoding so the result does not depend on the platform default.
    # NOTE(review): assumes the pages were saved as UTF-8 -- confirm.
    with open(os.path.join(path, file), "r", encoding="utf-8") as f:
        page = BeautifulSoup(f.read(), 'lxml')

    tweets = page.find_all("div", class_="css-1dbjc4n r-1iusvr4 r-16y2uox r-1777fci r-1mi0q7o")
    for i, tweet in enumerate(tweets):
        link = tweet.find("a", class_="css-4rbku5 css-18t94o4 css-901oao r-111h2gw r-1loqt21 r-1q142lx r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-3s2u2q r-qvutc0")
        # Retweets sometimes carry two dates; the first one is the one to keep,
        # which is what find() returns.
        time_tag = tweet.find("time")
        if link is None or time_tag is None:
            # Skip the malformed tweet instead of aborting the whole run.
            print("Erreur url/date", i, file)
            continue

        url = link.attrs.get("href")  # unique per tweet
        posted_at = datetime.datetime.strptime(time_tag.attrs.get("datetime"), '%Y-%m-%dT%H:%M:%S.%fZ')

        # Parsing failures are reported and the tweet is dropped. `Exception`
        # (not a bare except) keeps Ctrl-C / kernel interrupts working.
        try:
            author = find_author(tweet)
        except Exception:
            author = None
            print("Erreur auteur :", url)

        try:
            text, lang = find_text(tweet)
        except Exception:
            text = lang = None
            print("Erreur texte", i, url)

        # NOTE(review): the aria-labels in the parsed output are in French; if the
        # UI text of the saved pages is localized too, this English literal never
        # matches and `reply` is always False -- confirm.
        reply_ = tweet.find("div", class_="css-901oao r-111h2gw r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-qvutc0")
        reply = reply_ is not None and reply_.find(string=True) == "Replying to "

        if author is None or text is None:
            continue

        data.append({'search': search, 'author': author, 'time': posted_at, 'url': url, 'text': text, 'lang': lang, 'reply': reply})

biden-2019-01-01_test.html
biden-2019-02-22_test.html
biden-2019-04-15_test.html
Erreur auteur : /Sudbury34001015/status/1117940614778081280
biden-2019-06-06_test.html
biden-2019-07-28_test.html
biden-2019-09-18_test.html
biden-2019-11-09_test.html
biden-2019-12-31_test.html
trump-2019-01-01_test.html
trump-2019-02-22_test.html
trump-2019-04-15_test.html
trump-2019-06-06_test.html
trump-2019-07-28_test.html
trump-2019-09-18_test.html
trump-2019-11-09_test.html
trump-2019-12-31_test.html


In [4]:
# Build the tweet table. Naming the columns explicitly keeps the schema stable
# (same column order as the record keys) even when `data` is empty, so
# column-based filters on `df` still work when nothing was parsed.
df = pd.DataFrame(data, columns=['search', 'author', 'time', 'url', 'text', 'lang', 'reply'])
len(df)

2906

In [5]:
# Display the column dtypes of the parsed tweet table.
df.dtypes

search            object
author            object
time      datetime64[ns]
url               object
text              object
lang              object
reply               bool
dtype: object

In [6]:
# Rows collected from the "trump" search pages.
df.loc[df["search"] == "trump"]

Unnamed: 0,search,author,time,url,text,lang,reply
1457,trump,ATP,2019-01-01 23:59:59,/ATPJudge/status/1080252327611764736,Trump is too dim to figure out how to postpone...,en,False
1458,trump,Marc,2019-01-01 23:59:59,/ElegantxEgotist/status/1080252327557251073,The military has gotten pay raises every year....,en,False
1459,trump,Sally,2019-01-01 23:59:59,/Toughrthanmost/status/1080252327129481216,"they don’t owe tRump any explanation, he proba...",en,False
1460,trump,Barry Fahey,2019-01-01 23:59:58,/Fahey9Fahey/status/1080252324264755201,You are not paying attention. Fed raised inter...,en,False
1461,trump,Evict the Squatter in the WH!,2019-01-01 23:59:58,/jalan_jeff/status/1080252323807617024,No. This is how tRump talks. We don’t have a l...,en,False
...,...,...,...,...,...,...,...
2901,trump,RobDogDiggity,2019-12-31 23:59:06,/robdogdiggity/status/1212161285648977920,"Oh, if he would’ve pulled out a gun and shot o...",en,False
2902,trump,the other red [Drapeau des États-Unis][Drapeau...,2019-12-31 23:59:05,/zivvy_1/status/1212161285040869383,So no reality? THAT is how we get another 4 ye...,en,False
2903,trump,PeterDavies,2019-12-31 23:59:05,/PeterMDavies80/status/1212161285036630016,Trump strong! He's so tough! He's gonna show t...,en,False
2904,trump,"No, Donny, you did not win the election.",2019-12-31 23:59:05,/Vote4USA2020/status/1212161284617179136,That is so true. I think the ‘election’ of som...,en,False


In [7]:
# Rows collected from the "biden" search pages.
df.loc[df["search"] == "biden"]

Unnamed: 0,search,author,time,url,text,lang,reply
0,biden,Christopher Suprun #WeArePerseus,2019-01-01 23:59:59,/TheChrisSuprun/status/1080252328886915072,Off top of my head...Forgive spelling\n\nHarri...,en,False
1,biden,robbiewithoutanynumbers,2019-01-01 23:59:54,/robbienotrobin/status/1080252304509599744,"I kinda like Kamala/Beto, with Biden as Chif O...",en,False
2,biden,Linda Hirshman 🏊🏻‍♂️,2019-01-01 23:59:50,/LindaHirshman1/status/1080252291184189441,also Biden to Grassley. Did you know he wrote ...,en,False
3,biden,Carol Price,2019-01-01 23:59:43,/CarolP1941/status/1080252261547180032,I am anxious to see who can pull us out of thi...,en,False
4,biden,Oliver Anderson,2019-01-01 23:59:34,/bogart7777/status/1080252220204027904,"Black people, please don’t trust Joe Biden, he...",en,False
...,...,...,...,...,...,...,...
1452,biden,One4TheBooks,2019-12-31 23:51:00,/BooksOne4/status/1212159248303558656,You definitely can. The firing of Shokin was a...,en,False
1453,biden,Lauren,2019-12-31 23:50:57,/Paperikascards/status/1212159238455414784,No Biden was representing US and Obama had his...,en,False
1454,biden,Steampunkette,2019-12-31 23:50:55,/steampunkette/status/1212159229676654592,Because Shokin was supposed to be investigatin...,en,False
1455,biden,Dave Wolber,2019-12-31 23:50:53,/dwwolber/status/1212159217999601664,The key to saving our democracy is an alliance...,en,False


In [8]:
# Save the parsed tweets as a pickle; pandas infers bz2 compression from the
# ".bz2" extension.
file = os.path.join("data", "web", "web_parse.bz2")
# exist_ok=True replaces the separate os.path.exists check, which could race
# with another process creating the folder in between.
os.makedirs(os.path.dirname(file), exist_ok=True)
df.to_pickle(file)