In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import os
import re

In [2]:
# CSS class signatures of the saved Twitter pages this parser targets.
_AUTHOR_SPAN_CLASS = "css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0"
_TEXT_DIV_CLASSES = (
    "css-901oao r-jwli3a r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0",
    # Variant with a different class, seen e.g. on Japanese tweets.
    "css-901oao r-jwli3a r-1tl8opc r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0",
)


def _join_contents(elements):
    """Flatten the children of a tweet node into a single string.

    Args:
        elements (iterable): children of a bs4 tag (tags and/or text nodes).

    Returns:
        string: concatenation of each child's rendering. A child containing a
            <div> is rendered as that div's ``aria-label`` (wrapped in square
            brackets when the div also has a ``title`` attribute) -- presumably
            these divs are emoji images; any other child is rendered as its text.
    """
    parts = []
    for elem in elements:
        if isinstance(elem, str):
            # bs4 text nodes subclass str, so elem.find("div") would be
            # str.find (an int), not a tag search: take the text as-is.
            parts.append(str(elem))
            continue

        div = elem.find("div")  # looked up once instead of up to three times
        label = None if div is None else div.attrs.get("aria-label")
        if label is None:
            # No <div>, or a <div> without aria-label: fall back to plain text
            # instead of failing with KeyError and losing the whole tweet.
            parts.append(elem.getText())
        elif "title" in div.attrs:
            parts.append(f"[{label}]")
        else:
            parts.append(label)
    return "".join(parts)


def find_author(tweet):
    """Extract the display name of a tweet's author.

    Args:
        tweet (bs4.element.Tag): container element of one tweet.

    Returns:
        string: author of the tweet.

    Raises:
        AttributeError: if the author <span> is not found in ``tweet``.
    """
    name_span = tweet.find("span", class_=_AUTHOR_SPAN_CLASS)
    if name_span is None:
        # Same exception type as the former implicit failure, with a usable message.
        raise AttributeError("author <span> not found in tweet")
    return _join_contents(name_span.contents)


def find_text(tweet):
    """Extract the text and the language of a tweet.

    Args:
        tweet (bs4.element.Tag): container element of one tweet.

    Returns:
        string, string: text of the tweet, language of the tweet
            (value of the ``lang`` attribute of the text <div>).

    Raises:
        AttributeError: if no text <div> is found in ``tweet``.
        KeyError: if the text <div> has no ``lang`` attribute.
    """
    body = None
    for css_class in _TEXT_DIV_CLASSES:
        body = tweet.find("div", class_=css_class)
        if body is not None:
            break
    if body is None:
        # Same exception type as the former implicit failure, with a usable message.
        raise AttributeError("text <div> not found in tweet")

    lang = body.attrs['lang']
    return _join_contents(body.contents), lang

In [3]:
# Parse every saved search-result page and collect one record per tweet.
path = os.path.join('data', 'web', 'html_2019')
# os.listdir returns entries in arbitrary order: sort so that the row order of
# the resulting table is reproducible from one run to the next.
FILES = sorted(os.listdir(path))

data = []

for file in FILES:
    print(file)
    # Search keyword = part of the file name before the first "-"
    # (e.g. "biden-2019-01-01.html" -> "biden").
    search = re.findall('[^-]*(?=-)', os.path.splitext(file)[0])[0]
    with open(os.path.join(path, file), "r", encoding="utf-8") as f:
        page = BeautifulSoup(f.read(), 'lxml')

    # The file handle is no longer needed once the page is parsed.
    tweets = page.find_all("div", class_="css-1dbjc4n r-1iusvr4 r-16y2uox r-1777fci r-1mi0q7o")
    for i, tweet in enumerate(tweets):
        link = tweet.find("a", class_="css-4rbku5 css-18t94o4 css-901oao r-111h2gw r-1loqt21 r-1q142lx r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-3s2u2q r-qvutc0")
        if link is None:
            # No permalink, e.g. a tweet withheld after a copyright report: skip it.
            continue
        url = link.attrs.get("href")

        valid = True

        # Best-effort extraction: a tweet that cannot be parsed is reported and
        # skipped. `except Exception` (not a bare except) keeps Ctrl-C working.
        try:
            author = find_author(tweet)
        except Exception:
            valid = False
            print("Erreur auteur :", url)

        # Retweets sometimes carry two dates; the first one (the one returned
        # by find) is the one that must be used.
        time_tag = tweet.find("time")
        stamp = None if time_tag is None else time_tag.attrs.get("datetime")
        if stamp is None:
            # Previously an uncaught AttributeError/TypeError aborted the whole run.
            valid = False
            print("Erreur date", i, url)
        else:
            time = datetime.datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S.%fZ')

        try:
            text, lang = find_text(tweet)
        except Exception:
            valid = False
            print("Erreur texte", i, url)

        # A tweet is a reply when the first string of this block is "Replying to ".
        reply_ = tweet.find("div", class_="css-901oao r-111h2gw r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-qvutc0")
        reply = reply_ is not None and reply_.find(string=True) == "Replying to "

        if valid:
            data.append({'search': search, 'author': author, 'time': time, 'url': url, 'text': text, 'lang':lang, 'reply':reply})

biden-2019-01-01.html
biden-2019-01-04.html
biden-2019-01-08.html
biden-2019-01-12.html
biden-2019-01-15.html
biden-2019-01-19.html
biden-2019-01-23.html
biden-2019-01-26.html
biden-2019-01-30.html
biden-2019-02-03.html
biden-2019-02-06.html
biden-2019-02-10.html
biden-2019-02-14.html
biden-2019-02-17.html
biden-2019-02-21.html
biden-2019-02-25.html
biden-2019-02-28.html
biden-2019-03-04.html
biden-2019-03-08.html
biden-2019-03-11.html
biden-2019-03-15.html
biden-2019-03-19.html
biden-2019-03-22.html
biden-2019-03-26.html
biden-2019-03-30.html
biden-2019-04-02.html
biden-2019-04-06.html
biden-2019-04-10.html
biden-2019-04-13.html
biden-2019-04-17.html
biden-2019-04-21.html
biden-2019-04-24.html
biden-2019-04-28.html
biden-2019-05-02.html
biden-2019-05-06.html
biden-2019-05-09.html
biden-2019-05-13.html
biden-2019-05-17.html
biden-2019-05-20.html
biden-2019-05-24.html
biden-2019-05-28.html
biden-2019-05-31.html
biden-2019-06-04.html
biden-2019-06-08.html
biden-2019-06-11.html
biden-2019

In [4]:
# Turn the list of per-tweet records into a table and show how many rows it has.
df = pd.DataFrame(data=data)
df.shape[0]

10694

In [5]:
# Display the dtype pandas assigned to each column of df.
df.dtypes

search            object
author            object
time      datetime64[ns]
url               object
text              object
lang              object
reply               bool
dtype: object

In [6]:
# Rows whose search keyword is "trump".
df.loc[df["search"] == "trump"]

Unnamed: 0,search,author,time,url,text,lang,reply
5402,trump,Hal Lambert,2019-01-01 23:59:51,/MAGAindex/status/1080252292002209792,President Trump is the only one standing up fo...,en,False
5403,trump,Claude Taylor,2019-01-01 23:54:46,/TrueFactsStated/status/1080251015268630529,How does anyone expect the investigations to c...,en,False
5404,trump,Nouriel Roubini,2019-01-01 23:57:02,/Nouriel/status/1080251585853382658,Roubini: President Trump is flirting with mutu...,en,False
5405,trump,Rational Disconnect,2019-01-01 23:59:33,/RationalDis/status/1080252216907362304,I’ve been waiting to say this:\n\nTRUMP IS A F...,en,False
5406,trump,Matrix Surfer SAM,2019-01-01 23:56:26,/USAlivestrong/status/1080251435068010497,Democrats get elected without a moral bone in ...,en,False
...,...,...,...,...,...,...,...
10689,trump,VioletDaCat@danabennett,2019-12-27 23:50:58,/VioletDaCatdan1/status/1210709687466741761,"Seriously, when is it going to be over the top...",en,False
10690,trump,Isabella aka #PumpkinSpiceElf,2019-12-27 23:59:03,/Isabella040408/status/1210711722488729601,"Yes, I had a feeling Trump was really talking ...",en,False
10691,trump,Teresae[Globe terrestre Europe-Afrique],2019-12-27 23:52:18,/teresa3llen/status/1210710026077106176,The producers of the movie had to agree to a c...,en,False
10692,trump,Truth Quest,2019-12-27 23:56:12,/TheTruthQuester/status/1210711004570632192,He gave Trump the presidency and now he is pro...,en,False


In [7]:
# Rows whose search keyword is "biden".
df.loc[df["search"] == "biden"]

Unnamed: 0,search,author,time,url,text,lang,reply
0,biden,Maggie Klaus 🤶🏻,2019-01-01 22:37:46,/Maggie_Klaus/status/1080231636334067712,Shut the f*ck up about Hillary’s likability. W...,en,False
1,biden,The Salt Lake Tribune,2019-01-01 22:15:18,/sltrib/status/1080225983083487232,The @UUtah agreed to pay former Vice President...,en,False
2,biden,The New York Times,2019-01-01 20:00:08,/nytimes/status/1080191967873716224,"A series of careful financial decisions, and t...",en,False
3,biden,Matt Binder,2019-01-01 22:18:51,/MattBinder/status/1080226876226965504,i agree. this is useful. democrats should defi...,en,False
4,biden,Patrick Healy,2019-01-01 19:58:58,/patrickhealynyt/status/1080191673701933059,NEWS: Joe Biden's 2020 campaign-in-waiting inc...,en,False
...,...,...,...,...,...,...,...
5397,biden,Steve Frisbie,2019-12-27 23:24:51,/commishfriz/status/1210703116603215872,Keep grasping...even Congressman Nadler gave u...,en,False
5398,biden,TDavisJ,2019-12-27 22:33:45,/RDavisJ1/status/1210690256367804418,"Biden, I think, is really having some cognitio...",en,False
5399,biden,Larry Langley,2019-12-27 23:11:59,/doublel71/status/1210699879288315905,Jill Biden steps over Veterans to help illegal...,en,False
5400,biden,☆Ophidian Pilot☆,2019-12-27 23:10:48,/ophidianpilot/status/1210699580918161409,Biden: I won’t comply with subpoena to testify...,en,False


In [8]:
# Save the parsed tweets as a pickle; pandas infers bz2 compression from the
# ".bz2" file extension.
file = os.path.join("data", "web", "web_parse_2019.bz2")
# exist_ok=True replaces the racy "check, then create" sequence.
os.makedirs(os.path.dirname(file), exist_ok=True)
df.to_pickle(file)