In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import os

In [2]:
def _render_inline(elements):
    """Concatenate the display text of a tweet fragment's child nodes.

    Args:
        elements (iterable): child nodes of a bs4 tag (its ``.contents``).

    Returns:
        str: the pieces of text in document order. A node containing a ``<div>``
        is rendered as that div's ``aria-label`` (wrapped in square brackets when
        the div also has a ``title`` attribute); any other node is rendered with
        its own text.

    Raises:
        KeyError: if such a ``<div>`` has no ``aria-label`` attribute.
    """
    parts = []
    for elem in elements:
        if isinstance(elem, str):
            # Bare text nodes (bs4 NavigableString) subclass str, so `.find` would
            # resolve to str.find and return an int instead of a tag: keep the
            # text as-is rather than failing on `.attrs`.
            parts.append(str(elem))
            continue

        div = elem.find("div")  # looked up once instead of up to three times
        if div is None:
            parts.append(elem.getText())
            continue

        label = div.attrs['aria-label']
        parts.append(f"[{label}]" if 'title' in div.attrs else label)
    return "".join(parts)


def find_author(tweet):
    """Extract the author of a tweet.

    Args:
        tweet (bs4.element.Tag): element wrapping a single tweet.

    Returns:
        str: author of the tweet.

    Raises:
        AttributeError: if the author ``<span>`` is not found in ``tweet``.
        KeyError: if an embedded ``<div>`` has no ``aria-label`` attribute.
    """
    author_span = tweet.find("span", class_="css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0")
    return _render_inline(author_span.contents)


def find_text(tweet):
    """Extract the text of a tweet and its language.

    Args:
        tweet (bs4.element.Tag): element wrapping a single tweet.

    Returns:
        tuple[str, str]: text of the tweet, language of the tweet (value of the
        ``lang`` attribute of the text container).

    Raises:
        AttributeError: if no text container is found in ``tweet``.
        KeyError: if the container has no ``lang`` attribute, or an embedded
            ``<div>`` has no ``aria-label`` attribute.
    """
    container = tweet.find("div", class_="css-901oao r-jwli3a r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0")

    if container is None:
        # Alternate class list, seen for Japanese tweets for example.
        container = tweet.find("div", class_="css-901oao r-jwli3a r-1tl8opc r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0")

    lang = container.attrs['lang']
    return _render_inline(container.contents), lang

In [3]:
# Parse every saved search page under "web/" into a list of tweet records.
# Sorted so the row order is reproducible (os.listdir order is arbitrary), and
# restricted to regular files so a sub-directory does not break open().
FILES = sorted(
    name for name in os.listdir('web')
    if os.path.isfile(os.path.join('web', name))
)

data = []

for file in FILES:
    print(file)
    # The file name without extension is used as the search keyword.
    search = os.path.splitext(file)[0]

    # Read raw bytes and let BeautifulSoup detect the document encoding instead
    # of decoding with the platform default, which may not handle every tweet.
    with open(os.path.join('web', file), "rb") as f:
        page = BeautifulSoup(f.read(), 'lxml')

    tweets = page.find_all("div", class_="css-1dbjc4n r-1iusvr4 r-16y2uox r-1777fci r-1mi0q7o")
    for i, tweet in enumerate(tweets):
        url = tweet.find("a", class_="css-4rbku5 css-18t94o4 css-901oao r-111h2gw r-1loqt21 r-1q142lx r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-3s2u2q r-qvutc0").attrs.get("href")
        # Checked: the url is unique.

        error_author = False
        try:
            author = find_author(tweet)
        except Exception as exc:  # keep going, but never swallow KeyboardInterrupt
            error_author = True
            print("Erreur auteur :", url, repr(exc))

        time = tweet.find("time").attrs.get("datetime")
        time = datetime.datetime.strptime(time, '%Y-%m-%dT%H:%M:%S.%fZ')
        # Retweets sometimes carry two dates; the first one is the one to use.

        error_text = False
        try:
            text, lang = find_text(tweet)
        except Exception as exc:
            error_text = True
            print("Erreur texte", i, url, repr(exc))

        # A tweet is flagged as a reply when the first string of this block is
        # exactly "Replying to ".
        reply_ = tweet.find("div", class_="css-901oao r-111h2gw r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-qvutc0")
        reply = reply_ is not None and reply_.find(string=True) == "Replying to "

        # Only keep tweets for which both the author and the text were parsed.
        if not error_author and not error_text:
            data.append({'search': search, 'author': author, 'time': time, 'url': url, 'text': text, 'lang':lang, 'reply':reply})

biden.html
trump.html


In [4]:
# One row per successfully parsed tweet. The columns are listed explicitly so the
# frame keeps its schema (and `df.search` / `df.text` keep working) even when
# `data` is empty.
df = pd.DataFrame(data, columns=['search', 'author', 'time', 'url', 'text', 'lang', 'reply'])
len(df)

4045

In [5]:
# Inspect the inferred dtype of each column of the tweets frame.
df.dtypes

search            object
author            object
time      datetime64[ns]
url               object
text              object
lang              object
reply               bool
dtype: object

In [6]:
# Tweets collected from the "trump" search page.
df.loc[df["search"] == "trump"]

Unnamed: 0,search,author,time,url,text,lang,reply
517,trump,"Honestly, Its What They Deserve 🌊🌊",2012-12-05 23:59:48,/WhatFulikaSays/status/276475993600954368,"Look, I love Nene as much as you but has anyon...",en,False
518,trump,PEEZO,2012-12-05 23:59:48,/ItsNiecesPieces/status/276475992921493504,& your arms tired...they gotta be lol RT @MsHo...,en,False
519,trump,Kayleigh Donaldson,2012-12-05 23:59:46,/Ceilidhann/status/276475984931323904,"Come on Scotland, let us unite for a wonderful...",en,False
520,trump,Big Scotty,2012-12-05 23:59:39,/redsealtech/status/276475955302772737,@scotchblog Trump has completely lost his marb...,en,False
521,trump,Andrew Clauer,2012-12-05 23:58:50,/andrewclauer/status/276475749542801408,"I'm at Trump Parc Condo (New York, NY) http://...",en,False
...,...,...,...,...,...,...,...
4040,trump,Paolo,2012-12-05 02:36:59,/PaoloTorress/status/276153163374030849,Take over the world when im on my donald trump...,en,False
4041,trump,♡ N. Doll ♡,2012-12-05 02:36:50,/North_Doll/status/276153125289750528,@BubbaWallace @chaseelliott @RossKenseth lmao ...,en,False
4042,trump,Arista Ellis,2012-12-05 02:36:43,/Arista817/status/276153098207117312,Jerry Trump admits in April 2012 to lying at 2...,en,False
4043,trump,David Thomas McKenzie,2012-12-05 02:36:39,/davidtmckenzie/status/276153079391481856,@realDonaldTrump \n\nLove ritz Carleton. Great...,en,False


In [7]:
def filtre(text):
    """Tell whether a text should be kept for the "trump" search.

    The comparison is case-insensitive. A text is rejected only when it contains
    "trumps" and no occurrence of "trump" remains once every "trumps" has been
    removed; every other text is kept.

    Args:
        text (str): text to check.

    Returns:
        bool: True to keep the text, False to discard it.
    """
    lowered = text.lower()
    without_trumps = lowered.replace("trumps", "")
    return "trumps" not in lowered or "trump" in without_trumps

In [8]:
# "trump" tweets that pass `filtre`.
df.loc[(df["search"] == "trump") & (df["text"].apply(filtre))]

Unnamed: 0,search,author,time,url,text,lang,reply
518,trump,PEEZO,2012-12-05 23:59:48,/ItsNiecesPieces/status/276475992921493504,& your arms tired...they gotta be lol RT @MsHo...,en,False
519,trump,Kayleigh Donaldson,2012-12-05 23:59:46,/Ceilidhann/status/276475984931323904,"Come on Scotland, let us unite for a wonderful...",en,False
520,trump,Big Scotty,2012-12-05 23:59:39,/redsealtech/status/276475955302772737,@scotchblog Trump has completely lost his marb...,en,False
521,trump,Andrew Clauer,2012-12-05 23:58:50,/andrewclauer/status/276475749542801408,"I'm at Trump Parc Condo (New York, NY) http://...",en,False
522,trump,Selene Navarro,2012-12-05 23:58:37,/SeleneNavarroS/status/276475698640719872,Women trump men in financial planning: RaboDir...,en,False
...,...,...,...,...,...,...,...
4039,trump,homelet,2012-12-05 02:37:14,/homelet_ca/status/276153225588117504,OSC won't take action against Toronto’s Trump ...,en,False
4040,trump,Paolo,2012-12-05 02:36:59,/PaoloTorress/status/276153163374030849,Take over the world when im on my donald trump...,en,False
4041,trump,♡ N. Doll ♡,2012-12-05 02:36:50,/North_Doll/status/276153125289750528,@BubbaWallace @chaseelliott @RossKenseth lmao ...,en,False
4042,trump,Arista Ellis,2012-12-05 02:36:43,/Arista817/status/276153098207117312,Jerry Trump admits in April 2012 to lying at 2...,en,False


In [9]:
# Tweets collected from the "biden" search page.
df.loc[df["search"] == "biden"]

Unnamed: 0,search,author,time,url,text,lang,reply
0,biden,Frank Braswell,2012-12-05 23:41:12,/FBRASWELL/status/276471312489254912,Gotta love Joe! RT @rorycooper: VP Biden says ...,en,False
1,biden,GLENN,2012-12-05 23:40:46,/GCGATOR24/status/276471203290554369,BIDEN'S HAIRPLUGS EXEMPT: IRS finalizes new t...,en,False
2,biden,cydney,2012-12-05 23:39:11,/cydney/status/276470805855076353,"Omg Keisha... ""@Angelic_kiss: Listening to ""Pu...",en,False
3,biden,GLENN,2012-12-05 23:38:10,/GCGATOR24/status/276470550338080768,BIDEN'S HAIRPLUGS EXEMPT: @DRUDGE_REPORT: IRS ...,en,False
4,biden,Emily Greenhouse,2012-12-05 23:37:28,/emserre/status/276470372902264832,"As Obama jotted while Biden blathered: ""Shoot....",en,False
...,...,...,...,...,...,...,...
512,biden,follow my new one,2012-12-05 00:09:31,/MadeANewHandle/status/276116053816401920,@TheLiteralMC (that guy at that one Joe Biden ...,en,False
513,biden,Eric Dierkes,2012-12-05 00:08:28,/DirkyDierkerson/status/276115788241444864,@_gabrielleeeeee @zoseph_ @jo_biden I know Gab...,en,False
514,biden,Brent,2012-12-05 00:06:57,/brento76/status/276115404622032896,"But, But, Biden said..... \nRT @DRUDGE_REPORT:...",en,False
515,biden,Eric Dierkes,2012-12-05 00:06:47,/DirkyDierkerson/status/276115366453854208,@_gabrielleeeeee @zoseph_ @jo_biden DRESS jean...,en,False
