Skip to content

Commit

Permalink
Twitter scraping
Browse files Browse the repository at this point in the history
  • Loading branch information
amadejkastelic committed Sep 5, 2023
1 parent dfaa9d9 commit d8380eb
Show file tree
Hide file tree
Showing 5 changed files with 146 additions and 8 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
cookies.txt
instagram.sess
.env
accounts.db
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ redvid = "==2.0.2"
ffmpeg-python = "==0.2.0"
opencv-python = "==4.8.0.74"
asyncpraw = "==7.7.1"
twscrape = "==0.7.0"

[dev-packages]
black = "==23.7.0"
Expand Down
72 changes: 64 additions & 8 deletions Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,9 @@ REDDIT_API_TOKEN=<your_reddit_api_token>
REDDIT_API_SECRET=<your_reddit_api_secret>
REDDIT_USER_AGENT=<name_version_and_your_username>
```
- For better Twitter support, add your Twitter account credentials:
```bash
TWITTER_USERNAME=<your_twitter_username>
TWITTER_EMAIL=<your_twitter_email>
TWITTER_PASSWORD=<your_twitter_password>
```
74 changes: 74 additions & 0 deletions downloader/twitter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
import datetime
import json
import logging
import os
import typing

import twscrape

from downloader import base
from models import post
Expand All @@ -22,6 +27,32 @@
}


class TwitterClientSingleton(object):
    """Holder for one shared, logged-in ``twscrape.API`` client.

    Credentials come from the ``TWITTER_USERNAME``, ``TWITTER_EMAIL`` and
    ``TWITTER_PASSWORD`` environment variables. When any of them is missing
    no client is created and callers are expected to use a login-free path.
    """

    # Shared client; stays ``None`` until a login attempt has completed.
    INSTANCE: typing.Optional['twscrape.API'] = None

    @classmethod
    async def get_instance(cls) -> typing.Optional['twscrape.API']:
        """Return the shared client, creating and logging it in on first use.

        Returns:
            The shared ``twscrape.API`` instance, or ``None`` when the
            credentials are not fully configured in the environment.
        """
        username = os.getenv('TWITTER_USERNAME')
        email = os.getenv('TWITTER_EMAIL')
        password = os.getenv('TWITTER_PASSWORD')
        if not all([username, email, password]):
            return None

        if not cls.INSTANCE:
            # Set the client up on a local name and publish it only after
            # the login calls have finished. Publishing it earlier would
            # let a concurrent caller grab a client that is not logged in
            # yet, and a failed login would leave that broken client cached.
            api = twscrape.API()
            # NOTE: the Twitter password is also passed as the e-mail
            # password, exactly as before.
            await api.pool.add_account(
                username=username, email=email, password=password, email_password=password
            )
            await api.pool.login_all()
            # Another coroutine may have finished first; keep a single winner.
            if not cls.INSTANCE:
                cls.INSTANCE = api

        return cls.INSTANCE

    @classmethod
    async def relogin(cls) -> None:
        """Force a fresh login for the configured account, if a client exists."""
        username = os.getenv('TWITTER_USERNAME')
        # Without a username there is nothing to re-login; avoid sending
        # ``[None]`` to the account pool.
        if cls.INSTANCE and username:
            await cls.INSTANCE.pool.relogin(usernames=[username])


class TwitterClient(base.BaseClient):
DOMAINS = ['twitter.com', 'x.com']

Expand All @@ -32,6 +63,49 @@ def __init__(self, url: str):
self.index = int(metadata[2]) - 1 if len(metadata) == 3 and metadata[1] == 'photo' else 0

async def get_post(self) -> post.Post:
    """Fetch the tweet, preferring the logged-in client when one is available.

    Falls back to the login-free scraper when no shared client could be
    obtained from ``TwitterClientSingleton``.
    """
    api = await TwitterClientSingleton.get_instance()
    if api:
        return await self._get_post_login(client=api)

    return await self._get_post_no_login()

async def _get_post_login(self, client: twscrape.API, retry_count=0) -> post.Post:
    """Fetch the tweet through the logged-in ``twscrape`` client.

    On the first failure the account is re-logged-in and the fetch is
    retried once; on the second failure the login-free scraper is used.

    Args:
        client: Logged-in ``twscrape.API`` instance.
        retry_count: Number of attempts already made (0 on the first call).

    Returns:
        The populated post; ``buffer`` is set only when the tweet has media.

    Raises:
        Exception: When called with ``retry_count`` greater than 1 and the
            fetch fails again.
    """
    try:
        details = await client.tweet_details(int(self.id))
        p = post.Post(
            url=self.url,
            author=f'{details.user.displayname} ({details.user.username})',
            description=details.rawContent,
            views=details.viewCount,
            likes=details.likeCount,
            created=details.date.astimezone(),
        )

        media = details.media
        if not media:
            return p

        if media.videos:
            # Pick the highest-bitrate rendition of the first video.
            url = max(media.videos[0].variants, key=lambda x: x.bitrate).url
        elif media.photos:
            photos = media.photos
            # Honour the /photo/<n> index parsed from the URL; fall back to
            # the first photo when that index is out of range.
            photo_index = self.index if 0 <= self.index < len(photos) else 0
            url = photos[photo_index].url
        elif media.animated:
            url = media.animated[0].videoUrl
        else:
            return p

        # Download with the cookies of the first account in the pool.
        accounts = await client.pool.get_all()
        p.buffer = await self._download(url=url, cookies=accounts[0].cookies)
        return p
    except Exception as e:
        logging.error('Failed fetching from twitter, retrying: %s', e)
        if retry_count == 0:
            try:
                await TwitterClientSingleton.relogin()
            except Exception as relogin_error:
                # A failed re-login must not bypass the anonymous fallback.
                logging.error('Twitter relogin failed: %s', relogin_error)
                return await self._get_post_no_login()
            return await self._get_post_login(client=client, retry_count=retry_count + 1)
        if retry_count == 1:
            return await self._get_post_no_login()
        raise Exception('Failed fetching from twitter') from e

async def _get_post_no_login(self) -> post.Post:
tweet = json.loads(
await self._fetch_content(url=scrape_url, data='', headers=headers, params={'id': self.id, 'lang': 'en'})
)
Expand Down

0 comments on commit d8380eb

Please sign in to comment.