-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 67e2cc6
Showing
12 changed files
with
666 additions
and
0 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{'DeepL_key': ''} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{'github_pvt': ''} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
#!/usr/bin/env python3 | ||
|
||
|
||
from src import printcolors as pc | ||
from src.vk_scraper import VkScraper | ||
from src.vk_downloader import VkDownloader | ||
import argparse | ||
import sys | ||
import signal | ||
from src.git_credentials import GitCredentialsHandler | ||
|
||
# Prefer GNU readline; fall back to pyreadline3 (the Windows replacement) only
# when gnureadline cannot be imported.  The flag must start as False: it was
# previously initialised to True, which made the gnureadline branch further
# down unreachable and left ``PyRdl`` undefined whenever gnureadline imported
# successfully.
is_windows = False

try:
    import gnureadline
except ImportError:
    is_windows = True
    from pyreadline3.rlmain import BaseReadline as PyRdl
|
||
|
||
def welcome() -> None:
    """Print the start-up banner, credits and a one-line usage hint.

    The ASCII-art lines are written in green, the credits in yellow/cyan and
    the separators in the default colour.
    """
    # Backslashes are written as ``\\`` so the literals contain no invalid
    # escape sequences (``\ ``, ``\/``, ``\_``); the printed text is unchanged.
    # NOTE(review): the banner spacing looks collapsed in this copy of the
    # file - confirm the art against the upstream source.
    banner = (
        " _ _ _ __ _____ \n",
        "| | | | | / / / ___| \n",
        "| | | | |/ / \\ `--. ___ _ __ __ _ _ __ ___ _ __ \n",
        "| | | | \\ `--. \\/ __| '__/ _` | '_ \\ / _ \\ '__|\n",
        "\\ \\_/ / |\\ \\ /\\__/ / (__| | | (_| | |_) | __/ | \n",
        " \\___/\\_| \\_/ \\____/ \\___|_| \\__,_| .__/ \\___|_| \n",
        " | | \n",
        " |_| \n",
    )

    pc.printout("-" * 80 + "\n")
    for line in banner:
        pc.printout(line, pc.GREEN)
    print("\n")
    pc.printout("Code structure based on OSINTagram\n", pc.YELLOW)
    pc.printout("Alpha Build - Developed by Alberto Federico Olivieri\n\n", pc.CYAN)
    pc.printout("This program will create a csv with the date, text, likes, and url for one or\n", pc.CYAN)
    pc.printout("more targets, with the possibility to specify dates for filtering the output\n", pc.CYAN)
    pc.printout("Type 'list' to show all allowed commands\n")
    pc.printout("-" * 80)
|
||
|
||
def cmdlist() -> None:
    """Print every available command followed by a short description.

    Each command name is written in the default colour and its description in
    yellow, one command per line, in alphabetical order.
    """
    # The former "credentials" entry ("Provide API Key") is intentionally
    # absent: the API backend was discarded, so no credentials are needed.
    entries = (
        ("dates\t\t", "Insert date or date range (dd/mm/yyyy format)\n"),
        ("download\t", "Downloads media from link if they exist\n"),
        ("dshow\t\t", "Show stored dates\n"),
        ("gitkey\t\t", "Change GitHub Private Access Key\n"),
        ("run\t\t", "When everything is set, the scraper will start running with this command\n"),
        ("targets\t\t", "Insert whitespace separated list of target(s), will overwrite old ones\n"),
        ("translate\t", "After downloading targets walls you can translate it if in possession of DeepL API key\n"),
        ("tshow\t\t", "Show comma separated list of target(s)\n"),
        ("update\t\t", "Update translation credentials\n"),
    )
    for label, description in entries:
        pc.printout(label)
        pc.printout(description, colour=pc.YELLOW)
|
||
|
||
def signal_handler(sig: object, frame: object) -> None:
    """Signal callback: say goodbye in red and exit with status 0.

    Both arguments are supplied by the ``signal`` machinery and are ignored.
    """
    farewell = "\nGoodbye!\n"
    pc.printout(farewell, pc.RED)
    sys.exit(0)
|
||
|
||
def completer(text: str, state: int) -> "str | None":
    """Readline-style completion callback over the known command names.

    Args:
        text: The prefix typed so far.
        state: Index of the candidate requested by the readline library.

    Returns:
        The ``state``-th key of the module-level ``commands`` mapping that
        starts with ``text``, or ``None`` once the candidates are exhausted.
    """
    # The return annotation used to be ``str or None``, which evaluates to
    # plain ``str`` and hid the fact that ``None`` is a valid result.
    matches = [name for name in commands if name.startswith(text)]
    return matches[state] if state < len(matches) else None
|
||
|
||
def _quit() -> None:
    """Command handler for ``quit``/``exit``: print a red goodbye and exit 0."""
    farewell = "Goodbye!\n"
    pc.printout(farewell, pc.RED)
    sys.exit(0)
|
||
|
||
# ---------------------------------------------------------------------------
# Script body: show the banner, parse the CLI, build the command table, wire
# up tab completion and run the interactive prompt until the user quits.
# ---------------------------------------------------------------------------
welcome()

parser = argparse.ArgumentParser(description="Description")
parser.add_argument('-t', '--targets', type=str, nargs='+',
                    help='target identificator, single or whitespace separated list')

args = parser.parse_args()

# One handler object per feature area; their bound methods are the commands.
api_1 = VkScraper(args.targets)
api_2 = VkDownloader()
api_3 = GitCredentialsHandler()

# Maps what the user types to a zero-argument callable.  ``completer`` reads
# this mapping by name, so it has to stay a module-level global.
commands = {
    'list': cmdlist,
    'help': cmdlist,
    'quit': _quit,
    'exit': _quit,
    # 'credentials': api.store_credentials,  API discarded, credentials not needed
    'dates': api_1.set_dates,
    'download': api_2.download_media,
    'dshow': api_1.show_dates,
    'gitkey': api_3.set_github_key,
    'run': api_1.retrieve_targets_posts,
    'targets': api_1.set_targets,
    'translate': api_1.translating_target_csv,
    'tshow': api_1.show_targets,
    'update': api_1.update_credentials
}

# Ctrl+C anywhere in the session exits cleanly instead of raising.
signal.signal(signal.SIGINT, signal_handler)
if is_windows:
    # NOTE(review): two separate PyRdl() instances are created here, as in the
    # original code - confirm that pyreadline3 shares bindings between them.
    PyRdl().parse_and_bind("tab: complete")
    PyRdl().set_completer(completer)
else:
    gnureadline.parse_and_bind("tab: complete")
    gnureadline.set_completer(completer)

while True:
    pc.printout("Run a command: ", pc.YELLOW)
    try:
        # Surrounding whitespace is not part of any command name.
        cmd = input().strip()
    except EOFError:
        # stdin was closed (Ctrl+D / Ctrl+Z or an exhausted pipe): leave the
        # same way the ``quit`` command does instead of dying with a traceback.
        print("")
        _quit()

    _cmd = commands.get(cmd)

    if _cmd:
        _cmd()
    elif cmd == "":
        print("")
    else:
        pc.printout("Unknown command\n", pc.RED)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
do not delete this folder |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
## VK SCRAPER | ||
To run this application you will need a GitHub Private Access Key, the app will ask to insert one on startup if not present. | ||
This application will take the content published by a VKontakte page, and it will output a table with the **date**, the **text**, the **number of likes**, and the **original link** to the post. | ||
It is also possible to input a start and an end date to filter out the results. The output will be saved as a **CSV** file in the output folder. | ||
|
||
The commands available are: | ||
|
||
| Command | Description | | ||
|:---------:|:----------------------------------------------------------------------------------------------------------:| | ||
| dates | Insert date or date range (dd/mm/yyyy format) | | ||
| download | From a comma-separated list of post links it will download the thumbnails | | ||
| dshow | Show stored dates | | ||
| gitkey | Change the saved GitHub Private Access Key | | ||
| run | When everything is set, the scraper will start running with this command | | ||
| targets | Insert whitespace separated list of target(s), will overwrite old ones | | ||
| translate | This command will start the translation of the targets you have set in English. NB, requires DeepL API key | | ||
| tshow | Show comma separated list of target | | ||
| update | This command is used to update the API key credentials | | ||
|
||
A small code example to start the application, the `<target name>` is optional, it can be set within the application using `targets`: | ||
|
||
`python3 main.py -t <target name>` | ||
|
||
All of the data you will scrape or download will be found in the `outputs` folder. | ||
|
||
The target name is retrieved from the target `url`: `https://vk.com/<target name>` | ||
|
||
NB Timezone is set for `"Europe/Moscow"`, I'm still figuring out how TZ work on VK | ||
|
||
For translation, you need a DeepL API key. If you have one, just run `translate`; if it is not already saved, it will ask you for an API key. | ||
|
||
**For any new feature, bug, help, etc. Just contact me @ albertofedericoolivieri@gmail.com or open a ticket.** | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
async-generator==1.10 | ||
attrs==21.4.0 | ||
beautifulsoup4==4.11.1 | ||
bs4==0.0.1 | ||
certifi==2022.5.18.1 | ||
cffi==1.15.0 | ||
charset-normalizer==2.0.12 | ||
cryptography==37.0.2 | ||
deepl==1.8.0 | ||
gnureadline==8.0.0 | ||
h11==0.13.0 | ||
idna==3.3 | ||
numpy==1.22.4 | ||
outcome==1.1.0 | ||
pandas==1.4.2 | ||
pycparser==2.21 | ||
pyOpenSSL==22.0.0 | ||
pyreadline3==3.4.1 | ||
PySocks==1.7.1 | ||
python-dateutil==2.8.2 | ||
python-dotenv==0.20.0 | ||
pytz==2022.1 | ||
requests==2.27.1 | ||
selenium==4.2.0 | ||
six==1.16.0 | ||
sniffio==1.2.0 | ||
sortedcontainers==2.4.0 | ||
soupsieve==2.3.2.post1 | ||
tqdm==4.64.0 | ||
trio==0.21.0 | ||
trio-websocket==0.9.2 | ||
urllib3==1.26.9 | ||
webdriver-manager==3.7.0 | ||
wsproto==1.1.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
from ast import literal_eval | ||
from src import printcolors as pc | ||
|
||
|
||
class GitCredentialsHandler:
    """Load, prompt for and persist the GitHub Personal Access Token (PAT).

    The token is stored in a text file containing the ``repr`` of a dict with
    a single ``github_pvt`` key, e.g. ``{'github_pvt': ''}``.
    """

    # Location of the credentials file, relative to the working directory.
    CREDENTIALS_PATH = "./credentials/github_private_key.txt"

    def __init__(self):
        """Read the stored token and prompt for one when none is saved."""
        self.check, self.PAT_dict = self.key_checker()
        if not self.check:
            pc.printout("You do not have a GitHub Personal Access Token saved, this application will not work\n", pc.RED)
            pc.printout("To obtain one Login into your GitHub, go to settings -> developer settings ->\n"
                        "personal access tokens -> generate new token\n", pc.RED)
            pc.printout("Please, provide a PAT or quit with Ctrl+C\n", pc.RED)
            self.set_github_key()

    @staticmethod
    def key_checker() -> tuple:
        """Read the credentials file.

        Returns:
            ``(has_key, credentials)`` where ``has_key`` is True only when a
            non-empty ``github_pvt`` value is stored and ``credentials`` is
            the dict read from disk.  A missing, empty or malformed file is
            reported as ``(False, {'github_pvt': ''})`` rather than raising,
            so the caller can simply ask the user for a token.
        """
        try:
            with open(GitCredentialsHandler.CREDENTIALS_PATH, "r") as handle:
                git_key_dict = literal_eval(handle.read())
        except (FileNotFoundError, ValueError, SyntaxError):
            # No file yet, or its content is not a valid Python literal.
            return False, {"github_pvt": ""}
        if not isinstance(git_key_dict, dict):
            return False, {"github_pvt": ""}
        git_key_dict.setdefault("github_pvt", "")
        return bool(git_key_dict["github_pvt"]), git_key_dict

    def set_github_key(self):
        """Ask the user for a PAT and write it to the credentials file."""
        # Local import: this module only imports ``ast`` and ``printcolors``
        # at the top of the file.
        import os

        pc.printout("Please insert your PAT: \n")
        # Pasted tokens often carry stray whitespace or a trailing newline.
        self.PAT = input().strip()
        self.PAT_dict['github_pvt'] = self.PAT
        self.check = bool(self.PAT)
        os.makedirs(os.path.dirname(self.CREDENTIALS_PATH), exist_ok=True)
        with open(self.CREDENTIALS_PATH, "w") as handle:
            handle.write(str(self.PAT_dict))

    def retrieve_key(self):
        """Return the stored PAT as a string."""
        return str(self.PAT_dict['github_pvt'])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
import sys | ||
|
||
# Colour indices; ``printout`` adds 30 to build the escape code it emits.
BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = range(8)


def _stream_has_colours(stream: object) -> bool:
    """Return True when *stream* is a terminal reporting more than 2 colours.

    Any failure to query the terminal (no ``curses`` module, terminfo lookup
    error, ...) is treated as "no colour support".
    """
    isatty = getattr(stream, "isatty", None)
    # ``isatty`` has to be *called*: the bound method itself is always truthy,
    # which used to enable colours even when output was redirected to a file.
    if isatty is None or not isatty():
        return False
    try:
        import curses
        curses.setupterm()
        return curses.tigetnum("colors") > 2
    except Exception:
        # ``curses.error`` cannot be named in the except clause because the
        # import itself may be what failed.
        return False


# Evaluated once at import time; ``printout`` consults this flag on each call.
has_colours = _stream_has_colours(sys.stdout)


def printout(text: str, colour: int = WHITE) -> None:
    """Write *text* to stdout, in bold *colour* when the terminal supports it.

    Args:
        text: The text to write; no newline is appended.
        colour: One of the colour constants defined above (default ``WHITE``).
    """
    if has_colours:
        seq = "\x1b[1;%dm" % (30 + colour) + text + "\x1b[0m"
        sys.stdout.write(seq)
    else:
        sys.stdout.write(text)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
from selenium import webdriver | ||
# from selenium.webdriver.chrome.service import Service | ||
# from selenium.webdriver.chrome.options import Options | ||
# from webdriver_manager.chrome import ChromeDriverManager | ||
from selenium.webdriver.firefox.service import Service as FirefoxService | ||
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile | ||
from webdriver_manager.firefox import GeckoDriverManager | ||
from bs4 import BeautifulSoup | ||
import requests | ||
from os.path import basename | ||
import os | ||
import pandas as pd | ||
from src import printcolors as pc | ||
from tqdm import tqdm | ||
from src.git_credentials import GitCredentialsHandler | ||
|
||
|
||
class VkDownloader:
    """Download the thumbnail of every VK post listed in a text file.

    The post pages are rendered with a headless Firefox driven by Selenium,
    the thumbnail URL is extracted from the HTML, and the images are saved
    under ``./outputs/<list name>/`` together with a CSV that maps each post
    URL to its thumbnail URL and saved image name.
    """

    def __init__(self) -> None:
        # Column-oriented mapping written to ``<name>_conversion.csv``; the
        # three lists must always have the same length.
        self.conversion_dict = {"original_url": [], "thumb_url": [], "img_name": []}

    @staticmethod
    def load_links(full_path_to_list: str) -> tuple:
        """Read a comma separated list of URLs from a text file.

        Args:
            full_path_to_list: Path of the text file holding the post URLs.

        Returns:
            ``(url_list, name)`` where ``name`` is the file's base name up to
            its first dot.  Entries are stripped and empty ones dropped, so
            ``a,b``, ``a, b`` and a trailing comma or newline all work.
        """
        with open(full_path_to_list, "r", encoding='utf-8-sig') as handle:
            text = handle.read()
        url_list = [url.strip() for url in text.split(",") if url.strip()]
        return url_list, basename(full_path_to_list).split(".")[0]

    def thumbnail_url(self, url_list: list) -> list:
        """Open every post in headless Firefox and collect its thumbnail URL.

        Each post URL and thumbnail URL found is also appended to
        ``self.conversion_dict``.  Posts without any recognisable image are
        reported and skipped.  If a page has no ``wall_text`` block at all,
        the response is printed and the program exits, as before.

        Args:
            url_list: Post URLs to visit.

        Returns:
            The thumbnail URLs, in visiting order.
        """
        thumbnail_url_list = []
        # The saved GitHub token is exported as GH_TOKEN before the
        # geckodriver manager runs.
        os.environ["GH_TOKEN"] = GitCredentialsHandler().retrieve_key()
        options = webdriver.FirefoxOptions()
        options.add_argument("--headless")
        options.add_argument("--incognito")
        service = FirefoxService(executable_path=GeckoDriverManager().install())
        # Disable every cache and refuse cookies for this browser profile.
        profile = FirefoxProfile()
        profile.set_preference('browser.cache.disk.enable', False)
        profile.set_preference('browser.cache.memory.enable', False)
        profile.set_preference('browser.cache.offline.enable', False)
        profile.set_preference('network.cookie.cookieBehavior', 2)
        driver = webdriver.Firefox(service=service, options=options, firefox_profile=profile)
        try:
            pc.printout("-" * 80 + "\n", pc.BLUE)
            pc.printout("Saving urls...\n", pc.BLUE)
            for url in tqdm(url_list, bar_format='{l_bar}{bar:40}{r_bar}{bar:-10b}'):
                driver.get(url)
                soup = BeautifulSoup(driver.page_source, "html.parser")
                post_head = soup.find("div", {"class": "wall_text"})
                if not post_head:
                    pc.printout("Something went wrong! Probably too many requests, try deleting the cache while using a VPN or take a look at the response: \n", pc.RED)
                    print(soup)
                    # Same effect as the former ``quit(0)`` without relying on
                    # the ``site`` module; the ``finally`` below still runs.
                    raise SystemExit(0)
                thumbnail = post_head.find("a", {"class": "page_post_thumb_wrap"})
                if thumbnail:
                    # The URL is taken from between the parentheses of the
                    # element's inline ``style`` attribute.
                    found_url = thumbnail["style"].split("(")[-1].split(")")[0]
                else:
                    media_link = post_head.find("img", {"class": "media_link__photo"})
                    if media_link is None:
                        pc.printout("No thumbnail found for {}, skipping\n".format(url), pc.RED)
                        continue
                    found_url = media_link["src"]
                thumbnail_url_list.append(found_url)
                self.conversion_dict["original_url"].append(url)
                self.conversion_dict["thumb_url"].append(found_url)
        finally:
            # ``quit`` (not ``close``) ends the whole WebDriver session, and
            # doing it in ``finally`` avoids leaking a browser on errors.
            driver.quit()
        return thumbnail_url_list

    def save_images(self, thumbnail_url_list: list, name: str) -> None:
        """Download the thumbnails and write the conversion CSV.

        Images are saved as ``./outputs/<name>/<name>_<index>.jpg``.  A failed
        download is reported and recorded with an empty image name so the CSV
        rows stay aligned with the URLs collected earlier.

        Args:
            thumbnail_url_list: Thumbnail URLs to download.
            name: Name of the output sub-folder and file prefix.
        """
        target_dir = "./outputs/{}".format(name)
        os.makedirs(target_dir, exist_ok=True)
        pc.printout("Saving images...\n", pc.BLUE)
        progress = tqdm(thumbnail_url_list, bar_format='{l_bar}{bar:40}{r_bar}{bar:-10b}')
        for counter, thumbnail_url in enumerate(progress):
            filename = "{}_{}".format(name, str(counter))
            try:
                # A timeout prevents one stalled server from hanging the run.
                response = requests.get(thumbnail_url, timeout=30)
                response.raise_for_status()
            except requests.RequestException as err:
                pc.printout("Could not download {}: {}\n".format(thumbnail_url, err), pc.RED)
                self.conversion_dict["img_name"].append("")
                continue
            with open("{}/{}.jpg".format(target_dir, filename), "wb") as handle:
                handle.write(response.content)
            self.conversion_dict["img_name"].append(filename)
        pc.printout("All images saved!\n", pc.BLUE)
        pc.printout("-" * 80 + "\n", pc.BLUE)
        conversion_df = pd.DataFrame.from_dict(self.conversion_dict)
        conversion_df.to_csv("{}/{}_conversion.csv".format(target_dir, name), index=False)

    def download_media(self) -> None:
        """Interactive entry point: ask for a URL list file and download it."""
        pc.printout("Please, insert full path of txt file containing all the post urls\n", pc.YELLOW)
        pc.printout("Remember, they should be comma separated for this to work: \n", pc.YELLOW)
        full_path = input().strip()
        try:
            url_list, name = self.load_links(full_path)
        except OSError as err:
            # A mistyped path should not take the whole prompt down.
            pc.printout("Could not read {}: {}\n".format(full_path, err), pc.RED)
            return
        if not url_list:
            pc.printout("No urls found in {}\n".format(full_path), pc.RED)
            return
        # Start from an empty table so a second download in the same session
        # does not repeat the rows of the previous one in its CSV.
        self.conversion_dict = {"original_url": [], "thumb_url": [], "img_name": []}
        thumb_list = self.thumbnail_url(url_list)
        self.save_images(thumb_list, name)
Oops, something went wrong.