Skip to content

Commit

Permalink
Troubleshooting
Browse files Browse the repository at this point in the history
  • Loading branch information
afolivieri committed Jan 11, 2024
0 parents commit 67e2cc6
Show file tree
Hide file tree
Showing 12 changed files with 666 additions and 0 deletions.
Binary file added .gitignore
Binary file not shown.
1 change: 1 addition & 0 deletions credentials/DeepL_key.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{'DeepL_key': ''}
1 change: 1 addition & 0 deletions credentials/github_private_key.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{'github_pvt': ''}
130 changes: 130 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
#!/usr/bin/env python3


from src import printcolors as pc
from src.vk_scraper import VkScraper
from src.vk_downloader import VkDownloader
import argparse
import sys
import signal
from src.git_credentials import GitCredentialsHandler

# Assume the GNU readline backend is available; the flag is flipped to True
# only when importing gnureadline fails and the pyreadline3 fallback is used.
is_windows = False

try:
    import gnureadline
except ImportError:
    is_windows = True
    from pyreadline3.rlmain import BaseReadline as PyRdl


def welcome() -> None:
    """Print the start-up banner: ASCII-art logo, credits and a usage hint.

    Backslashes in the logo are written as ``\\\\`` so the literals contain no
    invalid escape sequences; the text printed is unchanged.
    """
    pc.printout("-" * 80 + "\n")
    pc.printout(" _ _ _ __ _____ \n", pc.GREEN)
    pc.printout("| | | | | / / / ___| \n", pc.GREEN)
    pc.printout("| | | | |/ / \\ `--. ___ _ __ __ _ _ __ ___ _ __ \n", pc.GREEN)
    pc.printout("| | | | \\ `--. \\/ __| '__/ _` | '_ \\ / _ \\ '__|\n", pc.GREEN)
    pc.printout("\\ \\_/ / |\\ \\ /\\__/ / (__| | | (_| | |_) | __/ | \n", pc.GREEN)
    pc.printout(" \\___/\\_| \\_/ \\____/ \\___|_| \\__,_| .__/ \\___|_| \n", pc.GREEN)
    pc.printout(" | | \n", pc.GREEN)
    pc.printout(" |_| \n", pc.GREEN)
    print("\n")
    pc.printout("Code structure based on OSINTagram\n", pc.YELLOW)
    pc.printout("Alpha Build - Developed by Alberto Federico Olivieri\n\n", pc.CYAN)
    pc.printout("This program will create a csv with the date, text, likes, and url for one or\n", pc.CYAN)
    pc.printout("more targets, with the possibility to specify dates for filtering the output\n", pc.CYAN)
    pc.printout("Type 'list' to show all allowed commands\n")
    pc.printout("-" * 80)


def cmdlist() -> None:
    """Print every interactive command followed by its one-line description.

    The command label is printed in the default colour, the description in
    yellow.
    """
    # API discarded, credentials not needed:
    # ("credentials\t", "Provide API Key\n")
    entries = (
        ("dates\t\t", "Insert date or date range (dd/mm/yyyy format)\n"),
        ("download\t", "Downloads media from link if they exist\n"),
        ("dshow\t\t", "Show stored dates\n"),
        ("gitkey\t\t", "Change GitHub Private Access Key\n"),
        ("run\t\t", "When everything is set, the scraper will start running with this command\n"),
        ("targets\t\t", "Insert whitespace separated list of target(s), will overwrite old ones\n"),
        ("translate\t", "After downloading targets walls you can translate it if in possession of DeepL API key\n"),
        ("tshow\t\t", "Show comma separated list of target(s)\n"),
        ("update\t\t", "Update translation credentials\n"),
    )
    for label, description in entries:
        pc.printout(label)
        pc.printout(description, colour=pc.YELLOW)


def signal_handler(sig: object, frame: object) -> None:
    """Signal callback: print a red farewell and exit with status 0.

    Both arguments are supplied by the ``signal`` machinery and are unused.
    """
    farewell = "\nGoodbye!\n"
    pc.printout(farewell, pc.RED)
    raise SystemExit(0)


def completer(text: str, state: int) -> "str | None":
    """Readline completion hook for the interactive prompt.

    :param text: the prefix typed so far.
    :param state: zero-based index of the candidate being requested.
    :return: the ``state``-th key of the module-level ``commands`` table that
        starts with ``text``, or ``None`` once the candidates are exhausted.
    """
    # The previous annotation ``str or None`` evaluated to plain ``str``.
    options = [name for name in commands if name.startswith(text)]
    if state < len(options):
        return options[state]
    return None


def _quit() -> None:
    """Handle the ``quit``/``exit`` commands: say goodbye in red, exit with 0."""
    farewell = "Goodbye!\n"
    pc.printout(farewell, pc.RED)
    raise SystemExit(0)


welcome()

parser = argparse.ArgumentParser(description="Description")
parser.add_argument('-t', '--targets', type=str, nargs='+',
                    help='target identificator, single or whitespace separated list')

args = parser.parse_args()

api_1 = VkScraper(args.targets)
api_2 = VkDownloader()
api_3 = GitCredentialsHandler()

# Dispatch table: command typed at the prompt -> zero-argument callable.
# Its keys are also the candidates offered by completer().
commands = {
    'list': cmdlist,
    'help': cmdlist,
    'quit': _quit,
    'exit': _quit,
    # 'credentials': api.store_credentials,  API discarded, credentials not needed
    'dates': api_1.set_dates,
    'download': api_2.download_media,
    'dshow': api_1.show_dates,
    'gitkey': api_3.set_github_key,
    'run': api_1.retrieve_targets_posts,
    'targets': api_1.set_targets,
    'translate': api_1.translating_target_csv,
    'tshow': api_1.show_targets,
    'update': api_1.update_credentials
}

signal.signal(signal.SIGINT, signal_handler)
if is_windows:
    # Configure ONE pyreadline3 object: calling PyRdl() twice put the key
    # binding and the completer on two unrelated throwaway instances.
    _pyrdl = PyRdl()
    _pyrdl.parse_and_bind("tab: complete")
    _pyrdl.set_completer(completer)
else:
    gnureadline.parse_and_bind("tab: complete")
    gnureadline.set_completer(completer)

while True:
    pc.printout("Run a command: ", pc.YELLOW)
    try:
        # Surrounding whitespace is not significant ("list " == "list").
        cmd = input().strip()
    except EOFError:
        # stdin was closed (Ctrl+D / Ctrl+Z, or piped input ran out):
        # leave cleanly instead of dying with a traceback.
        print("")
        _quit()

    _cmd = commands.get(cmd)

    if _cmd:
        _cmd()
    elif cmd == "":
        print("")
    else:
        pc.printout("Unknown command\n", pc.RED)
1 change: 1 addition & 0 deletions outputs/readme.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
do not delete this folder
33 changes: 33 additions & 0 deletions readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
## VK SCRAPER
To run this application you will need a GitHub Personal Access Token (PAT); the app will ask you to insert one on startup if none is saved.
This application will take the content published by a VKontakte page, and it will output a table with the **date**, the **text**, the **number of likes**, and the **original link** to the post.
It is also possible to input a start and an end date to filter out the results. The output will be saved as a **CSV** file in the output folder.

The commands available are:

| Command | Description |
|:---------:|:----------------------------------------------------------------------------------------------------------:|
| dates | Insert date or date range (dd/mm/yyyy format) |
| download | From a comma-separated list of post links it will download the thumbnails |
| dshow | Show stored dates |
| gitkey | Change the saved GitHub Private Access Key |
| run | When everything is set, the scraper will start running with this command |
| targets | Insert whitespace separated list of target(s), will overwrite old ones |
| translate | This command will start the translation of the targets you have set in English. NB, requires DeepL API key |
| tshow | Show comma separated list of target |
| update | This command is used to update the API key credentials |

A small code example to start the application, the `<target name>` is optional, it can be set within the application using `targets`:

`python3 main.py -t <target name>`

All of the data you will scrape or download will be found in the `outputs` folder.

The target name is retrieved from the target `url`: `https://vk.com/<target name>`

NB: the timezone is set to `"Europe/Moscow"`; I'm still figuring out how time zones work on VK.

For translation, you need a DeepL API key. If you have one, just run `translate`; if a key is not already saved, it will ask you for one.

**For any new feature, bug, help, etc. Just contact me @ albertofedericoolivieri@gmail.com or open a ticket.**

34 changes: 34 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
async-generator==1.10
attrs==21.4.0
beautifulsoup4==4.11.1
bs4==0.0.1
certifi==2022.5.18.1
cffi==1.15.0
charset-normalizer==2.0.12
cryptography==37.0.2
deepl==1.8.0
gnureadline==8.0.0
h11==0.13.0
idna==3.3
numpy==1.22.4
outcome==1.1.0
pandas==1.4.2
pycparser==2.21
pyOpenSSL==22.0.0
pyreadline3==3.4.1
PySocks==1.7.1
python-dateutil==2.8.2
python-dotenv==0.20.0
pytz==2022.1
requests==2.27.1
selenium==4.2.0
six==1.16.0
sniffio==1.2.0
sortedcontainers==2.4.0
soupsieve==2.3.2.post1
tqdm==4.64.0
trio==0.21.0
trio-websocket==0.9.2
urllib3==1.26.9
webdriver-manager==3.7.0
wsproto==1.1.0
33 changes: 33 additions & 0 deletions src/git_credentials.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from ast import literal_eval
from src import printcolors as pc


class GitCredentialsHandler:
    """Load, prompt for and persist the GitHub Personal Access Token (PAT).

    The token is kept in ``./credentials/github_private_key.txt`` as the
    ``str()`` of a dict shaped like ``{'github_pvt': '<token>'}``.
    """

    # Credentials file, relative to the current working directory.
    _KEY_PATH = "./credentials/github_private_key.txt"

    def __init__(self):
        """Read the stored token; if none is saved, explain and prompt for one."""
        self.check, self.PAT_dict = self.key_checker()
        if not self.check:
            pc.printout("You do not have a GitHub Personal Access Token saved, this application will not work\n", pc.RED)
            pc.printout("To obtain one Login into your GitHub, go to settings -> developer settings ->\n"
                        "personal access tokens -> generate new token\n", pc.RED)
            pc.printout("Please, provide a PAT or quit with Ctrl+C\n", pc.RED)
            self.set_github_key()

    @staticmethod
    def key_checker() -> tuple:
        """Return ``(has_key, key_dict)`` as read from the credentials file.

        ``has_key`` is True only when the file holds a dict with a non-empty
        ``'github_pvt'`` entry. A missing, empty or malformed file is reported
        as "no key" with a fresh ``{'github_pvt': ''}`` dict instead of
        raising, so the caller can go on to prompt for a token.
        """
        try:
            with open(GitCredentialsHandler._KEY_PATH, "r") as handle:
                git_key_dict = literal_eval(handle.read())
        except (FileNotFoundError, ValueError, SyntaxError):
            # literal_eval raises SyntaxError/ValueError on empty or bad content.
            return False, {"github_pvt": ""}
        if not isinstance(git_key_dict, dict):
            return False, {"github_pvt": ""}
        if git_key_dict.get("github_pvt"):
            return True, git_key_dict
        git_key_dict.setdefault("github_pvt", "")
        return False, git_key_dict

    def set_github_key(self):
        """Prompt for a PAT on stdin, store it on the instance and save it."""
        import os

        pc.printout("Please insert your PAT: \n")
        # Strip so a pasted token does not carry stray whitespace.
        self.PAT = input().strip()
        self.PAT_dict['github_pvt'] = self.PAT
        # key_checker() tolerates a missing file, so make sure the folder
        # exists before writing the new one.
        os.makedirs(os.path.dirname(self._KEY_PATH), exist_ok=True)
        with open(self._KEY_PATH, "w") as handle:
            handle.write(str(self.PAT_dict))

    def retrieve_key(self):
        """Return the currently loaded token as a string."""
        return str(self.PAT_dict['github_pvt'])
25 changes: 25 additions & 0 deletions src/printcolors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import sys

# Colour indices 0-7; printout() adds 30 to the index to build the escape code.
BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = range(8)


def has_colours(stream: object) -> bool:
    """Report whether ``stream`` is a terminal that supports colours.

    :param stream: any object; it counts as a terminal only when it has a
        callable ``isatty`` that returns a true value.
    :return: True when the stream is a TTY and curses reports more than two
        colours; False otherwise, including when curses is unavailable or
        the terminal cannot be initialised.
    """
    # isatty must be CALLED: the bound method object itself is always truthy,
    # which made redirected output look like a terminal.
    isatty = getattr(stream, "isatty", None)
    if not (callable(isatty) and isatty()):
        return False
    try:
        import curses
        curses.setupterm()
        return curses.tigetnum("colors") > 2
    except Exception:
        # Best-effort probe: a missing curses module or an unknown terminal
        # simply means "no colours".
        return False


# Evaluated once, at import time, for sys.stdout. NOTE: this rebinds the name
# `has_colours` from the function to its boolean result.
has_colours = has_colours(sys.stdout)


def printout(text: str, colour: int = WHITE) -> None:
    """Write ``text`` to stdout, coloured when the terminal supports it.

    :param text: text to write; no newline is appended.
    :param colour: one of the module's colour indices (default ``WHITE``).
    """
    if not has_colours:
        sys.stdout.write(text)
        return
    prefix = "\x1b[1;%dm" % (30 + colour)
    sys.stdout.write(prefix + text + "\x1b[0m")
98 changes: 98 additions & 0 deletions src/vk_downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
from selenium import webdriver
# from selenium.webdriver.chrome.service import Service
# from selenium.webdriver.chrome.options import Options
# from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from webdriver_manager.firefox import GeckoDriverManager
from bs4 import BeautifulSoup
import requests
from os.path import basename
import os
import pandas as pd
from src import printcolors as pc
from tqdm import tqdm
from src.git_credentials import GitCredentialsHandler


class VkDownloader:
    """Fetch the thumbnail of each VK post in a list and save it to disk.

    ``conversion_dict`` records, per run, the post url, the thumbnail url
    found for it and the name of the image file it was saved under; it is
    written next to the images as ``<name>_conversion.csv``.
    """

    def __init__(self) -> None:
        self.conversion_dict = {"original_url": [], "thumb_url": [], "img_name": []}

    @staticmethod
    def load_links(full_path_to_list: str) -> tuple:
        """Read a comma-separated list of urls from a text file.

        :param full_path_to_list: path of the text file (UTF-8, BOM tolerated).
        :return: ``(url_list, name)`` where ``name`` is the file's base name
            up to its first dot.
        """
        import re

        with open(full_path_to_list, "r", encoding='utf-8-sig') as handle:
            text = handle.read()
        # Split on a comma followed by whitespace (the documented format) or
        # by the next "http(s)://", so "a,b" and trailing commas/newlines are
        # accepted while a comma inside a url is left alone.
        pieces = re.split(r",\s+|,(?=https?://)", text)
        url_list = [piece.strip() for piece in pieces if piece.strip()]
        return url_list, basename(full_path_to_list).split(".")[0]

    def thumbnail_url(self, url_list: list) -> list:
        """Open every post url in headless Firefox and collect its thumbnail url.

        Each successful lookup is also appended to ``conversion_dict``. Posts
        with no recognisable media are reported and skipped.

        :param url_list: post urls to visit.
        :return: thumbnail urls, in visiting order.
        :raises SystemExit: when a page has no ``wall_text`` block (the page
            source is printed first).
        """
        thumbnail_url_list = []
        # presumably picked up by webdriver_manager when it downloads
        # geckodriver from GitHub -- TODO confirm
        os.environ["GH_TOKEN"] = GitCredentialsHandler().retrieve_key()
        options = webdriver.FirefoxOptions()
        options.add_argument("--headless")
        options.add_argument("--incognito")
        service = FirefoxService(executable_path=GeckoDriverManager().install())
        profile = FirefoxProfile()
        profile.set_preference('browser.cache.disk.enable', False)
        profile.set_preference('browser.cache.memory.enable', False)
        profile.set_preference('browser.cache.offline.enable', False)
        profile.set_preference('network.cookie.cookieBehavior', 2)
        # NOTE(review): the `firefox_profile` keyword looks deprecated in
        # Selenium 4 -- confirm before upgrading past the pinned version.
        driver = webdriver.Firefox(service=service, options=options, firefox_profile=profile)
        try:
            pc.printout("-" * 80 + "\n", pc.BLUE)
            pc.printout("Saving urls...\n", pc.BLUE)
            for url in tqdm(url_list, bar_format='{l_bar}{bar:40}{r_bar}{bar:-10b}'):
                driver.get(url)
                soup = BeautifulSoup(driver.page_source, "html.parser")
                post_head = soup.find("div", {"class": "wall_text"})
                if not post_head:
                    pc.printout("Something went wrong! Probably too many requests, try deleting the cache while using a VPN or take a look at the response: \n", pc.RED)
                    print(soup)
                    # Same effect as the former quit(0), without relying on
                    # the `site` module having injected that builtin.
                    raise SystemExit(0)
                thumbnail = post_head.find("a", {"class": "page_post_thumb_wrap"})
                media_link = post_head.find("img", {"class": "media_link__photo"})
                if thumbnail:
                    # The url sits inside the inline style's "url(...)".
                    thumb = thumbnail["style"].split("(")[-1].split(")")[0]
                elif media_link:
                    thumb = media_link["src"]
                else:
                    # Neither element present: skip rather than crash on None.
                    pc.printout("No media found for {}, skipping\n".format(url), pc.RED)
                    continue
                thumbnail_url_list.append(thumb)
                self.conversion_dict["original_url"].append(url)
                self.conversion_dict["thumb_url"].append(thumb)
        finally:
            # Always end the browser session, on the error path too.
            driver.quit()
        return thumbnail_url_list

    def save_images(self, thumbnail_url_list: list, name: str) -> None:
        """Download every thumbnail into ``./outputs/<name>/`` and write the CSV.

        Images are saved as ``<name>_<index>.jpg``; the url/file mapping held
        in ``conversion_dict`` is written to ``<name>_conversion.csv``.

        :param thumbnail_url_list: urls returned by :meth:`thumbnail_url`.
        :param name: output sub-folder and file-name prefix.
        """
        os.makedirs("./outputs/{}".format(name), exist_ok=True)
        pc.printout("Saving images...\n", pc.BLUE)
        progress = tqdm(thumbnail_url_list, bar_format='{l_bar}{bar:40}{r_bar}{bar:-10b}')
        for counter, thumb in enumerate(progress):
            response = requests.get(thumb)
            filename = "{}_{}".format(name, str(counter))
            with open("./outputs/{}/{}.jpg".format(name, filename), "wb") as handle:
                handle.write(response.content)
            self.conversion_dict["img_name"].append(filename)
        pc.printout("All images saved!\n", pc.BLUE)
        pc.printout("-" * 80 + "\n", pc.BLUE)
        conversion_df = pd.DataFrame.from_dict(self.conversion_dict)
        conversion_df.to_csv("./outputs/{}/{}_conversion.csv".format(name, name), index=False)

    def download_media(self) -> None:
        """Interactive entry point: ask for the url file, then fetch and save."""
        pc.printout("Please, insert full path of txt file containing all the post urls\n", pc.YELLOW)
        pc.printout("Remember, they should be comma separated for this to work: \n", pc.YELLOW)
        full_path = input()
        # Start from a clean mapping so a second run in the same session does
        # not carry the previous run's rows into its CSV.
        self.conversion_dict = {"original_url": [], "thumb_url": [], "img_name": []}
        url_list, name = self.load_links(full_path)
        thumb_list = self.thumbnail_url(url_list)
        self.save_images(thumb_list, name)
Loading

0 comments on commit 67e2cc6

Please sign in to comment.