Troubleshooting

afolivieri · Jan 11, 2024 · 67e2cc6 · 67e2cc6
commit 67e2cc6
Show file tree

Hide file tree

Showing 12 changed files with 666 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
diff --git a/credentials/DeepL_key.txt b/credentials/DeepL_key.txt
@@ -0,0 +1 @@
+{'DeepL_key': ''}
diff --git a/credentials/github_private_key.txt b/credentials/github_private_key.txt
@@ -0,0 +1 @@
+{'github_pvt': ''}
diff --git a/main.py b/main.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python3
+
+
+from src import printcolors as pc
+from src.vk_scraper import VkScraper
+from src.vk_downloader import VkDownloader
+import argparse
+import sys
+import signal
+from src.git_credentials import GitCredentialsHandler
+
+# Set to True only when gnureadline cannot be imported and pyreadline3 is
+# used instead; the readline setup further down branches on this flag.
+is_windows = False
+
+try:
+    import gnureadline
+except ImportError:
+    # gnureadline is not installed/available: fall back to pyreadline3.
+    is_windows = True
+    from pyreadline3.rlmain import BaseReadline as PyRdl
+
+
+def welcome() -> None:
+    """Print the start-up banner: ASCII-art logo, credits and a usage hint.
+
+    The logo rows are raw string literals so that their backslashes are
+    printed literally instead of relying on unrecognised escape sequences
+    (a backslash followed by a space, slash or underscore), which newer
+    Python versions warn about. The printed output is unchanged.
+    """
+    logo_rows = (
+        r" _   _ _   __  _____                                ",
+        r"| | | | | / / /  ___|                               ",
+        r"| | | | |/ /  \ `--.  ___ _ __ __ _ _ __   ___ _ __ ",
+        r"| | | |    \   `--. \/ __| '__/ _` | '_ \ / _ \ '__|",
+        r"\ \_/ / |\  \ /\__/ / (__| | | (_| | |_) |  __/ |   ",
+        r" \___/\_| \_/ \____/ \___|_|  \__,_| .__/ \___|_|   ",
+        r"                                   | |              ",
+        r"                                   |_|              ",
+    )
+
+    pc.printout("-" * 80 + "\n")
+    for row in logo_rows:
+        pc.printout(row + "\n", pc.GREEN)
+    print("\n")
+    pc.printout("Code structure based on OSINTagram\n", pc.YELLOW)
+    pc.printout("Alpha Build - Developed by Alberto Federico Olivieri\n\n", pc.CYAN)
+    pc.printout("This program will create a csv with the date, text, likes, and url for one or\n", pc.CYAN)
+    pc.printout("more targets, with the possibility to specify dates for filtering the output\n", pc.CYAN)
+    pc.printout("Type 'list' to show all allowed commands\n")
+    pc.printout("-" * 80)
+
+
+def cmdlist() -> None:
+    """Print every available command followed by its description in yellow."""
+    # The former "credentials" entry is intentionally absent: the API was
+    # discarded, so no API key is needed.
+    help_rows = (
+        ("dates\t\t", "Insert date or date range (dd/mm/yyyy format)\n"),
+        ("download\t", "Downloads media from link if they exist\n"),
+        ("dshow\t\t", "Show stored dates\n"),
+        ("gitkey\t\t", "Change GitHub Private Access Key\n"),
+        ("run\t\t", "When everything is set, the scraper will start running with this command\n"),
+        ("targets\t\t", "Insert whitespace separated list of target(s), will overwrite old ones\n"),
+        ("translate\t", "After downloading targets walls you can translate it if in possession of DeepL API key\n"),
+        ("tshow\t\t", "Show comma separated list of target(s)\n"),
+        ("update\t\t", "Update translation credentials\n"),
+    )
+    for label, description in help_rows:
+        pc.printout(label)
+        pc.printout(description, colour=pc.YELLOW)
+
+
+def signal_handler(sig: object, frame: object) -> None:
+    """Signal callback: print a red goodbye message and exit with status 0.
+
+    Both arguments are supplied by the ``signal`` module and are not used.
+    """
+    farewell = "\nGoodbye!\n"
+    pc.printout(farewell, pc.RED)
+    raise SystemExit(0)
+
+
+def completer(text: str, state: int) -> "str | None":
+    """Readline completion callback over the keys of ``commands``.
+
+    :param text: the prefix typed so far.
+    :param state: index of the candidate requested by readline.
+    :return: the ``state``-th command name starting with ``text``, or
+        ``None`` once the candidates are exhausted.
+    """
+    options = [name for name in commands if name.startswith(text)]
+    return options[state] if state < len(options) else None
+
+
+def _quit() -> None:
+    """Handler for the ``quit``/``exit`` commands: say goodbye and exit with status 0."""
+    farewell = "Goodbye!\n"
+    pc.printout(farewell, pc.RED)
+    raise SystemExit(0)
+
+
+# The banner is printed before argument parsing, so it also precedes the
+# output of ``--help``.
+welcome()
+
+parser = argparse.ArgumentParser(description="Description")
+parser.add_argument('-t', '--targets', type=str, nargs='+',
+                    help='target identificator, single or whitespace separated list')
+
+# args.targets is None when -t/--targets is not given.
+args = parser.parse_args()
+
+# One handler object per feature; their bound methods are registered in the
+# ``commands`` table below.
+api_1 = VkScraper(args.targets)
+api_2 = VkDownloader()
+# Constructing GitCredentialsHandler prompts for a token when none is saved.
+api_3 = GitCredentialsHandler()
+
+# Dispatch table: command word typed at the prompt -> zero-argument callable.
+# Its keys are also the candidates offered by ``completer``.
+commands = {
+    'list': cmdlist,
+    'help': cmdlist,
+    'quit': _quit,
+    'exit': _quit,
+    # 'credentials': api.store_credentials, API discarded, credentials not needed
+    'dates': api_1.set_dates,
+    'download': api_2.download_media,
+    'dshow': api_1.show_dates,
+    'gitkey': api_3.set_github_key,
+    'run': api_1.retrieve_targets_posts,
+    'targets': api_1.set_targets,
+    'translate': api_1.translating_target_csv,
+    'tshow': api_1.show_targets,
+    'update': api_1.update_credentials
+}
+
+# Ctrl+C is routed to signal_handler, which prints a goodbye and exits.
+signal.signal(signal.SIGINT, signal_handler)
+# Bind <Tab> to completion and register ``completer`` on whichever readline
+# backend was imported at the top of the file.
+if is_windows:
+    # NOTE(review): each PyRdl() call creates a separate instance, so the key
+    # binding and the completer are set on different objects -- confirm that
+    # completion actually works on this path.
+    PyRdl().parse_and_bind("tab: complete")
+    PyRdl().set_completer(completer)
+else:
+    gnureadline.parse_and_bind("tab: complete")
+    gnureadline.set_completer(completer)
+
+# Interactive prompt: read one command word per line and dispatch it through
+# the ``commands`` table until the user quits.
+while True:
+    pc.printout("Run a command: ", pc.YELLOW)
+    try:
+        # Strip surrounding whitespace: tab completion may leave a trailing
+        # space, which would otherwise make a valid command "Unknown".
+        cmd = input().strip()
+    except EOFError:
+        # stdin was closed (Ctrl-D or exhausted piped input): exit cleanly
+        # instead of dying with a traceback.
+        _quit()
+
+    if not cmd:
+        print("")
+        continue
+
+    _cmd = commands.get(cmd)
+    if _cmd is None:
+        pc.printout("Unknown command\n", pc.RED)
+    else:
+        _cmd()
diff --git a/outputs/readme.txt b/outputs/readme.txt
@@ -0,0 +1 @@
+do not delete this folder
diff --git a/readme.md b/readme.md
@@ -0,0 +1,33 @@
+## VK SCRAPER
+To run this application you will need a GitHub Private Access Key; the app will ask you to insert one on startup if none is saved.
+This application will take the content published by a VKontakte page, and it will output a table with the **date**, the **text**, the **number of likes**, and the **original link** to the post.
+It is also possible to input a start and an end date to filter out the results. The output will be saved as a **CSV** file in the output folder.
+
+The commands available are:
+
+|  Command  |                                                Description                                                 |
+|:---------:|:----------------------------------------------------------------------------------------------------------:|
+|   dates   |                               Insert date or date range (dd/mm/yyyy format)                                |
+| download  |                  From a comma-separated list of post links it will download the thumbnails                 |
+|   dshow   |                                             Show stored dates                                              |
+|  gitkey   |                                 Change the saved GitHub Private Access Key                                 |
+|    run    |                  When everything is set, the scraper will start running with this command                  |
+|  targets  |                   Insert whitespace separated list of target(s), will overwrite old ones                   |
+| translate | This command will start the translation of the targets you have set in English. NB, requires DeepL API key |
+|   tshow   |                                    Show comma separated list of target                                     |
+|  update   |                           This command is used to update the API key credentials                           |
+
+A small code example to start the application, the `<target name>` is optional, it can be set within the application using `targets`:
+
+`python3 main.py -t <target name>`
+
+All of the data you will scrape or download will be found in the `outputs` folder.
+
+The target name is retrieved from the target `url`: `https://vk.com/<target name>`
+
+NB Timezone is set for `"Europe/Moscow"`, I'm still figuring out how TZ work on VK
+
+For translation you need a DeepL API key. If you have one, just run `translate`; if no key is saved yet, the app will ask you for it.
+
+**For any new feature, bug, help, etc. Just contact me @ albertofedericoolivieri@gmail.com or open a ticket.**
+
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,34 @@
+async-generator==1.10
+attrs==21.4.0
+beautifulsoup4==4.11.1
+bs4==0.0.1
+certifi==2022.5.18.1
+cffi==1.15.0
+charset-normalizer==2.0.12
+cryptography==37.0.2
+deepl==1.8.0
+gnureadline==8.0.0
+h11==0.13.0
+idna==3.3
+numpy==1.22.4
+outcome==1.1.0
+pandas==1.4.2
+pycparser==2.21
+pyOpenSSL==22.0.0
+pyreadline3==3.4.1
+PySocks==1.7.1
+python-dateutil==2.8.2
+python-dotenv==0.20.0
+pytz==2022.1
+requests==2.27.1
+selenium==4.2.0
+six==1.16.0
+sniffio==1.2.0
+sortedcontainers==2.4.0
+soupsieve==2.3.2.post1
+tqdm==4.64.0
+trio==0.21.0
+trio-websocket==0.9.2
+urllib3==1.26.9
+webdriver-manager==3.7.0
+wsproto==1.1.0
diff --git a/src/git_credentials.py b/src/git_credentials.py
@@ -0,0 +1,33 @@
+from ast import literal_eval
+from src import printcolors as pc
+
+
+class GitCredentialsHandler:
+    """Load, prompt for and persist the GitHub token used by the application.
+
+    The token is stored as the text form of a dict (``{'github_pvt': ...}``)
+    in ``./credentials/github_private_key.txt``, relative to the current
+    working directory.
+    """
+
+    def __init__(self):
+        """Read the stored token and prompt for one when none is saved."""
+        self.check, self.PAT_dict = self.key_checker()
+        if not self.check:
+            pc.printout("You do not have a GitHub Personal Access Token saved, this application will not work\n", pc.RED)
+            pc.printout("To obtain one Login into your GitHub, go to settings -> developer settings ->\n"
+                        "personal access tokens -> generate new token\n", pc.RED)
+            pc.printout("Please, provide a PAT or quit with Ctrl+C\n", pc.RED)
+            self.set_github_key()
+
+    @staticmethod
+    def key_checker() -> tuple:
+        """Return ``(has_token, credentials_dict)`` read from the key file.
+
+        A missing, empty or malformed file is treated as "no token saved"
+        rather than crashing at start-up; the returned dict always contains
+        the ``'github_pvt'`` key.
+        """
+        try:
+            with open("./credentials/github_private_key.txt", "r") as handle:
+                git_key_dict = literal_eval(handle.read())
+        except (FileNotFoundError, SyntaxError, ValueError):
+            # literal_eval raises SyntaxError/ValueError on empty or invalid content.
+            git_key_dict = {}
+        if not isinstance(git_key_dict, dict):
+            git_key_dict = {}
+        git_key_dict.setdefault("github_pvt", "")
+        return bool(git_key_dict["github_pvt"]), git_key_dict
+
+    def set_github_key(self):
+        """Prompt for a token on stdin, store it on the instance and write it to disk."""
+        import os  # local import: only needed to make sure the folder exists
+
+        pc.printout("Please insert your PAT: \n")
+        # Strip stray whitespace/newlines that would corrupt the token.
+        self.PAT = input().strip()
+        self.PAT_dict['github_pvt'] = self.PAT
+        self.check = bool(self.PAT)
+        os.makedirs("./credentials", exist_ok=True)
+        with open("./credentials/github_private_key.txt", "w") as handle:
+            handle.write(str(self.PAT_dict))
+
+    def retrieve_key(self):
+        """Return the stored token as a string (empty when none is saved)."""
+        return str(self.PAT_dict['github_pvt'])
diff --git a/src/printcolors.py b/src/printcolors.py
@@ -0,0 +1,25 @@
+import sys
+
+BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = range(8)
+
+
+def has_colours(stream: object) -> bool:
+    """Report whether *stream* is a terminal that supports more than two colours.
+
+    :param stream: a file-like object, typically ``sys.stdout``.
+    :return: ``False`` when the stream has no ``isatty`` method, is not a
+        TTY, or when curses is unavailable or cannot query the terminal;
+        otherwise whether the terminal reports more than two colours.
+    """
+    isatty = getattr(stream, "isatty", None)
+    # isatty must be *called*: the bound method object itself is always truthy.
+    if not callable(isatty) or not isatty():
+        return False
+    try:
+        import curses
+        curses.setupterm()
+        return curses.tigetnum("colors") > 2
+    except Exception:
+        # curses missing (e.g. on Windows) or terminal database lookup failed.
+        return False
+
+
+# Evaluated once at import time; from here on the name ``has_colours`` refers
+# to the resulting value for sys.stdout, not to the function above.
+has_colours = has_colours(sys.stdout)
+
+
+def printout(text: str, colour: int = WHITE) -> None:
+    """Write *text* to stdout, in bold ANSI colour when colours are enabled.
+
+    :param text: the string to write; no newline is appended.
+    :param colour: one of the module colour constants (default ``WHITE``).
+    """
+    if not has_colours:
+        sys.stdout.write(text)
+        return
+    prefix = "\x1b[1;%dm" % (30 + colour)
+    sys.stdout.write(prefix + text + "\x1b[0m")
diff --git a/src/vk_downloader.py b/src/vk_downloader.py
@@ -0,0 +1,98 @@
+from selenium import webdriver
+# from selenium.webdriver.chrome.service import Service
+# from selenium.webdriver.chrome.options import Options
+# from webdriver_manager.chrome import ChromeDriverManager
+from selenium.webdriver.firefox.service import Service as FirefoxService
+from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
+from webdriver_manager.firefox import GeckoDriverManager
+from bs4 import BeautifulSoup
+import requests
+from os.path import basename
+import os
+import pandas as pd
+from src import printcolors as pc
+from tqdm import tqdm
+from src.git_credentials import GitCredentialsHandler
+
+
+class VkDownloader:
+    """Download the thumbnail image of every VK post listed in a text file.
+
+    Images are written to ``./outputs/<name>/`` together with a
+    ``<name>_conversion.csv`` mapping each post url to its thumbnail url and
+    saved image name, where ``<name>`` is the stem of the links file.
+    """
+
+    def __init__(self) -> None:
+        # Parallel columns of the conversion CSV written by save_images.
+        self.conversion_dict = {"original_url": [], "thumb_url": [], "img_name": []}
+
+    @staticmethod
+    def load_links(full_path_to_list: str) -> tuple:
+        """Read a comma separated list of urls from a text file.
+
+        :param full_path_to_list: path of the text file holding the urls.
+        :return: ``(url_list, name)`` where ``name`` is the file name up to
+            its first dot.
+
+        Whitespace around each url is ignored and empty entries (for example
+        from a trailing comma) are dropped, so both ``a,b`` and ``a, b`` work.
+        """
+        with open(full_path_to_list, "r", encoding='utf-8-sig') as handle:
+            text = handle.read()
+        url_list = [url.strip() for url in text.split(",") if url.strip()]
+        return url_list, basename(full_path_to_list).split(".")[0]
+
+    def thumbnail_url(self, url_list: list) -> list:
+        """Open every post in headless Firefox and collect its thumbnail url.
+
+        :param url_list: post urls to visit.
+        :return: the thumbnail urls found, in visiting order. Posts without
+            a recognisable thumbnail are reported and skipped.
+
+        Each collected pair is also appended to ``self.conversion_dict``.
+        The process exits when a page has no ``wall_text`` block.
+        """
+        import sys  # local import: sys.exit replaces the site-provided quit()
+
+        thumbnail_url_list = []
+        # Token handed to webdriver_manager through the environment.
+        os.environ["GH_TOKEN"] = GitCredentialsHandler().retrieve_key()
+        options = webdriver.FirefoxOptions()
+        options.add_argument("--headless")
+        options.add_argument("--incognito")
+        service = FirefoxService(executable_path=GeckoDriverManager().install())
+        profile = FirefoxProfile()
+        profile.set_preference('browser.cache.disk.enable', False)
+        profile.set_preference('browser.cache.memory.enable', False)
+        profile.set_preference('browser.cache.offline.enable', False)
+        profile.set_preference('network.cookie.cookieBehavior', 2)
+        driver = webdriver.Firefox(service=service, options=options, firefox_profile=profile)
+        try:
+            pc.printout("-" * 80 + "\n", pc.BLUE)
+            pc.printout("Saving urls...\n", pc.BLUE)
+            for url in tqdm(url_list, bar_format='{l_bar}{bar:40}{r_bar}{bar:-10b}'):
+                driver.get(url)
+                soup = BeautifulSoup(driver.page_source, "html.parser")
+                post_head = soup.find("div", {"class": "wall_text"})
+                if not post_head:
+                    pc.printout("Something went wrong! Probably too many requests, try deleting the cache while using a VPN or take a look at the response: \n", pc.RED)
+                    print(soup)
+                    sys.exit(0)
+                thumbnail = post_head.find("a", {"class": "page_post_thumb_wrap"})
+                media_link = post_head.find("img", {"class": "media_link__photo"})
+                if thumbnail:
+                    # The url sits inside the inline style's "(...)" part.
+                    thumbnail_url = thumbnail["style"].split("(")[-1].split(")")[0]
+                elif media_link:
+                    thumbnail_url = media_link["src"]
+                else:
+                    # Neither element is present: skip the post, keeping the
+                    # conversion columns aligned, instead of crashing on None.
+                    pc.printout("No thumbnail found for {}, skipping\n".format(url), pc.RED)
+                    continue
+                thumbnail_url_list.append(thumbnail_url)
+                self.conversion_dict["original_url"].append(url)
+                self.conversion_dict["thumb_url"].append(thumbnail_url)
+        finally:
+            # Always end the browser session, including on early exit or error.
+            driver.quit()
+        return thumbnail_url_list
+
+    def save_images(self, thumbnail_url_list: list, name: str) -> None:
+        """Download every thumbnail and write the conversion CSV.
+
+        :param thumbnail_url_list: image urls to download.
+        :param name: output sub-folder and file-name prefix.
+
+        Images are saved as ``./outputs/<name>/<name>_<index>.jpg``.
+        """
+        # makedirs also creates ./outputs when it is missing.
+        os.makedirs("./outputs/{}".format(name), exist_ok=True)
+        pc.printout("Saving images...\n", pc.BLUE)
+        progress = tqdm(thumbnail_url_list, bar_format='{l_bar}{bar:40}{r_bar}{bar:-10b}')
+        for counter, thumbnail_url in enumerate(progress):
+            response = requests.get(thumbnail_url)
+            filename = "{}_{}".format(name, str(counter))
+            with open("./outputs/{}/{}.jpg".format(name, filename), "wb") as handle:
+                handle.write(response.content)
+            self.conversion_dict["img_name"].append(filename)
+        pc.printout("All images saved!\n", pc.BLUE)
+        pc.printout("-" * 80 + "\n", pc.BLUE)
+        conversion_df = pd.DataFrame.from_dict(self.conversion_dict)
+        conversion_df.to_csv("./outputs/{}/{}_conversion.csv".format(name, name), index=False)
+
+    def download_media(self) -> None:
+        """Prompt for a links file, then download the thumbnails it lists."""
+        pc.printout("Please, insert full path of txt file containing all the post urls\n", pc.YELLOW)
+        pc.printout("Remember, they should be comma separated for this to work: \n", pc.YELLOW)
+        full_path = input().strip()
+        try:
+            url_list, name = self.load_links(full_path)
+        except OSError as err:
+            # A wrong path should return to the prompt, not crash the app.
+            pc.printout("Could not read {}: {}\n".format(full_path, err), pc.RED)
+            return
+        if not url_list:
+            pc.printout("No urls found in {}\n".format(full_path), pc.RED)
+            return
+        # Start from empty columns so an earlier run does not leak into this CSV.
+        self.conversion_dict = {"original_url": [], "thumb_url": [], "img_name": []}
+        thumb_list = self.thumbnail_url(url_list)
+        self.save_images(thumb_list, name)