# exploring reddit api
https://www.reddit.com/prefs/apps
nome: grattini

PRAW documentation (comment extraction tutorial):
https://praw.readthedocs.io/en/stable/tutorials/comments.html

In [1]:
from dataclasses import dataclass
from typing import List, Set

from praw_init import reddit

@dataclass
class SubmissionWithComment:
    """A Reddit submission bundled with the bodies of its comments."""

    # Submission id taken from the short link, used to avoid duplicates,
    # e.g. https://redd.it/11an7hn -> 11an7hn
    id: str
    # Submission text; built elsewhere in this file as title + " " + selftext.
    submission: str
    # Comment bodies as plain strings.
    comments: List[str]

# The triple-quoted string below is never executed: it is a template of the
# praw.Reddit(...) call. Being the last expression of the cell, it is echoed
# as the cell output.
# NOTE(review): presumably this mirrors how the local `praw_init` module
# builds `reddit`, and the values look like placeholders -- confirm, and never
# paste real credentials here.
"""
reddit = praw.Reddit(
    client_id="asdfga",
    client_secret="-ab-afbab",
    password="abafb",
    user_agent="afdbab",
    username="abb",
)
"""

'\nreddit = praw.Reddit(\n    client_id="asdfga",\n    client_secret="-ab-afbab",\n    password="abafb",\n    user_agent="afdbab",\n    username="abb",\n)\n'

In [2]:


def get_submissions_with_comments(reddit, link:str, limit: int, min_length:int ) -> SubmissionWithComment:
    """Fetch one submission together with the bodies of its long-enough comments.

    Args:
        reddit: client object exposing ``submission(url=...)`` (the PRAW
            instance used throughout this notebook).
        link: URL of the submission. Its last path segment must be the
            submission id (short-link form, e.g. https://redd.it/11an7hn);
            a trailing slash is tolerated.
        limit: number of entries of the flattened comment list that are
            inspected. The cut is applied BEFORE the length filter, so fewer
            than ``limit`` comments may be returned.
        min_length: a comment is kept only if its body is strictly longer
            than this many characters.

    Returns:
        A SubmissionWithComment holding the submission id, the text
        ``title + " " + selftext`` and the kept comment bodies.

    Raises:
        AssertionError: if the id in ``link`` differs from the id of the
            fetched submission.
    """
    submission = reddit.submission(url=link)
    # Per the PRAW comment tutorial, limit=0 removes the unresolved
    # "load more comments" placeholders without fetching them.
    submission.comments.replace_more(limit=0)
    bodies = [
        comment.body
        for comment in submission.comments.list()[:limit]
        if len(comment.body) > min_length
    ]
    # Strip a trailing slash first, otherwise the last segment would be empty
    # and the check would fail on an otherwise valid link.
    expected_id = link.rstrip('/').split('/')[-1]
    assert submission.id == expected_id, (
        f"submission id {submission.id!r} does not match link {link!r}"
    )
    return SubmissionWithComment(
        submission.id,
        submission.title + " " + submission.selftext,
        bodies
    )

def get_top_links_from_subreddit(reddit, subreddit_name:str, limit: int, min_length:int=42, time_filter:str='week') -> Set[str]:
    """Short links of the top submissions of a subreddit that have enough self text.

    Args:
        reddit: client object exposing ``subreddit(name)``.
        subreddit_name: name of the subreddit to query.
        limit: forwarded as ``limit`` to the ``top`` listing; the self-text
            filter is applied afterwards, so fewer links may be returned.
        min_length: a submission is kept only if its ``selftext`` is strictly
            longer than this many characters.
        time_filter: forwarded as ``time_filter`` to the ``top`` listing.

    Returns:
        A set of short links (a set, not a list: callers combine the result
        with ``intersection`` / ``union``).
    """
    top_submissions = reddit.subreddit(subreddit_name).top(limit=limit, time_filter=time_filter)
    return {
        submission.shortlink
        for submission in top_submissions
        if len(submission.selftext) > min_length
    }

def get_hot_links_from_subreddit(reddit, subreddit_name:str, limit: int, min_length:int=42) -> Set[str]:
    """Short links of the hot submissions of a subreddit that have enough self text.

    Args:
        reddit: client object exposing ``subreddit(name)``.
        subreddit_name: name of the subreddit to query.
        limit: forwarded as ``limit`` to the ``hot`` listing; the self-text
            filter is applied afterwards, so fewer links may be returned.
        min_length: a submission is kept only if its ``selftext`` is strictly
            longer than this many characters.

    Returns:
        A set of short links (a set, not a list: callers combine the result
        with ``intersection`` / ``union``).
    """
    hot_submissions = reddit.subreddit(subreddit_name).hot(limit=limit)
    return {
        submission.shortlink
        for submission in hot_submissions
        if len(submission.selftext) > min_length
    }

def get_important_link_from_subreddit(reddit, subreddit_name:str, limit: int, min_length:int, time_filter:str='week') -> List[str]:
    """Return the short links found in BOTH the hot and the top listing.

    The hot listing is queried first, then the top listing; ``limit`` and
    ``min_length`` are forwarded unchanged to both, ``time_filter`` only to
    the top listing. The order of the returned list is unspecified because
    it comes from a set.
    """
    hot_links = get_hot_links_from_subreddit(reddit, subreddit_name, limit, min_length)
    top_links = get_top_links_from_subreddit(reddit, subreddit_name, limit, min_length, time_filter)
    in_both = hot_links & top_links
    return list(in_both)

def get_link_from_subreddit(reddit, subreddit_name:str, limit: int, min_length:int, time_filter:str='week') -> List[str]:
    """Return the short links found in the hot OR the top listing.

    The hot listing is queried first with half the limit (``limit//2``), then
    the top listing with the full ``limit``. Duplicates are merged; the order
    of the returned list is unspecified because it comes from a set.
    """
    hot_links = get_hot_links_from_subreddit(reddit, subreddit_name, limit//2, min_length)
    top_links = get_top_links_from_subreddit(reddit, subreddit_name, limit, min_length, time_filter)
    in_either = hot_links | top_links
    return list(in_either)

def get_submissions_with_comments_from_subreddit(reddit, subreddit_name:str, limit: int, min_length:int=100, time_filter:str='week') -> List[SubmissionWithComment]:
    """Download the hot/top submissions of a subreddit with their comments.

    ``limit`` bounds both the listing queries and the number of comments
    inspected per submission; ``min_length`` is used both for the self-text
    filter and for the comment filter. Submissions left without any comment
    are dropped.
    """
    kept = []
    for link in get_link_from_subreddit(reddit, subreddit_name, limit, min_length, time_filter):
        thread = get_submissions_with_comments(reddit, link, limit, min_length)
        # keep only threads that still have at least one comment
        if len(thread.comments) > 0:
            kept.append(thread)
    return kept

In [3]:

import re

@dataclass
class Cleanupper:  # aka "spazzino" (Italian for "street sweeper")
    """Configurable text cleaner for submissions and their comments.

    Every flag enables one step of :meth:`cleanup`; all steps are on by
    default. Calling the instance on a SubmissionWithComment returns a new,
    cleaned SubmissionWithComment.
    """

    # Remove bracketed URLs. The name is a typo for "r_url" but is kept
    # because it is part of the constructor signature.
    r_ulr: bool = True
    # Remove zero-width spaces (U+200B).
    r_0_width_space: bool = True
    # Remove literal "\xNN"-style escape text (see the note in cleanup).
    r_emoji: bool = True
    # Collapse runs of two or more whitespace characters into one space.
    r_double_space: bool = True
    # Collapse runs of two or more newlines into one newline.
    r_double_newline: bool = True

    def remove_url(self, text):
        """Remove every ``(http...`` and ``[http...`` token from ``text``.

        A match runs from the opening bracket to the next whitespace, so the
        bracket itself and anything glued to the URL (closing bracket,
        punctuation) are removed as well.
        """
        text = re.sub(r'\(http\S+', '', text)
        text = re.sub(r'\[http\S+', '', text)
        return text

    def cleanup(self, text: str) -> str:
        """Apply the enabled cleaning steps to ``text`` and return the result."""
        if self.r_ulr:
            text = self.remove_url(text)
        if self.r_0_width_space:
            text = text.replace('\u200b', '')
        if self.r_emoji:
            # NOTE(review): this matches the literal four characters
            # backslash, "x" and two word characters. It does not match real
            # emoji code points -- confirm what the input looks like.
            text = re.sub(r'\\x\w\w', '', text)
        # Newlines must be collapsed BEFORE generic whitespace: "\s" also
        # matches "\n", so running the space step first turned every blank
        # line into a space and left the newline step with nothing to do.
        if self.r_double_newline:
            text = re.sub(r'\n\n+', '\n', text)
        if self.r_double_space:
            text = re.sub(r'\s\s+', ' ', text)
        return text

    # Annotations are quoted so the class can be defined regardless of where
    # SubmissionWithComment is declared.
    def __call__(self, submission: "SubmissionWithComment") -> "SubmissionWithComment":
        """Return a cleaned copy of ``submission`` (same id, cleaned texts)."""
        return SubmissionWithComment(
            submission.id,
            self.cleanup(submission.submission),
            [self.cleanup(comment) for comment in submission.comments]
        )



In [4]:
# Subreddits that are downloaded on every run of the download cell.
interesting_subreddits = [
    "Italia",
    "italy",
    "ItalyCalcio",
    "ItaliaPersonalFinance",
    "ScienzaItalia",
    "polliceverde"
]

# Pool of additional subreddits: the download cell adds two random picks
# from this list to each run.
extra_subreddit = [
    "askitaly",
    "CinemaeTVItalia",
    "erba",
    "ITAGLIA",
    "storia",
    "ItalyMotori",
    "italyLGBT",
    "CasualIT"
]


Salvataggio su file
Ogni thread sarà salvato come dict, così (i) posso usare pickle, (ii) non devo inventare separatori strani e (iii) nessuno deve riusare la mia classe.

In [5]:
import random
import pickle
import os 

# create 'threads' folder (if not exists)
os.makedirs("threads", exist_ok=True)

# Collect the ids of the threads that are already saved.
# File names follow <subreddit>_<id>.pk. The id is taken after the LAST
# underscore so subreddit names containing "_" are parsed correctly, and
# files without the .pk suffix are skipped instead of raising IndexError.
saved_ids = {
    os.path.splitext(name)[0].rsplit('_', 1)[-1]
    for name in os.listdir("threads")
    if name.endswith(".pk")
}

# random.sample draws WITHOUT replacement, so the same extra subreddit is
# never downloaded twice in one run (random.choices could repeat a pick).
subreddits = interesting_subreddits + random.sample(extra_subreddit, k=2)
random.shuffle(subreddits)
cleaner = Cleanupper()
for sub in subreddits:
    print("downloading from " + sub)
    swc = get_submissions_with_comments_from_subreddit(reddit, sub, 30)
    print("cleaning up")
    swc = [cleaner(s) for s in swc]
    print("saving as <subreddit>_<id>.pk")
    for s in swc:
        if s.id in saved_ids:
            continue
        with open(f"threads/{sub}_{s.id}.pk", "wb") as f:
            # saved as a plain dict so readers do not need the dataclass
            pickle.dump(vars(s), f)
        # remember the id so the same thread is not written again in this run
        saved_ids.add(s.id)




downloading from italy
cleaning up
saving as <subreddit>_<id>.pk
downloading from Italia
cleaning up
saving as <subreddit>_<id>.pk
downloading from ItaliaPersonalFinance
cleaning up
saving as <subreddit>_<id>.pk
downloading from erba
cleaning up
saving as <subreddit>_<id>.pk
downloading from ItalyCalcio
cleaning up
saving as <subreddit>_<id>.pk
downloading from storia
cleaning up
saving as <subreddit>_<id>.pk
downloading from polliceverde
cleaning up
saving as <subreddit>_<id>.pk
downloading from ScienzaItalia
cleaning up
saving as <subreddit>_<id>.pk


In [6]:
# list saved threads (only the .pk files written by the download cell)
files = [name for name in os.listdir("threads") if name.endswith(".pk")]

# read all threads
# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook wrote itself.
threads = []
for name in files:
    # the context manager closes each file right after it has been read
    with open(f"threads/{name}", "rb") as fh:
        threads.append(pickle.load(fh))

# print title and first comment of each thread
for t in threads:
    print(t['submission'])
    # guard against a thread saved without comments
    if t['comments']:
        print(t['comments'][0])
    print()

Nuova riforma fiscale, impatto sulle rendite finanziarie Cito un articolo di We Wealth: https://www.we-wealth.com/news/investimenti/leggi-e-normative/riforma-del-fisco-quali-novita-sulle-rendite-finanziarie "Tra i punti particolarmente degni di attenzione si segnala il settore delle rendite finanziarie. Su questo terreno, stando a quanto emerge dal documento in esame, la proposta è di raggruppare i redditi di capitale e i redditi diversi di natura finanziaria sotto il cappello di un’unica categoria reddituale, il cui meccanismo di tassazione seguirebbe quello del c.d. principio di cassa (superando invece il meccanismo che prescindendo dall’effettivo realizzo prevedeva l’imposizione sul maturato a conclusione dell’anno).
La scelta di orientarsi sulla tassazione per cassa del realizzo, come tra l’altro ha dichiarato il Ministro Leo, trova ragion d’essere in quanto questo meccanismo sarebbe più idoneo a rispettare il principio di capacità contributiva. Inoltre, prevedere un’imposta sostit