* assign weight to each post using score and comments and normalize over total weight and num of posts
    * 100 posts means normalized weight totals 100
* get sentiment of statement - positive, negative, neutral (1, -1, 0)
* identify players and teams in statement - create player/team column and generate list of players and teams
* create 2 matrices (one for players and one for teams)
    * rows will be players/teams, columns will be posts, data will be sum of scores
    * calculate sum of each players/teams score over all posts
    * rank players/teams from highest score to lowest

In [3]:
import praw
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [4]:
def read_subreddit(name):
    read_only = praw.Reddit(client_id='8vMcTdF4LlTwoVv3dM1DXg',
                            client_secret='E_NxxYPlsPxM5vfvNstPKmwc5u4ESQ',
                            user_agent='Scraper')
    
    return read_only.subreddit(name)

In [39]:
def subreddit_df(name, sort_type='hot'):
    '''
    Takes a subreddit name and sort type as input.
    Returns a pandas dataframe with the title, text, score, number of comments, and any links
    for the first 100 posts in the specified subreddit, sorted by the specified sort type.
    '''
    
    assert type(name) == str
    
    sub = read_subreddit(name)
    
    sub_dict = {'title': [],
                'text': [],
                'score': [],
                'num_comments': [],
                'link': [],
                'ratio': [],
                'comments': []}
    
    if sort_type == 'new':
        for post in sub.new(limit=None):
            sub_dict['title'].append(post.title)
            sub_dict['text'].append(post.selftext)
            sub_dict['score'].append(post.score)
            sub_dict['num_comments'].append(post.num_comments)
            sub_dict['link'].append(post.url)
            sub_dict['ratio'].append(post.upvote_ratio)
            
            comments = []
            post.comments.replace_more(limit=None)
            for comment in post.comments:
                comments.append(comment.body)
                
            sub_dict['comments'].append(comments)
            
    else:
        for post in sub.hot(limit=None):
            sub_dict['title'].append(post.title)
            sub_dict['text'].append(post.selftext)
            sub_dict['score'].append(post.score)
            sub_dict['num_comments'].append(post.num_comments)
            sub_dict['link'].append(post.url)
            sub_dict['ratio'].append(post.upvote_ratio)
            
            comments = []
            post.comments.replace_more(limit=None)
            for comment in post.comments:
                comments.append(comment.body)
                
            sub_dict['comments'].append(comments)
    
    types = {'title': 'string',
             'text': 'string',
             'score': int,
             'num_comments': int,
             'link': object,
             'ratio': float,
             'comments': list}
    
    df = pd.DataFrame(sub_dict)
    return df.astype(dtype=types)

In [5]:
def id_cols(in_df):
    assert type(in_df) == pd.core.frame.DataFrame
    
    df = in_df.copy()
    return df.dtypes[df.dtypes == 'string'].index.tolist()

In [6]:
def df_text_to_list(in_df, cols_in=None):
    '''
    Takes a pandas dataframe and list of columns as input.
    For all specified columns, removes any non-alphanumeric characters,
    splits the column up into individual words, and returns the dataframe
    with the contents of the columns replaced as a list of words.
    '''
    assert type(in_df) == pd.core.frame.DataFrame
    
    global cols
    if cols_in is None: cols = id_cols(in_df)
    else: 
        assert type(cols_in) == list
        cols = cols_in
    
    df = in_df.copy()
    
    df[cols] = df[cols].apply(lambda x: x.str.lower())
    df[cols] = df[cols].apply(lambda x: x.str.replace(r'[^\w\s]', '', regex=True))
    df[cols] = df[cols].apply(lambda x: x.str.strip())
    df[cols] = df[cols].apply(lambda x: x.str.split())
    
    def remove_duplicates_from_series(x):
        
        def remove_duplicates_from_row(x):
            return list(set(x))
        
        x = x.copy()
        return x.apply(remove_duplicates_from_row)
        
    df[cols] = df[cols].apply(remove_duplicates_from_series)
    
    return df

In [7]:
def remove_stopwords(in_df):#, cols=None):
    '''
    Takes a pandas dataframe and list of columns as input.
    For all specified columns, removes any stop words found in
    the nltk corpus stopwords, and returns the columns with
    all stop words removed.
    '''
    assert type(in_df) == pd.core.frame.DataFrame and type(cols) == list
    
#     if cols is None: cols = id_cols(in_df)
#     else: assert type(cols) == list
    
    df = in_df.copy()
    
    def remove_stop_from_series(in_x):
        
        def remove_stop_from_row(y):
            stop_words = set(stopwords.words('english'))
            return [word for word in y if word not in stop_words]
        
        x = in_x.copy()
        x = x.apply(remove_stop_from_row)
        #print(x)
        return x
        
    
    df[cols] = df[cols].apply(remove_stop_from_series)
    
    return df

In [40]:
#test_df = subreddit_df('news', sort_type='new')[:5]
test_df = subreddit_df('nba')[:5]
test_df

KeyboardInterrupt: 

In [27]:
for item in test_df.Title.tolist(): print(item + '\n' + '-'*40, end='\n')

Daily Discussion Thread + Game Thread Index
----------------------------------------
[SERIOUS NEXT DAY THREAD] Post-Game Discussion (December 21, 2022)
----------------------------------------
Pascal Siakam drops career-high in a win against the New York Knicks: 52 points
----------------------------------------
The Beam being lit at Golden 1 Center
----------------------------------------
[Highlight] Robin Lopez trash talking his brother
----------------------------------------


In [28]:
for item in test_df.Text.tolist(): print(item + '\n' + '-'*40, end='\n')

# Game Threads Index (December 21, 2022):

|Tip-off|GDT|Away|Score|Home|PGT|
|:--|:--:|:--|:-:|:--|:--:|
|07:00 pm ET|[Link](https://www.reddit.com/r/nba/comments/zs4dg3/game_thread_milwaukee_bucks_228_cleveland/)|[Milwaukee Bucks](/r/mkebucks)|FINAL >!106 to 114!<|[Cleveland Cavaliers](/r/clevelandcavs)|[Link](https://www.reddit.com/r/nba/comments/zs9mqr/post_game_thread_the_cleveland_cavaliers_2211/)|
|07:00 pm ET|[Link](https://www.reddit.com/r/nba/comments/zs4dg1/game_thread_detroit_pistons_825_philadelphia/)|[Detroit Pistons](/r/detroitpistons)|FINAL >!93 to 113!<|[Philadelphia 76ers](/r/sixers)|[Link](https://www.reddit.com/r/nba/comments/zs9gc3/post_game_thread_the_philadelphia_76ers_1812/)|
|07:30 pm ET|[Link](https://www.reddit.com/r/nba/comments/zs5654/game_thread_chicago_bulls_1218_atlanta_hawks_1615/)|[Chicago Bulls](/r/chicagobulls)|FINAL >!110 to 108!<|[Atlanta Hawks](/r/atlantahawks)|[Link](https://www.reddit.com/r/nba/comments/zsa35y/post_game_thread_the_chicago_bulls_1

In [10]:
test_df = df_text_to_list(test_df)

In [11]:
test_df

Unnamed: 0,Title,Text,Score,Comments,Link,Ratio
0,"[daily, discussion, thread, game, index]","[95, etlinkhttpswwwredditcomrnbacommentszq9hkw...",10,4,https://www.reddit.com/r/nba/comments/zq09a9/d...,0.7
1,"[postgame, december, next, 18, 2022, discussio...","[orlando, minnesota, thread, 121, memes, det, ...",28,17,https://www.reddit.com/r/nba/comments/zppo5n/s...,0.88
2,"[windhorst, good, sprained, ankle, a, its, foo...","[internet, people, windhorst, 2, httpspodcasts...",5368,1195,https://www.reddit.com/r/nba/comments/zptfqj/w...,0.92
3,"[setting, scoring, from, damian, trail, record...","[ever, get, will, sits, 2, accomplishment, por...",1818,189,https://www.reddit.com/r/nba/comments/zq1qp5/d...,0.98
4,"[knicks, allstar, wannabe, last, wally, after,...","[can, 12, sure, 473789, he, tyrese, 579, 34, 3...",1308,471,https://www.reddit.com/r/nba/comments/zpzow9/w...,0.93


In [12]:
test_df = remove_stopwords(test_df)

In [13]:
test_df

Unnamed: 0,Title,Text,Score,Comments,Link,Ratio
0,"[daily, discussion, thread, game, index]","[95, etlinkhttpswwwredditcomrnbacommentszq9hkw...",10,4,https://www.reddit.com/r/nba/comments/zq09a9/d...,0.7
1,"[postgame, december, next, 18, 2022, discussio...","[orlando, minnesota, thread, 121, memes, det, ...",28,17,https://www.reddit.com/r/nba/comments/zppo5n/s...,0.88
2,"[windhorst, good, sprained, ankle, foot, ad]","[internet, people, windhorst, 2, httpspodcasts...",5368,1195,https://www.reddit.com/r/nba/comments/zptfqj/w...,0.92
3,"[setting, scoring, damian, trail, record, poin...","[ever, get, sits, 2, accomplishment, portland,...",1818,189,https://www.reddit.com/r/nba/comments/zq1qp5/d...,0.98
4,"[knicks, allstar, wannabe, last, wally, suppos...","[12, sure, 473789, tyrese, 579, 34, 32, 40, ws...",1308,471,https://www.reddit.com/r/nba/comments/zpzow9/w...,0.93


In [14]:
test_df['Weight'] = test_df['Score'] / test_df['Ratio']

In [15]:
test_df

Unnamed: 0,Title,Text,Score,Comments,Link,Ratio,Weight
0,"[daily, discussion, thread, game, index]","[95, etlinkhttpswwwredditcomrnbacommentszq9hkw...",10,4,https://www.reddit.com/r/nba/comments/zq09a9/d...,0.7,14.285714
1,"[postgame, december, next, 18, 2022, discussio...","[orlando, minnesota, thread, 121, memes, det, ...",28,17,https://www.reddit.com/r/nba/comments/zppo5n/s...,0.88,31.818182
2,"[windhorst, good, sprained, ankle, foot, ad]","[internet, people, windhorst, 2, httpspodcasts...",5368,1195,https://www.reddit.com/r/nba/comments/zptfqj/w...,0.92,5834.782609
3,"[setting, scoring, damian, trail, record, poin...","[ever, get, sits, 2, accomplishment, portland,...",1818,189,https://www.reddit.com/r/nba/comments/zq1qp5/d...,0.98,1855.102041
4,"[knicks, allstar, wannabe, last, wally, suppos...","[12, sure, 473789, tyrese, 579, 34, 32, 40, ws...",1308,471,https://www.reddit.com/r/nba/comments/zpzow9/w...,0.93,1406.451613


In [16]:
test_df['NormWt'] = test_df['Weight'] / test_df['Weight'].sum()

In [17]:
test_df

Unnamed: 0,Title,Text,Score,Comments,Link,Ratio,Weight,NormWt
0,"[daily, discussion, thread, game, index]","[95, etlinkhttpswwwredditcomrnbacommentszq9hkw...",10,4,https://www.reddit.com/r/nba/comments/zq09a9/d...,0.7,14.285714,0.001563
1,"[postgame, december, next, 18, 2022, discussio...","[orlando, minnesota, thread, 121, memes, det, ...",28,17,https://www.reddit.com/r/nba/comments/zppo5n/s...,0.88,31.818182,0.00348
2,"[windhorst, good, sprained, ankle, foot, ad]","[internet, people, windhorst, 2, httpspodcasts...",5368,1195,https://www.reddit.com/r/nba/comments/zptfqj/w...,0.92,5834.782609,0.638208
3,"[setting, scoring, damian, trail, record, poin...","[ever, get, sits, 2, accomplishment, portland,...",1818,189,https://www.reddit.com/r/nba/comments/zq1qp5/d...,0.98,1855.102041,0.202911
4,"[knicks, allstar, wannabe, last, wally, suppos...","[12, sure, 473789, tyrese, 579, 34, 32, 40, ws...",1308,471,https://www.reddit.com/r/nba/comments/zpzow9/w...,0.93,1406.451613,0.153838


In [5]:
nba = read_subreddit('nba')

In [6]:
for post in nba.hot(limit=5):
    print(post.title + '\n' + '-' * 20)
    print()
    post.comments.replace_more(limit=None)
    print(post.comments.list())
    #for comment in post.comments.list():
        #print('-' + comment.body)
    print('=' * 50)

Daily Discussion Thread + Game Thread Index
--------------------

[Comment(id='j1ixag4'), Comment(id='j1izu0v'), Comment(id='j1iz0pq'), Comment(id='j1jd1lo'), Comment(id='j1jtf2u')]
[SERIOUS NEXT DAY THREAD] Post-Game Discussion (December 23, 2022)
--------------------

[Comment(id='j1hjzxy'), Comment(id='j1hk1b9'), Comment(id='j1hk10u'), Comment(id='j1hk1wt'), Comment(id='j1hk0eh'), Comment(id='j1hk0pz'), Comment(id='j1hk1lv'), Comment(id='j1hk3kr'), Comment(id='j1hk3vf'), Comment(id='j1hk3b2'), Comment(id='j1hk30d'), Comment(id='j1hk26l'), Comment(id='j1hk2qy'), Comment(id='j1hk01o'), Comment(id='j1hk2gb'), Comment(id='j1hneo6'), Comment(id='j1hmep7'), Comment(id='j1ipony'), Comment(id='j1ipvfc'), Comment(id='j1iji1k'), Comment(id='j1hmj4s'), Comment(id='j1ihg9n'), Comment(id='j1i58vy'), Comment(id='j1in0dw'), Comment(id='j1ilyym'), Comment(id='j1jip45'), Comment(id='j1i92dg'), Comment(id='j1i59ss'), Comment(id='j1horq3'), Comment(id='j1ihm0k'), Comment(id='j1i8hup')]
James Harden on

[Comment(id='j1iyfwt'), Comment(id='j1izagw'), Comment(id='j1j4q14'), Comment(id='j1j9qg1'), Comment(id='j1iysem'), Comment(id='j1j039o'), Comment(id='j1izc66'), Comment(id='j1j9gwc'), Comment(id='j1j0ap4'), Comment(id='j1iyllb'), Comment(id='j1j0en7'), Comment(id='j1j7f9g'), Comment(id='j1jgct0'), Comment(id='j1jcbg3'), Comment(id='j1j1mgj'), Comment(id='j1j6xfj'), Comment(id='j1j01q1'), Comment(id='j1j1ehj'), Comment(id='j1jbugo'), Comment(id='j1jgt11'), Comment(id='j1jh0do'), Comment(id='j1jjttx'), Comment(id='j1jnd21'), Comment(id='j1j4zdv'), Comment(id='j1j6lcv'), Comment(id='j1j6yab'), Comment(id='j1jeekz'), Comment(id='j1j1806'), Comment(id='j1j6g4s'), Comment(id='j1j10iu'), Comment(id='j1j97p3'), Comment(id='j1jn1aa'), Comment(id='j1j884o'), Comment(id='j1jfkff'), Comment(id='j1izz62'), Comment(id='j1j4uot'), Comment(id='j1jccx0'), Comment(id='j1jeuj9'), Comment(id='j1jex9i'), Comment(id='j1jl7dl'), Comment(id='j1jlvoo'), Comment(id='j1jmev9'), Comment(id='j1josca'), Comment(id

In [9]:
reddit = praw.Reddit(client_id='8vMcTdF4LlTwoVv3dM1DXg',
                     client_secret='E_NxxYPlsPxM5vfvNstPKmwc5u4ESQ',
                     user_agent='Scraper')

reddit.comment(id='j1iiw9l').body

'Man is in multiple strip club halls of fame in Houston. Why would he care about something like this?'

In [17]:
coms = reddit.info(fullnames=['j1iiw9l', 'j1iyfwt'])
for com in coms:
    print(com)