# This Panel will call comments from  r/all, r/news, and r/politics and predict if robot or not

## import libraries

In [1]:
import pandas as pd
import requests
import praw
from praw.models import MoreComments
import datetime
import numpy as np

import joblib
from sklearn.datasets import load_digits
from sklearn.linear_model import SGDClassifier

## Define Functions

### Fn: Get the posts, their comments, and clean them up (Hot)

In [2]:
## First Get The Posts
def getpostidsHOT(sub):  
    """
    This fn gets the top 10 posts for a subreddit sorted by hot
    """
    subreddit = reddit.subreddit(sub)
    title = []
    subid = []
    for submission in subreddit.hot(limit=10):
        tit = submission.title
        title.append(tit)
        ids = submission.id
        subid.append(ids)
    df = pd.DataFrame(list(zip(title, subid)), columns=["title", "id"])
    print("got post ids")
    return df

In [3]:

## Then Get The Comments
def getcomments(id, postnumber):
    """
    This function gets comments and a few other field. It formats in a way ready for the
    cleandata fn.
    """
    submission = reddit.submission(id=id)
    comid = []
    authoru = []
    mods = []
    created = []
    upratio = []
    body = []
    submission.comments.replace_more(limit=5)  
    for comment in submission.comments.list():
        ids = comment.id
        author = comment.author
        mod = comment.distinguished
        createds = comment.created_utc
        score = comment.score
        bodys = comment.body
        
        comid.append(ids)
        authoru.append(author)
        mods.append(mod)
        created.append(createds)
        upratio.append(score)
        body.append(bodys)
    df = pd.DataFrame(list(zip(comid, authoru, mods, created, upratio, body)), 
                              columns=["comid", "authoru", "mods", 'created', 'upratio', 'body'])
    postnumber = df
    print("got comments")
    return postnumber

###################################################################
## Then get a little bit more data that be useful later
def cleandata (df):
    """
    This function extracts usernames from the author feature even where the name is deleted.
    It also cleans up the time to make it more useable.
    """
    created_date = []
    names = []
    for i in range(len(df)):
        try:
            uname = df.loc[i][1].name
            names.append(uname)
        except:    
            uname = 'deleted'  
            names.append(uname)
    df["username"] = names
    
    for row in df["created"]:
        date = datetime.datetime.fromtimestamp(row)
        created_date.append(date)

    print("done")
    df["created_date"] = created_date


    ## Dealing with TIME~~
    df["created_date"] = pd.to_datetime(df["created_date"])
    dattime_strings = df["created_date"].dt.strftime("%Y-%m-%d %H:%M:%S")

    df["created_date2"] = dattime_strings
    #complaints.head()

    df["created_date2"]=df["created_date2"].apply(lambda x:datetime.datetime.strptime(x,"%Y-%m-%d %H:%M:%S"))

    time_strings = df["created_date"].dt.strftime("%H:%M:%S")
    date_strings = df["created_date"].dt.strftime("%Y-%m-%d")

    df["date"] = date_strings
    df["time"] = time_strings
    print("claned time and got users")
    return df

#######################################################################
## Then use Eamon Flemming's Cleaning Strategy

#Import Regex
import re

#This function selects any consecutive combination of \r's and \n's in a bloc of text, 
#and replaces that selection with a single space.
def replace_linebreaks_w_space(x):
    return re.sub('([\r\n]+)',' ',x)

#This function selects any stretch of two or more consecutive spaces in a bloc of text,
#and replaces that selection with a single space.
def replace_multispace_w_space(x):
    return re.sub('([ ]{2,})',' ',x)

"""
I created a fn to tie all  eamon's fn together:
"""
def use_eamons(df, sub, body = "body"):
    #Here we take every comment and apply the two functions to it.
    df[body] = df[body].map(replace_linebreaks_w_space)
    df[body] = df[body].map(replace_multispace_w_space)
    
    #Strip away any spaces at the beginning or end of each comment, splits the comment into a list of words, 
    #and returns the length of that list (i.e.; the number of words in the comment)
    df['word_length'] = df[body].map(lambda x: len(x.strip().split(' ')))
    df["subreddit"] = sub
    
    """
    The last thing we're going to do is remove all comments that are 3 words and shorter, as it's difficult, 
    and for the most part just unreasonable, to guess anything from comments this short. 
    We want to focus on accurately predicting comments that have some content.
    """
    comments = df[df['word_length']>=4]
    
    df_clean = comments
    print("used eamons")
    return df_clean

In [4]:
## Then combine everything but the post ids into one fn
def allsteps(df_name, subname):  
    i = 0
    df_list = []
    for row in df_name["id"]:

        print(row)
        i += 1
        name = f"row{i}"
        df = getcomments(row, name)
        df = cleandata(df)
        df_list.append(df)

    clean_df_list=[]
    for df in df_list:
        clean_df = use_eamons(df, subname)
        clean_df_list.append(clean_df)

    final_df = pd.concat(clean_df_list)
    
    return final_df


### Look for Accusations of Bots and Get the Comments Accused 

In [5]:
## Get Accusation Comments

def findbotaccuse(df, body_column):
    """
    This function looks for the word bot, then cleans for mod bots that say -i am a bot-. it assumes everything 
    that is keeps is an accusation. While that's likely not true, we'll roll with it and see.
    """
    bot = df[df[body_column].str.contains(r"\bbot\b")]
    Bot = df[df[body_column].str.contains(r"\bBot\b")]
    robot = df[df[body_column].str.contains(r"\brobot\b")]
    Robot = df[df[body_column].str.contains(r"\bRobot\b")]
    
    bot_accuse = pd.concat([bot, robot, Robot, Bot])
    bot_accuse = bot_accuse.reset_index(drop=True)
    
    #cleaning for -i am a bot-
    only_accuse_bot = bot_accuse[~bot_accuse[body_column].str.contains(r"\bI am a bot\b")]
    print("found accusations")
    return only_accuse_bot

######################################################
## Then get accused

def getparentid(df, idcol, bodycol):
    """
    This gets the id of the parents that we can use to get the body
    """

    com_id = []
    parent = []
    com_body = []
    for comid in df[idcol]:
        comment_id = comid
        com_id.append(comment_id)

        # instantiating the Comment class
        comment = reddit.comment(comment_id)

        # fetching the parent_id attribute
        parent_id = comment.parent_id 
        print(parent_id)
        # collecting
        parent.append(parent_id)
    for body in df[bodycol]:
        bodyi = body
        com_body.append(bodyi)
        
    print("got parent ID")
    return parent, com_id, com_body


def getparentbody(parent, com_id, com_body):
    """
    This gets the body of the parent comment and produces the final df with all the 
    info we need
    """
    parentbody=[]
 
    for ids in parent:
        try:
            parent_id = ids
            # but also running through next step
            comment_parent = reddit.comment(parent_id)
            parent_body = comment_parent.body


            print("got parent body")
        except:
            parent_body = " "
            print("no body")
        
        parentbody.append(parent_body)    
    
    lil_df = pd.DataFrame(list(zip(com_id, com_body, parent, parentbody)), columns=["child_id", "child_body", "parent_id", "parent_body"])
    
    return lil_df

## putting get accused into one neat fn: 

def getparentall(df, idcol, bodycol):
    """
    This combines the get parent body and get id. it wasn't working when i had them in one singular fn, but it works
    like this
    """
    parent_list, com_id, com_body  = getparentid(df, idcol, bodycol)
    final_df = getparentbody(parent_list, com_id, com_body)
    return final_df

### Last Get Predictions

In [6]:
# Use the loaded pickled model to make predictions
def getpredictions(model, Xval, df):
    """
    This function predicts with the defined model on the validation set
    """
    predictions = pd.DataFrame(model.predict_proba(Xval))
    predictions['parent_body'] = df['parent_body']
    predictions['child_body'] = df['child_body']
    predictions['child_id'] = df['child_id']
    predictions['parent_id'] = df['parent_id']
    predictions['pred'] = model.predict(Xval)
    predictions.columns = ['bot', 'human','parent_body','child_body','child_id','parent_id','pred']
    predictions = predictions[['child_id','child_body','parent_id','parent_body','pred', 'human','bot']]
    return predictions

## setup credentials

In [7]:
CLIENT_ID =  "4OwAYLyOEKKPQjb-Hgpt6g"
SECRET_TOKEN =  "-05d1ux9vDdkVt67yPMojiKywTjRsQ"
user_agent =  "MUSA550KarmaMine1"
password = 'KarmaMine1$$'

In [8]:
reddit = praw.Reddit(client_id=CLIENT_ID, \
                     client_secret=SECRET_TOKEN, \
                     user_agent=user_agent, \
                     username=user_agent, \
                     password=password)

## Call Comments

### r/all (sorted by HOT)

In [9]:
rall_df = getpostidsHOT("all")
rall_comments = allsteps(rall_df, "all")

got post ids
rl0rbu
got comments
done
claned time and got users
rl14fx
got comments
done
claned time and got users
rkzlrz
got comments
done
claned time and got users
rkz9so
got comments
done
claned time and got users
rkylhs
got comments
done
claned time and got users
rky28q
got comments
done
claned time and got users
rky25x
got comments
done
claned time and got users
rkx8ka
got comments
done
claned time and got users
rkyn0t
got comments
done
claned time and got users
rkx61s
got comments
done
claned time and got users
used eamons
used eamons
used eamons
used eamons
used eamons
used eamons
used eamons
used eamons
used eamons
used eamons


### r/news  (sorted by HOT)

In [10]:
rnews_df = getpostidsHOT("news")
rnews_comments = allsteps(rnews_df, "news")

got post ids
rkulbg
got comments
done
claned time and got users
rkqjyx
got comments
done
claned time and got users
rko3lm
got comments
done
claned time and got users
rkzeoj
got comments
done
claned time and got users
rkp52l
got comments
done
claned time and got users
rkz5f0
got comments
done
claned time and got users
rl1y03
got comments
done
claned time and got users
rktnd9
got comments
done
claned time and got users
rkyknx
got comments
done
claned time and got users
rkpbb9
got comments
done
claned time and got users
used eamons
used eamons
used eamons
used eamons
used eamons
used eamons
used eamons
used eamons
used eamons
used eamons


### r/politics  (sorted by HOT)

In [11]:
rpolitics_df = getpostidsHOT("politics")
rpolitics_comments = allsteps(rpolitics_df, "politics")

got post ids
rkt3lz
got comments
done
claned time and got users
rkwvl2
got comments
done
claned time and got users
rkzdyh
got comments
done
claned time and got users
rkwyar
got comments
done
claned time and got users
rkn2t0
got comments
done
claned time and got users
rknwk8
got comments
done
claned time and got users
rklwct
got comments
done
claned time and got users
rknp3p
got comments
done
claned time and got users
rkrlfz
got comments
done
claned time and got users
rkov6q
got comments
done
claned time and got users
used eamons
used eamons
used eamons
used eamons
used eamons
used eamons
used eamons
used eamons
used eamons
used eamons


## Get Accusations

In [12]:
## rall
rall_accuse = findbotaccuse(rall_comments, "body")


## rnews
rnews_accuse = findbotaccuse(rnews_comments, "body")


## rpolitics
rpolitics_accuse = findbotaccuse(rpolitics_comments, "body")

found accusations
found accusations
found accusations


## Getting the Accused (Parent of the Accusation)

In [13]:
## rall
rall_parent = getparentall(rall_accuse, "comid", "body")


## rnews
rnews_parent = getparentall(rnews_accuse, "comid", "body")


## rpolitics
rpolitics_parent = getparentall(rpolitics_accuse, "comid", "body")



t1_hpdb1m4
t1_hpd3d6s
t1_hpdbr6q
t1_hpdg389
t1_hpdiiw5
got parent ID
got parent body
got parent body
got parent body
got parent body
got parent body
t1_hpbyjuu
got parent ID
got parent body
t1_hpbikxs
t3_rklwct
t1_hpadwgg
t1_hpar46b
got parent ID
got parent body
no body
got parent body
got parent body


## Cleaning the Comments

In [14]:
rall_parent_clean = use_eamons(rall_parent, "rall", "parent_body")

used eamons


In [15]:
rnews_parent_clean = use_eamons(rnews_parent, "rnews", "parent_body")

used eamons


In [16]:
rpolitics_parent_clean = use_eamons(rpolitics_parent, "rpolitics", "parent_body")

used eamons


## Bringing the Model

In [17]:
import joblib
from sklearn.datasets import load_digits
from sklearn.linear_model import SGDClassifier


In [18]:
filename = r'./data/rallsubsimLogit.joblib.pkl'

In [19]:
model = joblib.load(filename)

## Testing on the Accusations

### r/all 

In [20]:
Xall = rall_parent_clean['parent_body']
rall_predictions = getpredictions(model, Xall, rall_parent)

### r/news  

In [21]:
Xnews = rnews_parent_clean['parent_body']
news_predictions = getpredictions(model, Xnews, rnews_parent)

### r/politics  

In [22]:
Xpolitics = rpolitics_parent_clean['parent_body']
politics_predictions = getpredictions(model, Xpolitics, rpolitics_parent)

# combine

In [23]:
frames = [rall_predictions, news_predictions, politics_predictions]

In [161]:
accused = pd.concat(frames)

In [162]:
accused.head()

Unnamed: 0,child_id,child_body,parent_id,parent_body,pred,human,bot
0,hpdgz49,I downvoted you and I’m not a bot. Your belief...,t1_hpdb1m4,[deleted],all,0.7497,0.2503
1,hpd63gn,Whoa... someone is project hard here. Or are y...,t1_hpd3d6s,"China and TCM related reddit: ""fuck this pos""",all,0.55417,0.44583
2,hpdg389,"Don't worry, it seems like they're an advanced...",t1_hpdbr6q,I had to check the date on my watch. Damn you!,all,0.554178,0.445822
3,hpdkbqt,What if I am an android? Then I’d technically ...,t1_hpdg389,"Don't worry, it seems like they're an advanced...",SubSimulatorGPT2,0.362779,0.637221
0,hpc4ybs,We need robot coyotes to howl at the right time.,t1_hpbyjuu,"That's amazing. ""Hang on, let me double my fer...",all,0.923328,0.076672


## Textblob

In [163]:
import textblob

In [164]:
blobs = [textblob.TextBlob(posts) for posts in accused["parent_body"]]

In [165]:
accused['Polarity'] = [blob.sentiment.polarity for blob in blobs]
accused['Subjectivity'] = [blob.sentiment.subjectivity for blob in blobs]

In [166]:
accused.rename(columns={'parent_body': 'accused', 'child_body': 'accuser'}, inplace=True)

In [167]:
accused.head()

Unnamed: 0,child_id,accuser,parent_id,accused,pred,human,bot,Polarity,Subjectivity
0,hpdgz49,I downvoted you and I’m not a bot. Your belief...,t1_hpdb1m4,[deleted],all,0.7497,0.2503,0.0,0.0
1,hpd63gn,Whoa... someone is project hard here. Or are y...,t1_hpd3d6s,"China and TCM related reddit: ""fuck this pos""",all,0.55417,0.44583,-0.2,0.5
2,hpdg389,"Don't worry, it seems like they're an advanced...",t1_hpdbr6q,I had to check the date on my watch. Damn you!,all,0.554178,0.445822,0.0,0.0
3,hpdkbqt,What if I am an android? Then I’d technically ...,t1_hpdg389,"Don't worry, it seems like they're an advanced...",SubSimulatorGPT2,0.362779,0.637221,0.144444,0.35
0,hpc4ybs,We need robot coyotes to howl at the right time.,t1_hpbyjuu,"That's amazing. ""Hang on, let me double my fer...",all,0.923328,0.076672,0.366667,0.55


## HV Plot

In [168]:
import hvplot.pandas

In [169]:
import panel as pn
pn.extension()

In [170]:
def accused_plot(x='human', y='bot', width= 320, color='#4b2362', hover_cols=["accuser", "accused"]):
    return accused.hvplot.scatter(x, y, c=c, padding=0.1)

columns = list(accused.columns[-4:])
columns

['human', 'bot', 'Polarity', 'Subjectivity']

In [171]:
# Create the widgets
x = pn.widgets.Select(value="human", options=columns, name="x")
y = pn.widgets.Select(value="bot", options=columns, name="y")
#color = pn.widgets.ColorPicker(name="Color", value="#4b2362")

In [172]:
# Create the dashboard
reactive_dashboard = pn.Row(
    pn.Column(
        pn.Row("Bot Accused Comments"),
        pn.Row(
            pn.Column(x, y)), # Title and widgets
    pn.bind(accused.hvplot.scatter, x, y, width= 320,  c='#4b2362', hover_cols=["accuser", "accused"])), # Main chart
)


In [173]:
reactive_dashboard

In [42]:
#reactive_dashboard.save("bot_accusation.html", embed=True)

                                                                                                                       