In [None]:
import pandas as pd
import datetime as dt
from IPython.display import Markdown as md
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re

In [None]:
# read in csv file with all comments and set date col to be date type

In [None]:
# Load the CSV of comments; the 'Date' column is parsed into datetimes while reading.
text_to_clean = pd.read_csv(
    'test_output.csv',
    parse_dates=['Date'],
)

In [None]:
# remove the author ID and attachments columns, which are not needed for the analysis

In [None]:
# Columns that are removed from the raw frame before analysis.
cols_to_drop = [
    'AuthorID',
    'Attachments',
]

In [None]:
# New frame without the columns listed in cols_to_drop; text_to_clean is left untouched.
full_chat = text_to_clean.drop(columns=cols_to_drop)

In [None]:
# drop posts by admin '0xmunching'

In [None]:
# Flag the rows authored by '0xmunching', then drop them by index label.
is_admin_post = full_chat['Author'] == '0xmunching'
disc_chat = full_chat.drop(index=full_chat[is_admin_post].index)

In [None]:
# set all content to lower case, strip non-word characters (including whitespace), then drop posts that consist solely of 'gm', 'hi', 'hello' or 'lol' and posts with no content (NaN)

In [None]:
# Lower-case every post so later comparisons are case-insensitive; missing values stay NaN.
lowered_content = disc_chat['Content'].str.lower()
disc_chat = disc_chat.assign(Content=lowered_content)

In [None]:
# Remove every run of non-word characters (punctuation, symbols and whitespace) from each post.
# na_action='ignore' lets missing values (NaN) pass through untouched: without it the lambda
# hands a float NaN to re.sub, which raises TypeError, and NaN rows are only filtered out
# in a later cell.
disc_chat['Content'] = disc_chat['Content'].map(
    lambda text: re.sub(r'\W+', '', text),
    na_action='ignore',
)

In [None]:
# Posts whose entire (cleaned) content is one of these short words are discarded.
is_filler_post = disc_chat['Content'].isin(['gm', 'hi', 'hello', 'lol'])
disc_chat = disc_chat.loc[~is_filler_post]

In [None]:
# Keep only the rows that actually have content (drops NaN in 'Content').
disc_chat = disc_chat.dropna(subset=['Content'])

In [None]:
# truncate the timestamps to calendar dates (the time-of-day component is dropped)

In [None]:
# Replace the timestamp with its calendar date (datetime.date objects, no time component).
post_dates = pd.to_datetime(disc_chat['Date']).dt.date
disc_chat = disc_chat.assign(Date=post_dates)

In [None]:
# gather starter stats

In [None]:
# Headline figures: total posts, distinct authors, and days between the first and last post.
no_posts = len(disc_chat)
no_authors = disc_chat['Author'].nunique(dropna=False)
first_day, last_day = disc_chat['Date'].min(), disc_chat['Date'].max()
no_days = (last_day - first_day).days

In [None]:
# top authors

In [None]:
# Post count per author, reduced to the five authors with the most posts.
posts_per_author = disc_chat['Author'].value_counts()
top_contributors = posts_per_author.nlargest(n=5)

In [None]:
# find the top contributor and their number of chats

In [None]:
# The index of top_contributors holds the author names; the first entry is taken as the top author.
ranked_authors = top_contributors.index
top_contributor = ranked_authors[0]

In [None]:
# Post count stored in the first position of top_contributors.
top_chats = top_contributors.iat[0]

In [None]:
# how many contributors had more than 1 post

In [None]:
# Number of authors with more than one post.
posts_per_author = disc_chat['Author'].value_counts()
multi_contributors = int((posts_per_author > 1).sum())

In [None]:
# how many had more than 100 posts

In [None]:
# Number of authors with more than 100 posts.
posts_per_author = disc_chat['Author'].value_counts()
hundy_contributors = int((posts_per_author > 100).sum())

In [None]:
# grouped data

In [None]:
# Group the posts by calendar date and sum every remaining column within each day.
# NOTE(review): for text columns a sum concatenates the strings - confirm this is intended.
posts_by_date = disc_chat.groupby('Date')
test_group = posts_by_date.sum()

In [None]:
# Preview the first five rows of the grouped frame.
test_group.head(n=5)

In [None]:
# Render the headline statistics as a Markdown sentence.
overview_template = "Over a period of %i days there were %i posts from %i unique authors."
md(overview_template % (no_days, no_posts, no_authors))

In [None]:
# Render the contributor statistics as a Markdown sentence.
contributor_template = "The top contributer over the period is %s with a total of %i chats.  There were %i contributers that made more than one chat and %i that had over 100 chats."
contributor_values = (top_contributor, top_chats, multi_contributors, hundy_contributors)
md(contributor_template % contributor_values)

In [None]:
# how does the number of chats change over time

In [None]:
# Number of posts on each date, wrapped in a one-column frame for plotting.
daily_post_counts = disc_chat.groupby('Date').size()
tmser = daily_post_counts.to_frame()

In [None]:
# Line chart of the number of posts per date.
fig, ax = plt.subplots(figsize=(12, 6))
ax.set(title='Count of Chats', xlabel='Date', ylabel='Chats')
sns.lineplot(data=tmser, ax=ax)

In [None]:
# how does the number of chats change over time for the top contributors

In [None]:
# Plain Python list of the top contributors' names, taken from the index of top_contributors.
top_cont_list = top_contributors.index.tolist()

In [None]:
# Independent copy of the posts written by the authors in top_cont_list.
is_top_author = disc_chat['Author'].isin(top_cont_list)
top_disc_chat = disc_chat.loc[is_top_author].copy()

In [None]:
# Number of posts per date among the top contributors, as a one-column frame for plotting.
top_daily_post_counts = top_disc_chat.groupby('Date').size()
top_tmser = top_daily_post_counts.to_frame()

In [None]:
# Line chart of the number of posts per date, restricted to the top contributors.
# Uses an explicit Axes so the labels are attached to this figure only, replaces the
# placeholder 'Test Plot' title with a descriptive one, and ends with plt.show() so the
# cell does not echo a Text(...) repr as its output.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=top_tmser, ax=ax)
ax.set(
    title='Count of Chats by Top Contributors',
    xlabel='Date',
    ylabel='Chats by Top Contributors',
)
plt.show()

In [None]:
import tensorflow
import torch
from transformers import pipeline

In [None]:
# Build the sentiment-analysis pipeline from the named model.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="siebert/sentiment-roberta-large-english",
)
# Second name bound to the same pipeline object.
sentiment_analysis = sentiment_pipeline

In [None]:
# Two example sentences used to try out the sentiment pipeline.
data = [
    "I love you",
    "I hate you",
]

In [None]:
# Run the example sentences through the sentiment pipeline; as the last expression
# in the cell, the returned result is displayed as the cell output.
sentiment_pipeline(data)