## Preliminary data analysis metrics

> An explanation of the central idea behind your final project (What is the idea? Why is it interesting? Which datasets did you need to explore the idea? How did you download them?)

Required imports

In [1]:
# clean up outputs from warnings
import warnings
warnings.filterwarnings("ignore")

import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import scraper
import utils
import numpy as np
import pandas as pd

from nltk import FreqDist
import pathlib
from multiprocessing import Pool
import pickle
from tqdm import tqdm
import time

plt.rcParams['figure.figsize'] = [10, 8]

If something related to tqdm fails, run:
> `pip install ipywidgets widgetsnbextension pandas-profiling`

Changes?
> `pip freeze > requirements.txt`

## Downloading the data

In [2]:
category_titles = [
    "Category:Wikipedia_level-1_vital_articles",
    "Category:Wikipedia_level-2_vital_articles",
    "Category:Wikipedia_level-3_vital_articles",
    #"Category:Wikipedia_level-4_vital_articles",
    #"Category:Wikipedia_level-5_vital_articles"
]

page_graph, infos  = await scraper.scrape_wiki(category_titles)

Fetching 3 categories: 100%|██████████| 3/3 [00:00<00:00,  5.59it/s]
Fetching 1001 article pages: 100%|██████████| 101/101 [00:00<00:00, 127.51it/s]
Parsing talk page batches: 100%|██████████| 101/101 [00:00<00:00, 8489.64it/s]


{'Earth': ['All Wikipedia articles written in American English',
  'All articles containing potentially dated statements',
  'All articles lacking reliable references',
  'Articles containing Ancient Greek (to 1453)-language text',
  'Articles containing potentially dated statements from September 2021',
  'Articles lacking reliable references from February 2023',
  'Articles with BNE identifiers',
  'Articles with BNF identifiers',
  'Articles with BNFdata identifiers',
  'Articles with EMU identifiers',
  'Articles with FAST identifiers',
  'Articles with GND identifiers',
  'Articles with ISNI identifiers',
  'Articles with J9U identifiers',
  'Articles with LCCN identifiers',
  'Articles with NARA identifiers',
  'Articles with NDL identifiers',
  'Articles with NKC identifiers',
  'Articles with NLK identifiers',
  'Articles with TDVİA identifiers',
  'Articles with VIAF identifiers',
  'Articles with WorldCat Entities identifiers',
  'Articles with hAudio microformats',
  'Articl

Fetching 3 categories: 100%|██████████| 3/3 [00:00<00:00,  5.57it/s]
Fetching 1001 article pages: 100%|██████████| 21/21 [00:00<00:00, 56.18it/s]
Parsing talk page batches:   0%|          | 0/21 [00:00<?, ?it/s]


UnboundLocalError: local variable 'categories' referenced before assignment

A walk-through of your preliminary data analysis, addressing:
> - What is the total size of your data? (MB, number of pages, other variables, etc.)

In [None]:
infos.keys()

NameError: name 'infos' is not defined

In [None]:
talk_pages = [title for title in page_graph.nodes if title[:5] == "Talk:"]
users = [title for title in page_graph.nodes if title[:5] == "User:"]
print("Number of pages in the vital articles dataset:", len(infos["titles"]))
print("Number of related archived pages:", len(infos["archive_titles"]))
print("Number of users found in relation to the dataset:", len(users))
print("No. nodes", page_graph.number_of_nodes())
print("No. links:", page_graph.number_of_edges())

> - What is the network you will be analyzing? (number of nodes? number of links?, degree distributions, what are node attributes?, etc.)

In [None]:
plt.rcParams['font.sans-serif'] = 'DejaVu Sans'

graph = page_graph.copy()

# Only keep users with a high degree
for node in page_graph.nodes(data=True):
    if "page_class" in node[1]:
        if node[1]["page_class"] == "user":
            if page_graph.out_degree(node[0]) <= 10:
                graph.remove_node(node[0])
    else:
        graph.remove_node(node[0])

# Remove outliers
cc = nx.weakly_connected_components(graph)
largest_c = max(cc, key=lambda x: len(x))
rsubgraph = nx.subgraph(graph, largest_c)

# Positions (currently unused)
#pos = nx.nx_agraph.graphviz_layout(rsubgraph, prog="neato")

# Color and size according to coast and degree
color_talk = "#0000FF"
color_user = "#FF0000"
node_colors = [color_talk if node[1]["page_class"] == "talk" else color_user for node in rsubgraph.nodes(data=True)]
node_sizes = [rsubgraph.degree(node) for node in rsubgraph.nodes]

nx.draw(rsubgraph, with_labels=True, font_weight='light', font_size=5, node_size=node_sizes, width=.1, edge_color="#555555", arrowsize=2, node_color=node_colors)

In [None]:
# sorted(rsubgraph.degree, key=lambda item: item[1], reverse=True)[:10]
import pandas as pd

degrees = pd.DataFrame(rsubgraph.degree, columns=["Node", "Degree"])
degrees.head()

In [None]:
degrees["PageType"] = ["User" if node[:5] == "User:" else "Talk" for node in degrees.Node ]
users = degrees[degrees.PageType == "User"]
potential_bots = users[users["Node"].str.contains('bot', case=False)]
print(f"{len(potential_bots)} users found with with bot in their name:")
print(",\n".join(potential_bots.Node.values))

After evaluation we found that `User:Botteville`, `User:KP Botany` and `User:NinjaRobotPirate` are human users. Therefore we can filter out the bots by name.


In [None]:
bot_names = ["User:Community Tech bot", "User:PrimeBOT", "User:InternetArchiveBot", "User:AnomieBOT", "User:RMCD bot", "User:Cyberbot II", "User:CommonsNotificationBot",
"User:LinkBot", "User:FairuseBot", "User:BetacommandBot", "User:Legobot", "User:DumZiBoT"]

human_users = users[~users["Node"].isin(bot_names)]
n = 10
top_n_human_df = human_users.sort_values(["Degree"], ascending=False).head(n)
top_n_human_df

Make subgraph with users and their related pages.



In [None]:
top_n_human_names = top_n_human_df.Node.values
top_users_graph = graph.subgraph(sum([list(graph.neighbors(node)) + [node] for node in top_n_human_names], []))
pages = [node for node in top_users_graph.nodes if node not in top_n_human_names] 

node_colors = [color_talk if node[1]["page_class"] == "talk" else color_user for node in top_users_graph.nodes(data=True)]
node_sizes = [rsubgraph.degree(node) for node in top_users_graph.nodes]

In [None]:
degrees = dict(top_users_graph.degree())
sorted_nodes = sorted(degrees, key=degrees.get, reverse=True)
top_nodes_count = 10
top_nodes = [node for node in sorted(degrees, key=degrees.get, reverse=True) if node[:5] == "Talk:"][:top_nodes_count]
labels = {node: node for node in top_nodes}

In [None]:
from fa2 import ForceAtlas2

forceatlas2 = ForceAtlas2(
                        # Behavior alternatives
                        outboundAttractionDistribution=True,  # Dissuade hubs
                        linLogMode=False,  # NOT IMPLEMENTED
                        adjustSizes=False,  # Prevent overlap (NOT IMPLEMENTED)
                        edgeWeightInfluence=1.0,

                        # Performance
                        jitterTolerance=1.0,  # Tolerance
                        barnesHutOptimize=True,
                        barnesHutTheta=2.0,
                        multiThreaded=False,  # NOT IMPLEMENTED

                        # Tuning
                        scalingRatio=3.0,
                        strongGravityMode=False,
                        gravity=0.1,

                        # Log
                        verbose=True)
pos=forceatlas2.forceatlas2_networkx_layout(top_users_graph, pos=None, iterations=2000)
    
nx.draw(top_users_graph, pos=pos, node_color=node_colors, node_size=node_sizes, edge_color="#999999", width=0.3, alpha=0.5)
plt.show()

In [None]:
pos = nx.spring_layout(top_users_graph)
label_pos = {node: (pos[node][0], pos[node][1] + 0.15) for node in top_nodes}
nx.draw(top_users_graph, pos=pos, node_color=node_colors, node_size=node_sizes, edge_color="#999999", width=0.3, alpha=0.5, with_labels=False)
nx.draw_networkx_labels(top_users_graph, pos, labels=labels, font_size=10, font_color='black')
plt.show()

## Basic Analysis

In [None]:
import pandas as p
import matplotlib.pyplot as plt
import powerlaw
import scipy.stats as sps

# Basic Statistics

count_nodes = len(page_graph)
count_edges = len(page_graph.edges())

# Create degree statistic dicts
degrees = dict(page_graph.degree())
in_degrees = dict(page_graph.in_degree())
out_degrees = dict(page_graph.out_degree())

talk_page_in_degrees = {k: v for k, v in in_degrees.items() if page_graph.nodes[k]["page_class"] == "talk"}
user_out_degrees = {k: v for k, v in out_degrees.items() if page_graph.nodes[k]["page_class"] == "user"}

def find_top(n, stat_dict):
    degrees_pages = []
    degrees_users = []
    top_overall = ""

    for page, degree in dict(sorted(stat_dict.items(), key=lambda item: item[1], reverse=True)).items():
        if top_overall == "":
            top_overall = page + " - " + str(degree)

        # stat dicts don't distinguish between east/west, so we'll do that here
        if len(degrees_pages) < n and page_graph.nodes[page]["page_class"] == "talk": 
            degrees_pages.append(page + " - " + str(degree))
        elif len(degrees_users) < n and page_graph.nodes[page]["page_class"] == "user":
            degrees_users.append(page + " - " + str(degree))

        if len(degrees_pages) >= n and len(degrees_users) >= n:
            break  # found all top v
    
    return degrees_pages, degrees_users, top_overall

degrees_pages, degrees_users, top_overall = find_top(10, degrees)

print("Number of nodes: " + str(count_nodes))
print("Number of links: " + str(count_edges))

print()
print("Highest degrees for pages:")
print("> Overall:")
print(top_overall)
print("> Pages:")
print("\n".join(degrees_pages))
print("> Users:")
print("\n".join(degrees_users))


# Degree multiplicities
in_degrees_counts = p.Series(talk_page_in_degrees.values()).value_counts()
out_degrees_counts = p.Series(user_out_degrees.values()).value_counts()

max_degree = max([max(in_degrees_counts.index), max(out_degrees_counts.index)])
max_multiplicity = max([max(in_degrees_counts.values), max(out_degrees_counts.values)])
range_x = range(1, max_degree + 1)

in_degrees_counts_interp = in_degrees_counts.reindex(range(max_degree+1), fill_value=0).sort_index()
out_degrees_counts_interp = out_degrees_counts.reindex(range(max_degree+1), fill_value=0).sort_index()

# Exponents
fit_in = powerlaw.Fit(in_degrees_counts.sort_index().values, verbose=False)
fit_out = powerlaw.Fit(out_degrees_counts.sort_index().values, verbose=False)

exp_in = fit_in.alpha
exp_out = fit_out.alpha

print("Exponents:")
print("In-degrees: " + str(exp_in) + " sigma: " + str(fit_in.sigma))
print("Out-degrees: " + str(exp_out) + " sigma: " + str(fit_out.sigma))

In [None]:
import math
def fpl(x, a):
    return x ** (-a)

# Plots
fig, axs = plt.subplots(2, 2)

# Sturges rule
no_bins_sturges = int(1 + math.log(len(talk_page_in_degrees.values()), 2))

axs[0, 0].scatter(in_degrees_counts.index, in_degrees_counts.values, s=5, label='Data')
#hist, bin_edges, _ = axs[0, 0].hist(talk_page_in_degrees.values(), bins=no_bins_sturges, edgecolor='white', label='Data')
#axs[0, 0].set_xticks(bin_edges)
#axs[0, 0].set_xticklabels(['%.0f' % val for val in bin_edges], rotation=45)
axs[0,0].set_title('Multiplicity of In-degrees for Talk pages')
axs[0,0].legend()

axs[0,1].scatter(in_degrees_counts.index, in_degrees_counts.values, s=5, label='Data')
#axs[0,1].plot(range_x, fpl(range_x, exp_in) * count_nodes, 'k-', lw=1, alpha=.75, label='Power Law fit')
axs[0,1].set_yscale('log')
axs[0,1].set_xscale('log')
axs[0,1].set_title('Multiplicity of In-degrees for Talk pages [log-log]')
axs[0,1].set_xlim(1, max_degree)
axs[0,1].set_ylim(1, max_multiplicity)
axs[0,1].legend()

axs[1, 0].scatter(out_degrees_counts.index, out_degrees_counts.values, color="red", s=5, label='Data')
# hist, bin_edges, _ = axs[1, 0].hist(talk_page_in_degrees.values(), bins=no_bins_sturges, color="red", edgecolor='white', label='Data')
# axs[1, 0].set_xticks(bin_edges)
# axs[1, 0].set_xticklabels(['%.0f' % val for val in bin_edges], rotation=45)
axs[1,0].set_title('Multiplicity of Out-degrees for Users')
axs[1,0].legend()

axs[1,1].scatter(out_degrees_counts.index, out_degrees_counts.values, s=5, label='Data', color="red")
#axs[1,1].plot(range_x, fpl(range_x, exp_out) * count_nodes, 'k-', lw=1, alpha=.75, label='Power Law fit')
axs[1,1].set_yscale('log')
axs[1,1].set_xscale('log')
axs[1,1].set_title('Multiplicity of Out-degrees for Users [log-log]')
axs[1,1].set_xlim(1, max_degree)
axs[1,1].set_ylim(1, max_multiplicity)
axs[1,1].legend()

for ax in axs.flat:
    ax.set(xlabel='Degree', ylabel='Multiplicity')

fig.tight_layout()

# Sentiment analysis per user

For this, we will first need to extract all the comments from all pages, as well as the author of the comment
Then we will assign all comment texts to a single author, and run sentiment analysis on the texts.

In [26]:
wikipage_folder = pathlib.Path("./page_contents/")
filenames = list(wikipage_folder.rglob("*.txt"))

#worker_results = [parse_comments_from_pages(filenames[:100])]

with Pool(12) as pool:
    # perform calculations
    worker_results = pool.map(utils.parse_comments_from_pages, utils.chunk_list(filenames, 20))

failed to parse: page_contents/Talk:Crustacean.txt
failed to parse: page_contents/Talk:Natural rubber.txt
failed to parse: page_contents/Talk:Weak interaction.txt
failed to parse: page_contents/Talk:School/Archive 1.txt
failed to parse: page_contents/Talk:City.txt
failed to parse: page_contents/Talk:The Buddha/Archive 15.txt
failed to parse: page_contents/Talk:Racism/Archive 25.txt
failed to parse: page_contents/Talk:Scientific Revolution.txt
failed to parse: page_contents/Talk:The Buddha/Archive 17.txt
failed to parse: page_contents/Talk:News.txt
failed to parse: page_contents/Talk:State (polity)/Archive 1.txt
failed to parse: page_contents/Talk:State (polity)/Archive 2.txt
failed to parse: page_contents/Talk:Economy.txt
failed to parse: page_contents/Talk:Sport of athletics.txt
failed to parse: page_contents/Talk:Soviet Union/Archive 3.txt
failed to parse: page_contents/Talk:The Buddha.txt
failed to parse: page_contents/Talk:World War II/Archive 59.txt
failed to parse: page_contents/

In [28]:
with open("./parsing_worker_results.pkl", "wb") as file:
    pickle.dump(worker_results, file)

In [2]:
with open("./parsing_worker_results.pkl", "rb") as file:
    worker_results = pickle.load(file)

In [3]:
def append_or_create_list_in_dict(dict, key, value):
    if(key not in dict):
        dict[key] = []
    
    dict[key].append(value)

def append_comment_to_talkpage(page_dict, filepath, comment):
    '''Append a comment to a list of comments on that talk page. Will collect all comments for a specific talk
    page, including comments archived talk pages'''
    filepath_parts = filepath.split('/')

    if(len(filepath_parts) > 2):
        append_or_create_list_in_dict(page_dict, filepath_parts[1], comment)

    else:
        filename_parts = filepath_parts[1].split('.')
        append_or_create_list_in_dict(page_dict,filename_parts[0], comment)

In [4]:
author_dict = {}
list_for_df = []
page_dict = {}

# iterate over the results by the workers
# and transform the output into a dictionary with the users as keys
# and their comments as text
for worker_result in worker_results:
    for filepath, page in worker_result:
        for subsection in page["sections"]:
            if subsection.get("heading"):
                for comments in subsection.get("comments"):
                    if comments.get("author"):
                        for author, comment in utils.parse_comment_subcomment(comments):
                            append_or_create_list_in_dict(author_dict, author, comment)
                            append_comment_to_talkpage(page_dict, filepath, comment)
                            list_for_df.append([author, comment, filepath])

## Analyze sentiment on per-author basis

In [5]:
import utils
# show the top 5 authors written the most text in comment pages
# before tokenizing the comments
items = author_dict.items()
items_sorted = sorted(items, key=lambda x: len(x[1]), reverse=True)
[(author, len(utils.flatten(comments))) for author, comments in items_sorted][:5]

[('Slrubenstein', 2945947),
 (None, 1447914),
 ('The Four Deuces', 1509544),
 ('Rick Norwood', 1548158),
 ('Fowler&amp;fowler', 2809855)]

In [6]:
labMT = pd.read_csv("./labMT.txt", sep="\t")
# to facilitate happiness_average value lookup
labMT.set_index("word", inplace=True)

# Do sentiment analysis
# code taken from assignment 2

def sentiment(tokens):
    if(len(tokens) == 0):
        return
    freq = FreqDist(tokens)

    # filter for the vocabulary we can evaluate with LabMT
    vocab = list(filter(lambda word: word in labMT.index, np.unique(tokens)))

    # array of each token's average happiness weighted by the token's frequency
    weighted_happiness = np.fromiter((freq[word] * labMT.loc[word].happiness_average for word in vocab), dtype=float)
    # each token's frequency
    word_frequencies = np.fromiter((freq[word] for word in vocab), dtype=float)
    return np.sum(weighted_happiness) / np.sum(word_frequencies)

In [7]:
def extract_sentiments(key, text):
    text = [utils.tokenize_custom(s) for s in text]
    text = utils.flatten(text)
    # compute sentiment for individual rapper wiki page
    if len(text) > 100:
        sentiment_value = sentiment(text)
        if(sentiment_value):
            return (key, sentiment_value)

with Pool(12) as pool:
    worker_results = pool.starmap(extract_sentiments, author_dict.items(), 40)
    worker_results = [worker_result for worker_result in worker_results if worker_result is not None]

In [None]:
sentiments = {author: sentiment for author, sentiment in worker_results}
sentiments_df = pd.DataFrame({"Author": sentiments.keys(), "comment_happiness": sentiments.values()})
# use the author's name as index
sentiments_df.set_index("Author", inplace=True)
sentiments_df.head()

Unnamed: 0_level_0,comment_happiness
Author,Unnamed: 1_level_1
Asilvering,5.452088
NotTheFakeJTP,5.449191
WP Ludicer,5.322311
WelpThatWorked,5.437543
VHarbee,5.332811


In [None]:
print("Top 10 authors with the happiest comments:")
best_sentiments = sentiments_df.sort_values("comment_happiness", ascending=False).head(10)
best_sentiments

Top 10 authors with the happiest comments:


Unnamed: 0_level_0,comment_happiness
Author,Unnamed: 1_level_1
2400:1A00:B050:D9DB:10E3:BD5B:F365:EAF9,5.950227
Lia 199712,5.897957
88.107.193.220,5.888372
64.252.198.75,5.884074
2A02:CE0:1801:50E:C51D:9330:A910:98D3,5.87954
Bkusmono,5.855294
76.2.40.115,5.841782
Alex Ramon,5.838727
Krishna mainali777,5.830319
MB297,5.805904


In [None]:
print("Top 10 authors with the saddest comments:")
worst_sentiments = sentiments_df.sort_values("comment_happiness", ascending=True).head(10)
worst_sentiments

Top 10 authors with the saddest comments:


Unnamed: 0_level_0,comment_happiness
Author,Unnamed: 1_level_1
93.229.148.172,4.7345
Cyrillic,4.756739
Pissedpat,4.803218
Nordenfeldt,4.806176
78.2.102.150,4.824016
71.192.41.250,4.825415
2600:4040:A4DB:A600:8059:2428:93E0:3FD6,4.84675
181.63.26.168,4.848586
203.59.112.221,4.857449
BonoboSpiderMonkey,4.863038


In [None]:
# this shows the original comment before tokenizin
[s for s in author_dict.get(best_sentiments.iloc[0].name)]

[" == Movie ==\n \n The 1st Nepali film made in Nepal is Aama.\n The 1st colourful film in the world is Gone with the Wind.\n The 1st actor of Nepali film was Shiva Shankar Mandhar.\n The 1st motion picture made by Nepal film corporation was Paral Ko Ago.\n The 1st Hollywood film to win 11 Oscar awards was Ben Hur.\n The first Nepali Colour movie is Kumari.\n Carvan(Himalayan) is 1st Nepali film made in India.\n Tatum O'Neal was 1st child star to win Oscar for her portrayal of Addie in Paper Moon.\n The 1st Nepali film nominated for the Academy Award was Carvan(Himalayan).\n The 1st Bollywood film to win the Oscar awards was Slam Dog. [[Special:Contributions/2400:1A00:B050:D9DB:10E3:BD5B:F365:EAF9|2400:1A00:B050:D9DB:10E3:BD5B:F365:EAF9]] ([[User talk:2400:1A00:B050:D9DB:10E3:BD5B:F365:EAF9|talk]]) 04:19, 13 November 2022 (UTC)\n"]

In [None]:
[s for s in author_dict.get(worst_sentiments.iloc[0].name)]

[' == Copper Vectoring ==\n \n Im Artikel steht der Satz \'\'\'"Kupfer leitet den elektrischen Strom sehr gut (58 · 106 S/m)."\'\'\' \n <br>Das Kupfernetz der Deutschen Telekom in Deutschland besteht zu 100 Prozent aus Kupfer. Die elektrische Spannung V beträgt zwischen 60-70 Volt. Der elektrische Stromfluss A ist variabel (Elektronik bis 5 Ampere). Die Bundesnetzagentur hat entschieden, dass im Netz der Deutschen Telekom der \'\'\'Bitstrom\'\'\' (Datenpakete) mit 100 MBit/s (300 MBit/s - Leitung/virtuelle Leitung/Leitung-) fließen soll. Dazu investiert die Deutsche Telekom ab 2013 zirka 200 Euro pro Haushalt, bei 24 Millionen Haushalten sind das  4,8 Milliarden Euro. Dabei wird die Kabelverteilertechnik in 330.000 Kabelverteilern in Deutschland durch Technik der Firma [[Alcatel Lucent]] ersetzt. Auf Seiten der Telekomkunden und Mitbewerberkunden werden entsprechende 100 MBit/s-Modems benötigt.  [[Special:Contributions/93.229.148.172|93.229.148.172]] ([[User talk:93.229.148.172|talk]])

### Results from per-author sentiment analysis

What we have seen is that comment sentiment mostly correlates with the topic they are discussing. E.g.: topics discussing the pages fear, Nazi, Hitler have a very low score, but comments discussing e.g. happiness, or the great barrier reef have very high sentiments.

These results show, that it is not beneficial to use the labMT approach to find toxic comments. Different methods should be used for our problem.
To show this also concretely, we will show a statistical correlation between talk page sentiment & article page sentiment.

### Per-talkpage sentiment analysis

In [None]:
def extract_talkpage_sentiments(pagename, text):
    pagename = pagename.replace('Talk:', '')
    text = [utils.tokenize_custom(s) for s in text]
    text = utils.flatten(text)
    # compute sentiment for individual rapper wiki page
    if len(text) > 100:
        sentiment_value = sentiment(text)
        if(sentiment_value):
            return (pagename, sentiment_value)


with Pool(12) as pool:
    worker_results = pool.starmap(extract_talkpage_sentiments, page_dict.items(), 40)
    worker_results = [worker_result for worker_result in worker_results if worker_result is not None]

sentiments_talk_pages = {page: sentiment for page, sentiment in worker_results}

sentiments_pages_df = pd.DataFrame({"pagename": sentiments_talk_pages.keys(), "talkpage_sentiment": sentiments_talk_pages.values()})
# use the author's name as index
# sentiments_pages_df.set_index("pagename", inplace=True)
sentiments_pages_df.head()

Unnamed: 0,pagename,talkpage_sentiment
0,Black Death,5.28306
1,French language,5.443214
2,China,5.355382
3,Africa,5.369347
4,Virus,5.269483


In [None]:
print("Top 10 happiest talk pages:")
best_sentiments = sentiments_pages_df.sort_values("talkpage_sentiment", ascending=False).head(10)
best_sentiments

Top 10 happiest talk pages:


Unnamed: 0,pagename,talkpage_sentiment
576,Memory,5.577665
931,Friendship,5.545372
80,Dance,5.541977
190,Drink,5.537447
792,Play (activity),5.534066
760,Festival,5.533009
546,Happiness,5.519176
860,Entertainment,5.518798
548,Heredity,5.516819
405,Garden,5.51665


In [None]:
print("Top 10 saddest talk pages:")
worst_sentiments = sentiments_pages_df.sort_values("talkpage_sentiment", ascending=True).head(10)
worst_sentiments

Top 10 saddest talk pages:


Unnamed: 0,pagename,talkpage_sentiment
19,Fear,5.188344
635,Terrorism,5.211078
53,War,5.215088
826,Violence,5.219212
649,Nuclear weapon,5.227378
286,Disease,5.228522
159,Weapon,5.230729
642,Tax,5.254547
510,Anger,5.25953
18,Slavery,5.260223


### Per-article-page sentiment analysis results


In [None]:
wikipage_folder = pathlib.Path("./article_pages_plaintext/")
filepaths = list(wikipage_folder.rglob("*.txt"))

# calculate sentiments for article pages
def extract_articlepage_sentiments(filepath):
    pagename = filepath.stem
    with open(filepath, 'r') as file:
        text = file.read()
        return (pagename, sentiment(utils.tokenize_custom(text)))
    
with Pool(12) as pool:
    worker_results = pool.map(extract_articlepage_sentiments, filepaths, 5)
    worker_results = [worker_result for worker_result in worker_results if worker_result is not None]

In [None]:
sentiments_article_pages =  {pagename: sentiment for pagename, sentiment in worker_results}

### Look into potential correlation of article and talk page sentiments

In [None]:
# insert the article page dictionary into the dataframe
sentiments_pages_df["articlepage_sentiment"] = sentiments_pages_df["pagename"].map(sentiments_article_pages)
# check that we didn't get any null values becuase a page didn't have a match
sentiments_pages_df.isnull().values.any()

True

In [None]:
# compute correlation of talk pages and article pages using pearson method 
# see wikipedia which pandas documentetation refers to
#sentiments_pages_df.corr(numeric_only=True)

Unnamed: 0,talkpage_sentiment,articlepage_sentiment
talkpage_sentiment,1.0,0.83011
articlepage_sentiment,0.83011,1.0


In [None]:
# create graph showing correlation value

sentiments_pages_df.scatter()

### Interpretation of correlation results

The results give a correlation coefficient of 0.83. 
This means, the sentiments of article and talk pages are strongly positively correlated (anything over 0.5 is considered strongly correlated). 
Thus, if the sentiment of a article page is higher,
the probability for the talk page to have a higher sentiment is high.

## Toxic comment extraction & analysis

In [None]:
df_comments = pd.DataFrame(list_for_df, columns = ['Author', 'Comment', 'Filename']) 

In [None]:
# imports for loading pickles
from nltk.tokenize import word_tokenize
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk

In [None]:
# Flatten the dictionary values
#flattened_data = [(key, value) for key, values in author_dict.items() for value in values]

# Create a DataFrame from the flattened data
#df = pd.DataFrame(flattened_data, columns=['Author', 'Comment'])

# load the vectorizer
with open('./sentiment-models/vectorizer.pkl', 'rb') as file:
    vec = pickle.load(file)

In [None]:
# create sparse TF-IDF matrix with vectorizer trained on kaggle toxic comment dataset

comments_sparse = vec.transform(df_comments['Comment'])

In [None]:
# Classify comments according to the following categories:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

preds = np.zeros((len(df_comments), len(label_cols)))

for i, j in enumerate(label_cols):
    print('predict', j)
    with open(f'./sentiment-models/{j}.pkl', 'rb') as file:
        m,r = pickle.load(file)
    preds[:,i] = m.predict_proba(comments_sparse.multiply(r))[:,1]
    df_comments[j] = preds[:,i]

# this created a matrix where each new column for label_cols contains the probability that a comment is in
# the category with the same title as the column name 

predict toxic
predict severe_toxic
predict obscene
predict threat
predict insult
predict identity_hate


In [None]:
# output all offensive comments
authors = []
for row in df_comments[df_comments["toxic"] > 0.9].iterrows():
    if row[1]["Author"]:
        authors.append(row[1]["Author"])
    print(row[1]["Filename"], row[1]["Author"], row[1]["toxic"], row[1]["Comment"])

page_contents/Talk:Scramble for Africa.txt DePiep 0.9962537910110114  :::::Sure you don't have to reply. But unless you ''quote me saying what you put in my mouth'', you can just as well shut up and fuck off. -[[User:DePiep|DePiep]] ([[User talk:DePiep|talk]]) 11:08, 2 December 2014 (UTC)

page_contents/Talk:Russia.txt GreenMeansGo 0.9062575750097962  :::Yeah, well I can call Joe Biden a big dumb idiot, and say he's too old to be using Legos, and that his feet probably smell bad, and somehow, I have no fear I'll be poisoned or haphazardly fall out a window. [[User:GreenMeansGo|<span style="font-family:Impact"><span style="color:#07CB4B">G</span><span style="color:#449351">M</span><span style="color:#35683d">G</span></span>]][[User talk:GreenMeansGo#top|<sup style="color:#000;font-family:Impact">talk</sup>]] 12:50, 11 August 2023 (UTC)

page_contents/Talk:Association football.txt 78.86.130.228 0.9946258023660924  
 :::::::::::::It's called football. You people are ridiculous. Fifa. No s

In [None]:
seen = set()
multiple_offenders = [x for x in authors if x in seen or seen.add(x)]    

# number of people that have written multiple toxic comments
len(set(multiple_offenders))


19

In [None]:
# output all toxic comments of multiple offenders
for row in df_comments[df_comments["toxic"] > 0.9].iterrows():
    if row[1]["Author"] in multiple_offenders:
        print(f'{row[1]["Filename"]}, Author: {row[1]["Author"]}, Comment: {" ".join(word_tokenize(row[1]["Comment"]))}')

page_contents/Talk:Russia.txt, Author: GreenMeansGo, Comment: : : : Yeah , well I can call Joe Biden a big dumb idiot , and say he 's too old to be using Legos , and that his feet probably smell bad , and somehow , I have no fear I 'll be poisoned or haphazardly fall out a window . [ [ User : GreenMeansGo| < span style= '' font-family : Impact '' > < span style= '' color : # 07CB4B '' > G < /span > < span style= '' color : # 449351 '' > M < /span > < span style= '' color : # 35683d '' > G < /span > < /span > ] ] [ [ User talk : GreenMeansGo # top| < sup style= '' color : # 000 ; font-family : Impact '' > talk < /sup > ] ] 12:50 , 11 August 2023 ( UTC )
page_contents/Talk:Soviet Union/Archive 5.txt, Author: Tourskin, Comment: PEOPLE ARE ARRESTED FOR BEING ARABS ? ! ARE YOU GODDAMN CRAZY ? ! [ [ User : Tourskin|Tourskin ] ] 02:37 , 2 June 2007 ( UTC )
page_contents/Talk:Soviet Union/Archive 5.txt, Author: Tourskin, Comment: == Nukes reply == I can tell you why they put a picture of an at

In [None]:
pages_to_categories = await scraper.get_wikipedia_article_categories(category_titles)

Fetching 3 categories: 100%|██████████| 3/3 [00:00<00:00,  5.65it/s]
Fetching 1001 article pages: 100%|██████████| 101/101 [00:01<00:00, 72.47it/s]
Parsing talk page batches: 100%|██████████| 101/101 [00:00<00:00, 9169.76it/s]


In [None]:
pages_to_categories

{'Earth': ['All Wikipedia articles written in American English',
  'All articles containing potentially dated statements',
  'All articles lacking reliable references',
  'Articles containing Ancient Greek (to 1453)-language text',
  'Articles containing potentially dated statements from September 2021',
  'Articles lacking reliable references from February 2023',
  'Articles with BNE identifiers',
  'Articles with BNF identifiers',
  'Articles with BNFdata identifiers',
  'Articles with EMU identifiers',
  'Articles with FAST identifiers',
  'Articles with GND identifiers',
  'Articles with ISNI identifiers',
  'Articles with J9U identifiers',
  'Articles with LCCN identifiers',
  'Articles with NARA identifiers',
  'Articles with NDL identifiers',
  'Articles with NKC identifiers',
  'Articles with NLK identifiers',
  'Articles with TDVİA identifiers',
  'Articles with VIAF identifiers',
  'Articles with WorldCat Entities identifiers',
  'Articles with hAudio microformats',
  'Articl

In [None]:
categories_to_pages = { page: [] for page in set([value for categories_page in pages_to_categories.values() for value in categories_page])}

for page, categories in pages_to_categories.items():
    for category in categories:
        categories_to_pages[category].append(page)

list_categories_with_most_pages = sorted(categories_to_pages, key = lambda x: len(categories_to_pages[x]), reverse=True)

In [None]:
# number of distinct categories

len(set([value for categories_page in pages_to_categories.values() for value in categories_page]))

9522

In [None]:
len(df_comments[df_comments["toxic"] > 0.9])

NameError: name 'df_comments' is not defined