In [1]:
#!pip install vaderSentiment

In [2]:
import os, json
import pandas as pd
import regex as re
import numpy as np
import pickle

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk

import warnings
warnings.filterwarnings('ignore')

In [3]:
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [4]:
data = pd.read_csv("Washington_Post_NER_all_10K_as_imput_for_enrichment")

## Enrichment of the DataFrame

### Category

In [5]:
def findCategory(x):
    if re.search('-',x):
        return "other"
    else:
        return x

In [6]:
def convertURL(predata):
    predata["category"] = predata.apply(lambda row: str(row["article_url"]).split('/')[3] , axis=1)
    predata["category"] = predata.apply(lambda row: findCategory(str(row["category"])) , axis=1)
    return predata   

### Sentiment Analysis

In [24]:
def getSentimentScore(predata):
    analyzer = SentimentIntensityAnalyzer()
    cs = []
    for row in range(len(data)):
        cs.append(analyzer.polarity_scores(data['title'].iloc[row])['compound'])
    predata['title_vader_score'] = cs
    predata = predata[(predata[['title_vader_score']] != 0).all(axis=1)].reset_index(drop=True)
    return predata    

In [8]:
def sentiment(vader_score):
    if vader_score >= 0.05 : 
        return("Positive") 
    elif vader_score <= - 0.05 : 
        return("Negative")  
    else : 
        return("Neutral")

In [9]:
def getSentiment(predata):
    getSentimentScore(predata)
    predata['title_vader_score'] = predata.apply(lambda row: sentiment(row["title_vader_score"]), axis=1)
    return predata

### Lenght

In [10]:
def getLenght(predata):
    predata["word_title"] = predata.apply(lambda row: len(nltk.word_tokenize(str(row["title"]))), axis=1)
    predata["len_title"] = predata.apply(lambda row: len(str(row["title"])), axis=1)
    return predata

### agency_communion

In [11]:
def load_list(file):
    with open(file, "r") as tf:
        lines = tf.read().split('\n')
    return lines

In [12]:
def count_occurances(row, attributes):
    counter = 0
    for attribute in attributes:
        subcounter = str(row).count(str(attribute))
        counter += subcounter
    return counter

In [26]:
def getAtributes(predata):
    agency = load_list("a_agency.txt")
    communion = load_list("a_communion.txt")
    predata["agency"] = predata.apply(lambda row: count_occurances(str(row["title"]), agency), axis=1)
    predata["communion"] = predata.apply(lambda row: count_occurances(str(row["title"]), communion), axis=1)
    predata["agency_context"] = predata.apply(lambda row: count_occurances(str(row["content"]), agency), axis=1)
    predata["communion_context"] = predata.apply(lambda row: count_occurances(str(row["content"]), communion), axis=1)
    return predata

In [27]:
def getAll(predata):
    predata = convertURL(predata)
    predata = getSentiment(predata)
    predata = getLenght(predata)
    predata = getAtributes(predata)
    return predata

In [28]:
data = getAll(data)

## Datasets

In [29]:
compare = data
compare = compare.drop(columns=["article_url","title","published_date", "source", "type", "contents", "content", "merged_total_text", "entity","entity_type", "last_name", "agency", "communion"])
compare

Unnamed: 0.1,Unnamed: 0,id,author,first_name,gender_guesser,person_author,gender_guesser_author,text,category,title_vader_score,word_title,len_title,agency_context,communion_context
0,0,b2e89334-33f9-11e1-825f-dabc29fd7071,Mark Giannotto,Danny,male,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,33,75
1,1,b2e89334-33f9-11e1-825f-dabc29fd7071,Mark Giannotto,Jarrett,male,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,33,75
2,4,b2e89334-33f9-11e1-825f-dabc29fd7071,Mark Giannotto,Danny,male,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,33,75
3,8,b2e89334-33f9-11e1-825f-dabc29fd7071,Mark Giannotto,Jarrett,male,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,33,75
4,9,b2e89334-33f9-11e1-825f-dabc29fd7071,Mark Giannotto,John,male,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,33,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174613,760782,5c9c18dc-7393-11e1-98b8-6a87961e5beb,Vanessa Williams,Abraham,male,Vanessa,female,<function createText at 0x000001E7CF406820>,lifestyle,Neutral,18,91,30,57
174614,760785,5c9c18dc-7393-11e1-98b8-6a87961e5beb,Vanessa Williams,Williams,unknown,Vanessa,female,<function createText at 0x000001E7CF406820>,lifestyle,Neutral,18,91,30,57
174615,760787,5c9c18dc-7393-11e1-98b8-6a87961e5beb,Vanessa Williams,Catherine,female,Vanessa,female,<function createText at 0x000001E7CF406820>,lifestyle,Neutral,18,91,30,57
174616,760791,5c9c18dc-7393-11e1-98b8-6a87961e5beb,Vanessa Williams,Williams,unknown,Vanessa,female,<function createText at 0x000001E7CF406820>,lifestyle,Neutral,18,91,30,57


#### compare_author: Jeder einzige Artikel einmal ohne Personen im Text

In [30]:
compare_author = data
compare_author = compare_author .drop(columns=["article_url","published_date", "source", "type", "contents", "content", "merged_total_text", "entity","entity_type", "first_name", "last_name", "gender_guesser"])
compare_author = compare_author.drop_duplicates()

In [31]:
# Calculate percentange of gender (all articles)
gender_percentage = pd.DataFrame()
gender_percentage["gender_total_author"] = compare_author.groupby("gender_guesser_author")["id"].count()
gender_percentage["gender_percent_author"] = gender_percentage["gender_total_author"] / gender_percentage["gender_total_author"].sum() * 100
gender_percentage["gender_percent_author"] = gender_percentage["gender_percent_author"].astype(int)
gender_percentage

Unnamed: 0_level_0,gender_total_author,gender_percent_author
gender_guesser_author,Unnamed: 1_level_1,Unnamed: 2_level_1
andy,529,0
female,47813,27
male,88010,50
mostly_female,2777,1
mostly_male,4156,2
unknown,31333,17


Fast die Häfte der Artikel wurde von <font color='red'>Männern</font> geschrieben. Etwa ein Viertel der Artikel stammt von <font color='blue'>Frauen</font>.

In [32]:
unique_authors = compare_author
unique_authors = unique_authors.drop(columns=["id", "title", "len_title", "word_title", "title_vader_score"])
unique_authors = unique_authors.drop_duplicates()
unique_authors

Unnamed: 0.1,Unnamed: 0,author,person_author,gender_guesser_author,text,category,agency,communion,agency_context,communion_context
0,0,Mark Giannotto,Mark,male,<function createText at 0x000001E7CF406820>,sports,0,1,33,75
1,1,Mark Giannotto,Mark,male,<function createText at 0x000001E7CF406820>,sports,0,1,33,75
2,4,Mark Giannotto,Mark,male,<function createText at 0x000001E7CF406820>,sports,0,1,33,75
3,8,Mark Giannotto,Mark,male,<function createText at 0x000001E7CF406820>,sports,0,1,33,75
4,9,Mark Giannotto,Mark,male,<function createText at 0x000001E7CF406820>,sports,0,1,33,75
...,...,...,...,...,...,...,...,...,...,...
174613,760782,Vanessa Williams,Vanessa,female,<function createText at 0x000001E7CF406820>,lifestyle,0,1,30,57
174614,760785,Vanessa Williams,Vanessa,female,<function createText at 0x000001E7CF406820>,lifestyle,0,1,30,57
174615,760787,Vanessa Williams,Vanessa,female,<function createText at 0x000001E7CF406820>,lifestyle,0,1,30,57
174616,760791,Vanessa Williams,Vanessa,female,<function createText at 0x000001E7CF406820>,lifestyle,0,1,30,57


In [33]:
# Calculate percentange of gender (single authors)
gender_percentage = pd.DataFrame()
gender_percentage["gender_total_author"] = unique_authors.groupby("gender_guesser_author")["author"].count()
gender_percentage["gender_percent_author"] = gender_percentage["gender_total_author"] / gender_percentage["gender_total_author"].sum() * 100
gender_percentage["gender_percent_author"] = gender_percentage["gender_percent_author"].astype(int)
gender_percentage

Unnamed: 0_level_0,gender_total_author,gender_percent_author
gender_guesser_author,Unnamed: 1_level_1,Unnamed: 2_level_1
andy,529,0
female,47813,27
male,88010,50
mostly_female,2777,1
mostly_male,4156,2
unknown,31333,17


Die Autorenschaft besteht aus 30% aus <font color='blue'>Frauen</font>, 51% aus <font color='red'>Männern</font> und 11% <font color='green'>unknown</font>. Das deckt sich etwa mit den geschriebenen Artikeln insgesamt

In [34]:
not_quiet_unique_authors = compare_author
not_quiet_unique_authors = not_quiet_unique_authors.drop(columns=["id", "title"])
not_quiet_unique_authors = not_quiet_unique_authors.drop_duplicates()
not_quiet_unique_authors

Unnamed: 0.1,Unnamed: 0,author,person_author,gender_guesser_author,text,category,title_vader_score,word_title,len_title,agency,communion,agency_context,communion_context
0,0,Mark Giannotto,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,0,1,33,75
1,1,Mark Giannotto,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,0,1,33,75
2,4,Mark Giannotto,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,0,1,33,75
3,8,Mark Giannotto,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,0,1,33,75
4,9,Mark Giannotto,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,0,1,33,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...
174613,760782,Vanessa Williams,Vanessa,female,<function createText at 0x000001E7CF406820>,lifestyle,Neutral,18,91,0,1,30,57
174614,760785,Vanessa Williams,Vanessa,female,<function createText at 0x000001E7CF406820>,lifestyle,Neutral,18,91,0,1,30,57
174615,760787,Vanessa Williams,Vanessa,female,<function createText at 0x000001E7CF406820>,lifestyle,Neutral,18,91,0,1,30,57
174616,760791,Vanessa Williams,Vanessa,female,<function createText at 0x000001E7CF406820>,lifestyle,Neutral,18,91,0,1,30,57


In [35]:
compare_persons = data
compare_persons = compare_persons.drop(columns=['id', 'article_url', 'title', 'author', 'published_date', 'contents',
       'type', 'source', 'content', 'merged_total_text', 'entity','entity_type', 'last_name', 
       ])
compare_persons = compare_persons.drop_duplicates()
compare_persons

Unnamed: 0.1,Unnamed: 0,first_name,gender_guesser,person_author,gender_guesser_author,text,category,title_vader_score,word_title,len_title,agency,communion,agency_context,communion_context
0,0,Danny,male,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,0,1,33,75
1,1,Jarrett,male,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,0,1,33,75
2,4,Danny,male,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,0,1,33,75
3,8,Jarrett,male,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,0,1,33,75
4,9,John,male,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,0,1,33,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174613,760782,Abraham,male,Vanessa,female,<function createText at 0x000001E7CF406820>,lifestyle,Neutral,18,91,0,1,30,57
174614,760785,Williams,unknown,Vanessa,female,<function createText at 0x000001E7CF406820>,lifestyle,Neutral,18,91,0,1,30,57
174615,760787,Catherine,female,Vanessa,female,<function createText at 0x000001E7CF406820>,lifestyle,Neutral,18,91,0,1,30,57
174616,760791,Williams,unknown,Vanessa,female,<function createText at 0x000001E7CF406820>,lifestyle,Neutral,18,91,0,1,30,57


In [36]:
data.to_csv("data_to_use")
compare.to_csv("compare")
compare_author.to_csv("compare")
unique_authors.to_csv("unique_authors")
compare_persons.to_csv("compare_persons")

## Comparing

In [37]:
def get_gendered_information(name, dataframe, focused_column, search_index):
    frame = pd.DataFrame()
    frame[str(name) + " total"] = dataframe.groupby(str(focused_column))[str(search_index)].count()
    frame[str(name) + " percent"] = frame[str(name) + " total"] / frame[str(name) + " total"].sum() * 100
    frame[str(name) + " percent"] = frame[str(name) + " percent"].astype(int)
    return frame

In [38]:
def combine_dataframes(dataframe1, dataframe2, dataframe3):
    frame = pd.DataFrame()
    frame = dataframe1.join(dataframe2, how='outer')
    frame = frame.join(dataframe3, how = 'outer')
    return frame

### Comparing Authors

In [39]:
male_authors = compare_author[compare_author.gender_guesser_author == "male"] 
female_authors = compare_author[compare_author.gender_guesser_author == "female"]
unknown_authors = compare_author[compare_author.gender_guesser_author == "unknown"] 

#### Category

In [40]:
author_male_category = get_gendered_information('male', male_authors, 'category', 'author')
author_female_category = get_gendered_information('female', female_authors, 'category', 'author')
author_unknown_category = get_gendered_information('unknown' , unknown_authors, 'category', 'author')

category_data = combine_dataframes(author_male_category, author_female_category, author_unknown_category)
category_data

Unnamed: 0_level_0,male total,male percent,female total,female percent,unknown total,unknown percent
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010,,,,,35,0
2012,62.0,0.0,,,34,0
business,4675.0,5.0,4995.0,10.0,2030,6
cars,15.0,0.0,,,2,0
conversations,12.0,0.0,44.0,0.0,3,0
entertainment,6549.0,7.0,3403.0,7.0,705,2
goingoutguide,85.0,0.0,509.0,1.0,158,0
investigations,218.0,0.0,226.0,0.0,4,0
jobs_articles,,,3.0,0.0,12,0
kidspost,,,25.0,0.0,106,0



Die <font color='red'>männlichen</font> Autoren schreiben vor allem über <font color='red'>Sport, Lokales und Politik</font>. <font color='blue'>Weibliche Autoren</font> schreiben vor allem Artikel über <font color='blue'>Lifestyle, Lokales und Business</font>.

#### Sentiment

In [41]:
author_male_sentiment = get_gendered_information('male', male_authors, 'title_vader_score', 'author')
author_female_sentiment = get_gendered_information('female', female_authors, 'title_vader_score', 'author')
author_unknown_sentiment = get_gendered_information('unknown' , unknown_authors, 'title_vader_score', 'author')

sentiment_data = combine_dataframes(author_male_sentiment, author_female_sentiment, author_unknown_sentiment)
sentiment_data


Unnamed: 0_level_0,male total,male percent,female total,female percent,unknown total,unknown percent
title_vader_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Negative,23226,26,11939,24,5860,18
Neutral,39374,44,22391,46,19070,60
Positive,25410,28,13483,28,6403,20


Der Title wird beinahme immer polarisierend geschrieben. Negative und positive ist daber gleichgewichtig. Auch im Hinblick auf das Geschlecht gibt es keine großen Auffälligkeiten.

#### Lenght

In [42]:
author_male_lenght_title = get_gendered_information('male', male_authors, 'word_title', 'author')
author_female_lenght_title = get_gendered_information('female', female_authors, 'word_title', 'author')
author_unknown_lenght_title = get_gendered_information('unknown' , unknown_authors, 'word_title', 'author')

lenght_data_title = combine_dataframes(author_male_lenght_title, author_female_lenght_title, author_unknown_lenght_title)
lenght_data_title

Unnamed: 0_level_0,male total,male percent,female total,female percent,unknown total,unknown percent
word_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,6.0,0.0,,,8.0,0.0
2,1158.0,1.0,1225.0,2.0,1340.0,4.0
3,184.0,0.0,190.0,0.0,356.0,1.0
4,574.0,0.0,724.0,1.0,909.0,2.0
5,1997.0,2.0,1479.0,3.0,2258.0,7.0
6,4214.0,4.0,2439.0,5.0,2151.0,6.0
7,6332.0,7.0,3694.0,7.0,2420.0,7.0
8,7410.0,8.0,4454.0,9.0,4499.0,14.0
9,9294.0,10.0,5163.0,10.0,2078.0,6.0
10,8646.0,9.0,4948.0,10.0,3543.0,11.0


#### agency - communion

In [43]:
author_male_agency = get_gendered_information('male', male_authors, 'agency_context', 'author')
author_female_agency = get_gendered_information('female', female_authors, 'agency_context', 'author')
author_unknown_agency = get_gendered_information('unknown' , unknown_authors, 'agency_context', 'author')

author__agency = combine_dataframes(author_male_agency, author_female_agency, author_unknown_agency)
author__agency

Unnamed: 0_level_0,male total,male percent,female total,female percent,unknown total,unknown percent
agency_context,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,,,2.0,0.0,719.0,2.0
1,40.0,0.0,11.0,0.0,940.0,3.0
2,60.0,0.0,28.0,0.0,811.0,2.0
3,79.0,0.0,67.0,0.0,940.0,3.0
4,112.0,0.0,141.0,0.0,726.0,2.0
...,...,...,...,...,...,...
253,,,283.0,0.0,,
259,192.0,0.0,,,,
296,,,4.0,0.0,,
373,,,,,34.0,0.0


In [44]:
author_male_communion = get_gendered_information('male', male_authors, 'communion_context', 'author')
author_female_communion = get_gendered_information('female', female_authors, 'communion_context', 'author')
author_unknown_communion = get_gendered_information('unknown' , unknown_authors, 'communion_context', 'author')

author__communion = combine_dataframes(author_male_communion, author_female_communion, author_unknown_communion)
author__communion

Unnamed: 0_level_0,male total,male percent,female total,female percent,unknown total,unknown percent
communion_context,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,,,,,36.0,0.0
1,,,,,183.0,0.0
2,1.0,0.0,,,279.0,0.0
3,9.0,0.0,8.0,0.0,587.0,1.0
4,58.0,0.0,7.0,0.0,528.0,1.0
...,...,...,...,...,...,...
360,139.0,0.0,,,,
366,168.0,0.0,,,,
488,,,,,24.0,0.0
494,,,,,8.0,0.0


In [45]:
author__communion['male percent'].max()

2.0

In [46]:
author__communion['female percent'].max()

2.0

In [47]:
author__communion['unknown percent'].max()

2.0

In [48]:
sentiment_data.to_csv("sentiment_data")
lenght_data_title.to_csv("lenght_data_title")
category_data.to_csv("category_data")
author__agency.to_csv("author__agency")
author__communion.to_csv("author__communion")

### Compare Person in Context

In [49]:
male_context = compare_persons[compare_persons.gender_guesser == "male"] 
female_context = compare_persons[compare_persons.gender_guesser == "female"]
unknown_context = compare_persons[compare_persons.gender_guesser == "unknown"]

#### category

In [50]:
context_author_male_category = get_gendered_information('male', male_context, 'category', 'first_name')
context_author_female_category = get_gendered_information('female', female_context, 'category', 'first_name')
context_author_unknown_category = get_gendered_information('unknown' , unknown_context, 'category', 'first_name')

context_category_data = combine_dataframes(context_author_male_category, context_author_female_category, context_author_unknown_category)
context_category_data

Unnamed: 0_level_0,male total,male percent,female total,female percent,unknown total,unknown percent
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010,17.0,0.0,9,0,6,0
2012,27.0,0.0,9,0,48,0
business,5239.0,7.0,1409,7,4708,6
cars,8.0,0.0,3,0,4,0
conversations,38.0,0.0,8,0,23,0
entertainment,4273.0,5.0,1799,9,4327,6
goingoutguide,368.0,0.0,128,0,258,0
investigations,140.0,0.0,30,0,246,0
jobs_articles,,,6,0,9,0
kidspost,46.0,0.0,25,0,33,0


#### Sentiment

In [51]:
context_author_male_sentiment = get_gendered_information('male', male_context, 'title_vader_score', 'first_name')
context_author_female_sentiment = get_gendered_information('female', female_context, 'title_vader_score', 'first_name')
context_author_unknown_sentiment = get_gendered_information('unknown' , unknown_context, 'title_vader_score', 'first_name')

context_sentiment_data = combine_dataframes(context_author_male_sentiment, context_author_female_sentiment, context_author_unknown_sentiment)
context_sentiment_data

Unnamed: 0_level_0,male total,male percent,female total,female percent,unknown total,unknown percent
title_vader_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Negative,18136,24,4092,20,18600,26
Neutral,35192,48,10819,54,32524,45
Positive,19775,27,5005,25,19855,27


#### lenght

In [52]:
context_male_lenght_title = get_gendered_information('male', male_context, 'word_title', 'first_name')
context_female_lenght_title = get_gendered_information('female', female_context, 'word_title', 'first_name')
context_unknown_lenght_title = get_gendered_information('unknown' , unknown_context, 'word_title', 'first_name')

context_lenght_data_title = combine_dataframes(context_male_lenght_title, context_female_lenght_title, context_unknown_lenght_title)
context_lenght_data_title

Unnamed: 0_level_0,male total,male percent,female total,female percent,unknown total,unknown percent
word_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,5,0,1,0,7,0
2,1643,2,606,3,1354,1
3,333,0,111,0,249,0
4,926,1,348,1,833,1
5,2505,3,928,4,2024,2
6,4004,5,1111,5,3652,5
7,5387,7,1445,7,5286,7
8,7228,9,2280,11,6120,8
9,7492,10,1662,8,7229,10
10,7645,10,2166,10,6937,9


#### Gender of Author (werden männliche Personen häufiger von Männern erwähnt etc)

In [53]:
context_male_author_gender = get_gendered_information('male', male_context, 'gender_guesser_author', 'first_name')
context_female_author_gender = get_gendered_information('female', female_context, 'gender_guesser_author', 'first_name')
context_unknown_author_gender = get_gendered_information('unknown' , unknown_context, 'gender_guesser_author', 'first_name')

context_gender_data = combine_dataframes(context_male_author_gender, context_female_author_gender, context_unknown_author_gender)
context_gender_data

Unnamed: 0_level_0,male total,male percent,female total,female percent,unknown total,unknown percent
gender_guesser_author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
andy,283,0,50,0,153,0
female,19235,26,6475,32,19384,27
male,37573,51,6390,32,38807,54
mostly_female,981,1,417,2,1163,1
mostly_male,1484,2,509,2,1942,2
unknown,13547,18,6075,30,9530,13


<font color='red'>Männliche Autoren</font> schreiben doppelt so oft über <font color='red'>Männer</font>.
<font color='blue'>Weibliche Autoren</font> schreiben ausgeglichener über beide Geschlechter.


#### agency-communion

In [54]:
context_male_agency = get_gendered_information('male', male_context, 'agency', 'first_name')
context_female_agency = get_gendered_information('female', female_context, 'agency', 'first_name')
context_unknown_agency = get_gendered_information('unknown' , unknown_context, 'agency', 'first_name')

context__agency = combine_dataframes(context_male_agency, context_female_agency, context_unknown_agency)
context__agency

Unnamed: 0_level_0,male total,male percent,female total,female percent,unknown total,unknown percent
agency,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,52842,72,15096.0,75.0,50701.0,71.0
1,14572,19,3655.0,18.0,14426.0,20.0
2,4439,6,957.0,4.0,4690.0,6.0
3,1008,1,161.0,0.0,963.0,1.0
4,215,0,46.0,0.0,183.0,0.0
5,26,0,1.0,0.0,16.0,0.0
8,1,0,,,,


In [55]:
context_male_communion = get_gendered_information('male', male_context, 'communion', 'first_name')
context_female_communion = get_gendered_information('female', female_context, 'communion', 'first_name')
context_unknown_communion = get_gendered_information('unknown' , unknown_context, 'communion', 'first_name')

context__communion = combine_dataframes(context_male_communion, context_female_communion, context_unknown_communion)
context__communion

Unnamed: 0_level_0,male total,male percent,female total,female percent,unknown total,unknown percent
communion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,49553,67,13712.0,68.0,47664.0,67.0
1,19394,26,5126.0,25.0,18854.0,26.0
2,3342,4,903.0,4.0,3580.0,5.0
3,767,1,169.0,0.0,817.0,1.0
4,46,0,6.0,0.0,64.0,0.0
5,1,0,,,,


In [56]:
context_sentiment_data.to_csv("context_sentiment_data")
context_lenght_data_title.to_csv("context_lenght_data_title")
context_category_data.to_csv("context_category_data")
context_gender_data.to_csv("context_gender_data")
context__agency.to_csv("context__agency")
context__communion.to_csv("context__communion")

<b>Erste Erkenntnisse, die wir für eine Prediction nutzen können:
<li>Es werden fast doppelt so viele Artikel von Männern geschrieben als von Frauen</li>
<li>Die Autorenschaft besteht aus 30% aus Frauen, 51% aus Männern und 11% unknown</li>
<li>Ein großer Teil der Texte sind unknown (Nur Buchstaben als Vornahmen, ohne Autor etc.)</li>
<li>Männer schreiben eher über sport und politics</li>
<li>Frauen schreiben eher über lifestyle, business</li>
<li>Lokal beide etwa gleich viel</li>
<li>unknown vor allem opinions </li>
<li>Es werden doppelt so viele Männer in Texten erwähnt als Frauen</li>
<li>Frauen schreiben etwa gleichgewichtet über Männer und Frauen</li>
<li>Männer und unknown schreiben deutlich mehr über Männer (zu 25% mehr)</li>
<li>Besonders häufig werden Männer in der Kategorie sports erwähnt</li>
<li>Besonders häufig werden Frauen in Lifestyle und Local erwähnt</li>
<li>In Politik werden männer und Frauen etwa gleichhäufig besprochen</li>

<li>Männer schreiben mit mehr agency Wörtern</li>
<li>Frauen schreiben mit mehr communion Wörtern</li>

<li>Männer werden häufiger durch agency Wörter beschrieben</li>
<li>Frauen werden mit mehr communion Wörtern beschrieben</li>

<li>Der Sentiment des Titels kann keinen Rückschluss auf das Autoren-Geschlecht bieten</li>

<b>Fragen, die offen sind:
<li>Kann der Sentiment des Titels Rückschluss auf die Geschlechter, über die gesprochen wird, bieten?</li>
<li>Kann der Sentiment des Contents Rückschluss auf die Geschlechter, über die gesprochen wird, bieten?</li>
<li>Kann der Sentiment des Titels Rückschluss auf das Autoren-Geschlecht bieten?</li>

<b>Annahme, die noch zu prüfen sind:
<li>Die Länge des Titels gibt keine Rückschlüsse auf das Geschlecht</li>
<li>Die Länge des Textes gibt Rückschlüsse auf das Geschlecht (Männer bekommen mehr Platz?)</li>
<li>Die Anzahl der Artikel, die von Frauen geschrieben wurden, haben mit der Zeit prozentual zugenommen</li>
<li>Die Anzahl der Artikel, die über Frauen geschrieben wurden, haben mit der Zeit prozentual zugenommen</li>
