In [1]:
#!pip install vaderSentiment

In [2]:
import os, json
import pandas as pd
import regex as re
import numpy as np
import pickle

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk

import warnings
warnings.filterwarnings('ignore')

## Enrichment of the DataFrame

### Category

In [3]:
data = pd.read_csv("Washington_Post_NER_all_10K_as_imput_for_enrichment")

In [4]:
def findCategory(x):
    if re.search('-',x):
        return "other"
    else:
        return x

In [5]:
data["category"] = data.apply(lambda row: str(row["article_url"]).split('/')[3] , axis=1)
data["category"] = data.apply(lambda row: findCategory(str(row["category"])) , axis=1)

### Sentiment Analysis

In [7]:
analyzer = SentimentIntensityAnalyzer()

In [8]:
cs = []
for row in range(len(data)):
    cs.append(analyzer.polarity_scores(data['title'].iloc[row])['compound'])

data['title_vader_score'] = cs
data = data[(data[['title_vader_score']] != 0).all(axis=1)].reset_index(drop=True)

In [9]:
def sentiment(vader_score):
    if vader_score >= 0.05 : 
        return("Positive") 
  
    elif vader_score <= - 0.05 : 
        return("Negative") 
  
    else : 
        return("Neutral")

In [10]:
data['title_vader_score'] = data.apply(lambda row: sentiment(row["title_vader_score"]), axis=1)

### Lenght

In [12]:
data["word_title"] = data.apply(lambda row: len(nltk.word_tokenize(str(row["title"]))), axis=1)

In [13]:
data["len_title"] = data.apply(lambda row: len(str(row["title"])), axis=1)

### agency_communion

In [14]:
def load_list(file):
    with open(file, "r") as tf:
        lines = tf.read().split('\n')
    return lines

In [15]:
def count_occurances(row, attributes):
    counter = 0
    for attribute in attributes:
        subcounter = str(row).count(str(attribute))
        counter += subcounter
    return counter

In [16]:
agency = load_list("a_agency.txt")
communion = load_list("a_communion.txt")

In [17]:
data["agency"] = data.apply(lambda row: count_occurances(str(row["title"]), agency), axis=1)
data["communion"] = data.apply(lambda row: count_occurances(str(row["title"]), communion), axis=1)

In [18]:
data["agency_context"] = data.apply(lambda row: count_occurances(str(row["content"]), agency), axis=1)
data["communion_context"] = data.apply(lambda row: count_occurances(str(row["content"]), communion), axis=1)

## Datasets

In [20]:
compare = data
compare = compare.drop(columns=["article_url","title","published_date", "source", "type", "contents", "content", "merged_total_text", "entity","entity_type", "last_name", "agency", "communion"])
compare

Unnamed: 0.1,Unnamed: 0,id,author,first_name,gender_guesser,person_author,gender_guesser_author,text,category,title_vader_score,word_title,len_title,agency_context,communion_context
0,0,b2e89334-33f9-11e1-825f-dabc29fd7071,Mark Giannotto,Danny,male,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,33,75
1,1,b2e89334-33f9-11e1-825f-dabc29fd7071,Mark Giannotto,Jarrett,male,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,33,75
2,4,b2e89334-33f9-11e1-825f-dabc29fd7071,Mark Giannotto,Danny,male,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,33,75
3,8,b2e89334-33f9-11e1-825f-dabc29fd7071,Mark Giannotto,Jarrett,male,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,33,75
4,9,b2e89334-33f9-11e1-825f-dabc29fd7071,Mark Giannotto,John,male,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,33,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92391,760697,66414244-739e-11e1-9ab6-c7562e04789c,Mark Maske,Williams,unknown,Mark,male,<function createText at 0x000001E7CF406820>,sports,Negative,14,80,54,99
92392,760698,66414244-739e-11e1-9ab6-c7562e04789c,Mark Maske,Payton,andy,Mark,male,<function createText at 0x000001E7CF406820>,sports,Negative,14,80,54,99
92393,760699,66414244-739e-11e1-9ab6-c7562e04789c,Mark Maske,Loomis,unknown,Mark,male,<function createText at 0x000001E7CF406820>,sports,Negative,14,80,54,99
92394,760704,66414244-739e-11e1-9ab6-c7562e04789c,Mark Maske,Mike,male,Mark,male,<function createText at 0x000001E7CF406820>,sports,Negative,14,80,54,99


#### compare_author: Jeder einzige Artikel einmal ohne Personen im Text

In [21]:
compare_author = data
compare_author = compare_author .drop(columns=["article_url","published_date", "source", "type", "contents", "content", "merged_total_text", "entity","entity_type", "first_name", "last_name", "gender_guesser"])
compare_author = compare_author.drop_duplicates()

In [22]:
# Calculate percentange of gender (all articles)
gender_percentage = pd.DataFrame()
gender_percentage["gender_total_author"] = compare_author.groupby("gender_guesser_author")["id"].count()
gender_percentage["gender_percent_author"] = gender_percentage["gender_total_author"] / gender_percentage["gender_total_author"].sum() * 100
gender_percentage["gender_percent_author"] = gender_percentage["gender_percent_author"].astype(int)
gender_percentage

Unnamed: 0_level_0,gender_total_author,gender_percent_author
gender_guesser_author,Unnamed: 1_level_1,Unnamed: 2_level_1
andy,178,0
female,25872,28
male,49542,53
mostly_female,1802,1
mostly_male,2583,2
unknown,12419,13


Fast die Häfte der Artikel wurde von <font color='red'>Männern</font> geschrieben. Etwa ein Viertel der Artikel stammt von <font color='blue'>Frauen</font>.

In [23]:
unique_authors = compare_author
unique_authors = unique_authors.drop(columns=["id", "title", "len_title", "word_title", "title_vader_score"])
unique_authors = unique_authors.drop_duplicates()
unique_authors

Unnamed: 0.1,Unnamed: 0,author,person_author,gender_guesser_author,text,category,agency,communion,agency_context,communion_context
0,0,Mark Giannotto,Mark,male,<function createText at 0x000001E7CF406820>,sports,0,1,33,75
1,1,Mark Giannotto,Mark,male,<function createText at 0x000001E7CF406820>,sports,0,1,33,75
2,4,Mark Giannotto,Mark,male,<function createText at 0x000001E7CF406820>,sports,0,1,33,75
3,8,Mark Giannotto,Mark,male,<function createText at 0x000001E7CF406820>,sports,0,1,33,75
4,9,Mark Giannotto,Mark,male,<function createText at 0x000001E7CF406820>,sports,0,1,33,75
...,...,...,...,...,...,...,...,...,...,...
92391,760697,Mark Maske,Mark,male,<function createText at 0x000001E7CF406820>,sports,1,1,54,99
92392,760698,Mark Maske,Mark,male,<function createText at 0x000001E7CF406820>,sports,1,1,54,99
92393,760699,Mark Maske,Mark,male,<function createText at 0x000001E7CF406820>,sports,1,1,54,99
92394,760704,Mark Maske,Mark,male,<function createText at 0x000001E7CF406820>,sports,1,1,54,99


In [24]:
# Calculate percentange of gender (single authors)
gender_percentage = pd.DataFrame()
gender_percentage["gender_total_author"] = unique_authors.groupby("gender_guesser_author")["author"].count()
gender_percentage["gender_percent_author"] = gender_percentage["gender_total_author"] / gender_percentage["gender_total_author"].sum() * 100
gender_percentage["gender_percent_author"] = gender_percentage["gender_percent_author"].astype(int)
gender_percentage

Unnamed: 0_level_0,gender_total_author,gender_percent_author
gender_guesser_author,Unnamed: 1_level_1,Unnamed: 2_level_1
andy,178,0
female,25872,28
male,49542,53
mostly_female,1802,1
mostly_male,2583,2
unknown,12419,13


Die Autorenschaft besteht aus 30% aus <font color='blue'>Frauen</font>, 51% aus <font color='red'>Männern</font> und 11% <font color='green'>unknown</font>. Das deckt sich etwa mit den geschriebenen Artikeln insgesamt

In [25]:
not_quiet_unique_authors = compare_author
not_quiet_unique_authors = not_quiet_unique_authors.drop(columns=["id", "title"])
not_quiet_unique_authors = not_quiet_unique_authors.drop_duplicates()
not_quiet_unique_authors

Unnamed: 0.1,Unnamed: 0,author,person_author,gender_guesser_author,text,category,title_vader_score,word_title,len_title,agency,communion,agency_context,communion_context
0,0,Mark Giannotto,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,0,1,33,75
1,1,Mark Giannotto,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,0,1,33,75
2,4,Mark Giannotto,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,0,1,33,75
3,8,Mark Giannotto,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,0,1,33,75
4,9,Mark Giannotto,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,0,1,33,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...
92391,760697,Mark Maske,Mark,male,<function createText at 0x000001E7CF406820>,sports,Negative,14,80,1,1,54,99
92392,760698,Mark Maske,Mark,male,<function createText at 0x000001E7CF406820>,sports,Negative,14,80,1,1,54,99
92393,760699,Mark Maske,Mark,male,<function createText at 0x000001E7CF406820>,sports,Negative,14,80,1,1,54,99
92394,760704,Mark Maske,Mark,male,<function createText at 0x000001E7CF406820>,sports,Negative,14,80,1,1,54,99


In [26]:
compare_persons = data
compare_persons = compare_persons.drop(columns=['id', 'article_url', 'title', 'author', 'published_date', 'contents',
       'type', 'source', 'content', 'merged_total_text', 'entity','entity_type', 'last_name', 
       ])
compare_persons = compare_persons.drop_duplicates()
compare_persons

Unnamed: 0.1,Unnamed: 0,first_name,gender_guesser,person_author,gender_guesser_author,text,category,title_vader_score,word_title,len_title,agency,communion,agency_context,communion_context
0,0,Danny,male,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,0,1,33,75
1,1,Jarrett,male,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,0,1,33,75
2,4,Danny,male,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,0,1,33,75
3,8,Jarrett,male,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,0,1,33,75
4,9,John,male,Mark,male,<function createText at 0x000001E7CF406820>,sports,Positive,13,69,0,1,33,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92391,760697,Williams,unknown,Mark,male,<function createText at 0x000001E7CF406820>,sports,Negative,14,80,1,1,54,99
92392,760698,Payton,andy,Mark,male,<function createText at 0x000001E7CF406820>,sports,Negative,14,80,1,1,54,99
92393,760699,Loomis,unknown,Mark,male,<function createText at 0x000001E7CF406820>,sports,Negative,14,80,1,1,54,99
92394,760704,Mike,male,Mark,male,<function createText at 0x000001E7CF406820>,sports,Negative,14,80,1,1,54,99


In [27]:
data.to_csv("data_to_use")
compare.to_csv("compare")
compare_author.to_csv("compare")
unique_authors.to_csv("unique_authors")
compare_persons.to_csv("compare_persons")

## Comparing

In [28]:
def get_gendered_information(name, dataframe, focused_column, search_index):
    frame = pd.DataFrame()
    frame[str(name) + " total"] = dataframe.groupby(str(focused_column))[str(search_index)].count()
    frame[str(name) + " percent"] = frame[str(name) + " total"] / frame[str(name) + " total"].sum() * 100
    frame[str(name) + " percent"] = frame[str(name) + " percent"].astype(int)
    return frame

In [29]:
def combine_dataframes(dataframe1, dataframe2, dataframe3):
    frame = pd.DataFrame()
    frame = dataframe1.join(dataframe2, how='outer')
    frame = frame.join(dataframe3, how = 'outer')
    return frame
    

### Comparing Authors

In [30]:
male_authors = compare_author[compare_author.gender_guesser_author == "male"] 
female_authors = compare_author[compare_author.gender_guesser_author == "female"]
unknown_authors = compare_author[compare_author.gender_guesser_author == "unknown"] 

#### Category

In [31]:
author_male_category = get_gendered_information('male', male_authors, 'category', 'author')
author_female_category = get_gendered_information('female', female_authors, 'category', 'author')
author_unknown_category = get_gendered_information('unknown' , unknown_authors, 'category', 'author')

category_data = combine_dataframes(author_male_category, author_female_category, author_unknown_category)
category_data

Unnamed: 0_level_0,male total,male percent,female total,female percent,unknown total,unknown percent
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010,,,,,35.0,0.0
2012,62.0,0.0,,,,
business,2535.0,5.0,2609.0,10.0,581.0,4.0
cars,5.0,0.0,,,2.0,0.0
conversations,1.0,0.0,,,1.0,0.0
entertainment,2648.0,5.0,1507.0,5.0,290.0,2.0
goingoutguide,45.0,0.0,92.0,0.0,102.0,0.0
investigations,117.0,0.0,180.0,0.0,4.0,0.0
kidspost,,,25.0,0.0,40.0,0.0
lifestyle,4089.0,8.0,5514.0,21.0,1401.0,11.0



Die <font color='red'>männlichen</font> Autoren schreiben vor allem über <font color='red'>Sport, Lokales und Politik</font>. <font color='blue'>Weibliche Autoren</font> schreiben vor allem Artikel über <font color='blue'>Lifestyle, Lokales und Business</font>.

#### Sentiment

In [32]:
author_male_sentiment = get_gendered_information('male', male_authors, 'title_vader_score', 'author')
author_female_sentiment = get_gendered_information('female', female_authors, 'title_vader_score', 'author')
author_unknown_sentiment = get_gendered_information('unknown' , unknown_authors, 'title_vader_score', 'author')

sentiment_data = combine_dataframes(author_male_sentiment, author_female_sentiment, author_unknown_sentiment)
sentiment_data

Unnamed: 0_level_0,male total,male percent,female total,female percent,unknown total,unknown percent
title_vader_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Negative,23226,46,11939,46,5860,47
Neutral,906,1,450,1,156,1
Positive,25410,51,13483,52,6403,51


Der Title wird beinahme immer polarisierend geschrieben. Negative und positive ist daber gleichgewichtig. Auch im Hinblick auf das Geschlecht gibt es keine großen Auffälligkeiten.

#### Lenght

In [33]:
author_male_lenght_title = get_gendered_information('male', male_authors, 'word_title', 'author')
author_female_lenght_title = get_gendered_information('female', female_authors, 'word_title', 'author')
author_unknown_lenght_title = get_gendered_information('unknown' , unknown_authors, 'word_title', 'author')

lenght_data_title = combine_dataframes(author_male_lenght_title, author_female_lenght_title, author_unknown_lenght_title)
lenght_data_title

Unnamed: 0_level_0,male total,male percent,female total,female percent,unknown total,unknown percent
word_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,1098.0,2.0,1068.0,4.0,1198.0,9.0
3,63.0,0.0,56.0,0.0,69.0,0.0
4,203.0,0.0,410.0,1.0,406.0,3.0
5,656.0,1.0,450.0,1.0,457.0,3.0
6,1636.0,3.0,1147.0,4.0,633.0,5.0
7,3525.0,7.0,1719.0,6.0,826.0,6.0
8,3588.0,7.0,2297.0,8.0,1074.0,8.0
9,5141.0,10.0,2925.0,11.0,833.0,6.0
10,4278.0,8.0,2511.0,9.0,1014.0,8.0
11,4849.0,9.0,2249.0,8.0,1287.0,10.0


#### agency - communion

In [34]:
author_male_agency = get_gendered_information('male', male_authors, 'agency', 'author')
author_female_agency = get_gendered_information('female', female_authors, 'agency', 'author')
author_unknown_agency = get_gendered_information('unknown' , unknown_authors, 'agency', 'author')

author__agency = combine_dataframes(author_male_agency, author_female_agency, author_unknown_agency)
author__agency

Unnamed: 0_level_0,male total,male percent,female total,female percent,unknown total,unknown percent
agency,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,33675.0,67.0,18122,70,9223,74
1,11250.0,22.0,5353,20,1922,15
2,3606.0,7.0,1844,7,996,8
3,882.0,1.0,370,1,229,1
4,129.0,0.0,165,0,25,0
5,,,18,0,24,0


In [35]:
author_male_communion = get_gendered_information('male', male_authors, 'communion', 'author')
author_female_communion = get_gendered_information('female', female_authors, 'communion', 'author')
author_unknown_communion = get_gendered_information('unknown' , unknown_authors, 'communion', 'author')

author__communion = combine_dataframes(author_male_communion, author_female_communion, author_unknown_communion)
author__communion

Unnamed: 0_level_0,male total,male percent,female total,female percent,unknown total,unknown percent
communion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,33208,67,17083,66,9182,73
1,12962,26,6982,26,2427,19
2,2345,4,1525,5,730,5
3,922,1,281,1,67,0
4,105,0,1,0,13,0


In [36]:
sentiment_data.to_csv("sentiment_data")
lenght_data_title.to_csv("lenght_data_title")
category_data.to_csv("category_data")

In [37]:
author__agency.to_csv("author__agency")
author__communion.to_csv("author__communion")

### Compare Person in Context

In [38]:
male_context = compare_persons[compare_persons.gender_guesser == "male"] 
female_context = compare_persons[compare_persons.gender_guesser == "female"]
unknown_context = compare_persons[compare_persons.gender_guesser == "unknown"]

#### category

In [39]:
context_author_male_category = get_gendered_information('male', male_context, 'category', 'first_name')
context_author_female_category = get_gendered_information('female', female_context, 'category', 'first_name')
context_author_unknown_category = get_gendered_information('unknown' , unknown_context, 'category', 'first_name')

context_category_data = combine_dataframes(context_author_male_category, context_author_female_category, context_author_unknown_category)
context_category_data

Unnamed: 0_level_0,male total,male percent,female total,female percent,unknown total,unknown percent
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010,17,0,9,0,6,0
2012,17,0,5,0,29,0
business,2411,6,625,6,2435,6
cars,4,0,1,0,1,0
conversations,9,0,1,0,9,0
entertainment,1875,4,777,8,1754,4
goingoutguide,101,0,44,0,96,0
investigations,106,0,15,0,176,0
kidspost,26,0,14,0,14,0
lifestyle,4704,12,2081,22,4340,11


#### Sentiment

In [40]:
context_author_male_sentiment = get_gendered_information('male', male_context, 'title_vader_score', 'first_name')
context_author_female_sentiment = get_gendered_information('female', female_context, 'title_vader_score', 'first_name')
context_author_unknown_sentiment = get_gendered_information('unknown' , unknown_context, 'title_vader_score', 'first_name')

context_sentiment_data = combine_dataframes(context_author_male_sentiment, context_author_female_sentiment, context_author_unknown_sentiment)
context_sentiment_data

Unnamed: 0_level_0,male total,male percent,female total,female percent,unknown total,unknown percent
title_vader_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Negative,18136,46,4092,44,18600,47
Neutral,698,1,159,1,773,1
Positive,19775,51,5005,54,19855,50


#### lenght

In [41]:
context_male_lenght_title = get_gendered_information('male', male_context, 'word_title', 'first_name')
context_female_lenght_title = get_gendered_information('female', female_context, 'word_title', 'first_name')
context_unknown_lenght_title = get_gendered_information('unknown' , unknown_context, 'word_title', 'first_name')

context_lenght_data_title = combine_dataframes(context_male_lenght_title, context_female_lenght_title, context_unknown_lenght_title)
context_lenght_data_title

Unnamed: 0_level_0,male total,male percent,female total,female percent,unknown total,unknown percent
word_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,1490,3,544,5,1221,3
3,87,0,27,0,69,0
4,418,1,182,1,359,0
5,677,1,210,2,646,1
6,1539,3,393,4,1560,3
7,2600,6,537,5,2850,7
8,3270,8,682,7,2935,7
9,4049,10,939,10,3840,9
10,3484,9,753,8,3364,8
11,3664,9,880,9,3837,9


#### Gender of Author (werden männliche Personen häufiger von Männern erwähnt etc)

In [42]:
context_male_author_gender = get_gendered_information('male', male_context, 'gender_guesser_author', 'first_name')
context_female_author_gender = get_gendered_information('female', female_context, 'gender_guesser_author', 'first_name')
context_unknown_author_gender = get_gendered_information('unknown' , unknown_context, 'gender_guesser_author', 'first_name')

context_gender_data = combine_dataframes(context_male_author_gender, context_female_author_gender, context_unknown_author_gender)
context_gender_data

Unnamed: 0_level_0,male total,male percent,female total,female percent,unknown total,unknown percent
gender_guesser_author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
andy,84,0,25,0,45,0
female,10657,27,3267,35,10503,26
male,21158,54,3516,37,22079,56
mostly_female,612,1,298,3,766,1
mostly_male,942,2,273,2,1230,3
unknown,5156,13,1877,20,4605,11


<font color='red'>Männliche Autoren</font> schreiben doppelt so oft über <font color='red'>Männer</font>.
<font color='blue'>Weibliche Autoren</font> schreiben ausgeglichener über beide Geschlechter.


#### agency-communion

In [43]:
context_male_agency = get_gendered_information('male', male_context, 'agency', 'first_name')
context_female_agency = get_gendered_information('female', female_context, 'agency', 'first_name')
context_unknown_agency = get_gendered_information('unknown' , unknown_context, 'agency', 'first_name')

context__agency = combine_dataframes(context_male_agency, context_female_agency, context_unknown_agency)
context__agency

Unnamed: 0_level_0,male total,male percent,female total,female percent,unknown total,unknown percent
agency,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,26506,68,6655,71,26741,68
1,8203,21,1852,20,8700,22
2,3005,7,593,6,3032,7
3,698,1,120,1,606,1
4,172,0,35,0,133,0
5,25,0,1,0,16,0


In [44]:
context_male_communion = get_gendered_information('male', male_context, 'communion', 'first_name')
context_female_communion = get_gendered_information('female', female_context, 'communion', 'first_name')
context_unknown_communion = get_gendered_information('unknown' , unknown_context, 'communion', 'first_name')

context__communion = combine_dataframes(context_male_communion, context_female_communion, context_unknown_communion)
context__communion

Unnamed: 0_level_0,male total,male percent,female total,female percent,unknown total,unknown percent
communion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,26107,67,6258,67,26155,66
1,10126,26,2325,25,10372,26
2,1868,4,563,6,2051,5
3,462,1,104,1,586,1
4,46,0,6,0,64,0


In [45]:
context_sentiment_data.to_csv("context_sentiment_data")
context_lenght_data_title.to_csv("context_lenght_data_title")
context_category_data.to_csv("context_category_data")
context_gender_data.to_csv("context_gender_data")

In [46]:
context__agency.to_csv("context__agency")
context__communion.to_csv("context__communion")

<b>Erste Erkenntnisse, die wir für eine Prediction nutzen können:
<li>Es werden fast doppelt so viele Artikel von Männern geschrieben als von Frauen</li>
<li>Die Autorenschaft besteht aus 30% aus Frauen, 51% aus Männern und 11% unknown</li>
<li>Ein großer Teil der Texte sind unknown (Nur Buchstaben als Vornahmen, ohne Autor etc.)</li>
<li>Männer schreiben eher über sport und politics</li>
<li>Frauen schreiben eher über lifestyle, business</li>
<li>Lokal beide etwa gleich viel</li>
<li>unknown vor allem opinions </li>
<li>Es werden doppelt so viele Männer in Texten erwähnt als Frauen</li>
<li>Frauen schreiben etwa gleichgewichtet über Männer und Frauen</li>
<li>Männer und unknown schreiben deutlich mehr über Männer (zu 25% mehr)</li>
<li>Besonders häufig werden Männer in der Kategorie sports erwähnt</li>
<li>Besonders häufig werden Frauen in Lifestyle und Local erwähnt</li>
<li>In Politik werden männer und Frauen etwa gleichhäufig besprochen</li>

<li>Männer schreiben mit mehr agency Wörtern</li>
<li>Frauen schreiben mit mehr communion Wörtern</li>

<li>Männer werden häufiger durch agency Wörter beschrieben</li>
<li>Frauen werden mit mehr communion Wörtern beschrieben</li>

<li>Der Sentiment des Titels kann keinen Rückschluss auf das Autoren-Geschlecht bieten</li>

<b>Fragen, die offen sind:
<li>Kann der Sentiment des Titels Rückschluss auf die Geschlechter, über die gesprochen wird, bieten?</li>
<li>Kann der Sentiment des Contents Rückschluss auf die Geschlechter, über die gesprochen wird, bieten?</li>
<li>Kann der Sentiment des Titels Rückschluss auf das Autoren-Geschlecht bieten?</li>

<b>Annahme, die noch zu prüfen sind:
<li>Die Länge des Titels gibt keine Rückschlüsse auf das Geschlecht</li>
<li>Die Länge des Textes gibt Rückschlüsse auf das Geschlecht (Männer bekommen mehr Platz?)</li>
<li>Die Anzahl der Artikel, die von Frauen geschrieben wurden, haben mit der Zeit prozentual zugenommen</li>
<li>Die Anzahl der Artikel, die über Frauen geschrieben wurden, haben mit der Zeit prozentual zugenommen</li>
