In [1]:
%matplotlib inline

import pandas as pd

In [2]:
# Lift pandas' display limits: with None, neither rows nor columns are truncated
# when a frame or series is shown.
for option_name in ("display.max_rows", "display.max_columns"):
    pd.set_option(option_name, None)

## Assembly of the newly extracted tweets into a single dataset

In [3]:
# One CSV per occupational field, all stored in '../data_sets'.
# Only 'sport.csv' is read with ';' as the field separator; the other files
# use the read_csv default (',').
nt_arts = pd.read_csv('../data_sets/arts.csv')
nt_pol = pd.read_csv('../data_sets/politics.csv')
nt_sci_cul = pd.read_csv('../data_sets/sci_cul.csv')
nt_show = pd.read_csv('../data_sets/show.csv')
nt_sport = pd.read_csv('../data_sets/sport.csv', sep=';')

In [4]:
# Gather the partial datasets (each one related to a single occupation) into one
# dataset, so that the distributions over the whole collection can be analysed
# in the following cells.
# ignore_index=True gives the result a fresh 0..n-1 index: every CSV was read
# with its own default index, so a plain concatenation would leave repeated row
# labels and make any later label-based selection ambiguous.

new_tweets = pd.concat(
    [nt_arts, nt_pol, nt_sci_cul, nt_show, nt_sport],
    axis=0,
    ignore_index=True,
)

In [5]:
# number of rows in the concatenated dataset (duplicates not yet removed)
len(new_tweets)

1014

In [6]:
# how many rows carry each value of the 'misogynous' column (duplicates not yet removed)
new_tweets['misogynous'].value_counts()

1    781
0    233
Name: misogynous, dtype: int64

In [7]:
# how many rows belong to each 'kind_of_work' value (duplicates not yet removed)
new_tweets['kind_of_work'].value_counts()

sport      267
pol_act    256
arts       221
sci_cul    189
show        81
Name: kind_of_work, dtype: int64

In [8]:
# Removal of duplicates: rows are compared on the 'Tweet' column only, and the
# first occurrence of each repeated tweet is the one that is kept.
new_tweets = new_tweets.drop_duplicates(subset=["Tweet"], keep="first")

In [9]:
len(new_tweets)      # 24 duplicated tweets were dropped (1014 rows before, 990 after)

990

In [10]:
# analysis of distributions after the removal of duplicates

In [11]:
# distribution of the 'misogynous' column after the removal of duplicates
new_tweets['misogynous'].value_counts()

1    760
0    230
Name: misogynous, dtype: int64

In [12]:
# distribution of the 'kind_of_work' column after the removal of duplicates
new_tweets['kind_of_work'].value_counts()

sport      262
pol_act    242
arts       221
sci_cul    186
show        79
Name: kind_of_work, dtype: int64

In [14]:
# export of the resulting dataframe, containing all the new tweets. It is available in the folder 'data_sets'

# new_tweets.to_csv("new_tweets_df.csv", sep=",", index=False)

## Creation of a dataset made up of only misogynous tweets

Now we create the dataset that combines (1) the misogynous tweets of the original dataset for which it was possible to identify the profession of the victim and (2) the misogynous tweets among the newly downloaded ones.

In [15]:
# Reminder: 'final_df.csv' is the version of the original training set that has
# been corrected and annotated with jobs.

final_df = pd.read_csv("../data_sets/final_df.csv")

In [16]:
#final_df

In [17]:
# Keep only the tweets whose 'occupation' is one of the five occupational fields,
# i.e. drop every row whose 'occupation' is one of the values listed below.
#
# Tweets labelled 'derailing' in 'misogyny_category' are discarded as well. For
# more information about this choice, see Section 3.2 of the report.

excluded_occupations = ['generic_r', 'generic_g', 'other', '0', 'no_woman']

keep_mask = (
    ~final_df['occupation'].isin(excluded_occupations)
    & (final_df['misogyny_category'] != 'derailing')
)

# .copy() yields a frame that is independent of final_df, so the in-place
# modifications applied to final_occ_df in the following cells do not trigger
# a SettingWithCopyWarning.
final_occ_df = final_df[keep_mask].copy()

In [18]:
# Renumber the surviving rows 0..n-1; the old index is discarded, not kept as a column.
final_occ_df = final_occ_df.reset_index(drop=True)

In [19]:
# In addition to deleting the category 'derailing', we decided to merge the
# categories 'dominance' and 'stereotype' into the single category
# 'stereotype_dominance'.

merged_categories = {
    'dominance': 'stereotype_dominance',
    'stereotype': 'stereotype_dominance',
}
final_occ_df['misogyny_category'] = final_occ_df['misogyny_category'].replace(merged_categories)

In [20]:
# final_occ_df

The misogynous tweets contained in 'new_tweets_df.csv' (i.e., the misogynous tweets that we extracted) were automatically annotated w.r.t. the 'misogyny_category' by means of the best-performing of the trained classifiers.

The resulting dataset is 'new_tweets_annot.csv', which is available in the folder 'data_sets'.
We load it in order to create a single dataset of misogynous tweets.

In [21]:
# Newly extracted tweets together with the automatically assigned category labels.
new_df_annot = pd.read_csv("../data_sets/new_tweets_annot.csv")

In [22]:
# The 'label' values are numerical (as assigned by the classifier); map them back
# to the original strings used for the annotation of the 'AMI' training set.

categories = {
    0: "discredit",
    1: "sexual_harassment",
    2: "stereotype_dominance",
}
new_df_annot = new_df_annot.replace({"label": categories})

In [23]:
# Rename the columns so that the concatenation with the other dataset is possible.
# NOTE(review): the names are assigned by position, so this assumes the CSV holds
# exactly three columns ordered as (tweet text, occupation, category) — confirm.
new_df_annot = new_df_annot.set_axis(
    ['text', 'occupation', 'misogyny_category'],
    axis='columns',
)

In [24]:
# Creation of the dataset (its distributions are analysed and it is exported in
# the cells below): the three shared columns of final_occ_df are stacked on top
# of the newly annotated tweets.

shared_columns = ['text', 'occupation', 'misogyny_category']
all_annotated_tweets = pd.concat([final_occ_df[shared_columns], new_df_annot], axis=0)

In [25]:
# Renumber the rows 0..n-1 after the concatenation; the old index is discarded.
all_annotated_tweets = all_annotated_tweets.reset_index(drop=True)

In [26]:
# total number of tweets in the combined dataset
len(all_annotated_tweets)

969

In [27]:
# number of tweets per 'misogyny_category' in the combined dataset
all_annotated_tweets['misogyny_category'].value_counts()

discredit               604
stereotype_dominance    209
sexual_harassment       156
Name: misogyny_category, dtype: int64

In [28]:
# the resulting dataset is available in the folder 'data_sets' under the name 'all_annotated_tweets_update'

# all_annotated_tweets.to_csv("all_annotated_tweets.csv", sep=",", index=False)