In [1]:
import pandas as pd

# Show every row when a frame is displayed (no truncation of long tables).
pd.options.display.max_rows = None

In [2]:
# Load the dataset from the CSV file in the working directory and display it.
df = pd.read_csv(filepath_or_buffer='major_dataset.csv')
df

Unnamed: 0,name,gender,organization,date,location,type_of_death,attack,role,region,year,staff_freelancer
0,Alaa Taher Al-Hassanat,Female,AlMajedat Media Network,"November 20, 2023",Israel and the Occupied Palestinian Territory,Dangerous Assignment,Killed,Journalist,Middle East,2023,Staff
1,Alexandra Tuttle,Female,The Wall Street Journal,"September 22, 1993",Georgia,Crossfire,Killed,Journalist,All other countries,1993,Staff
2,Alison Parker,Female,WDBJ7,"August 26, 2015",USA,Dangerous Assignment,Killed,Journalist,All other countries,2015,Staff
3,Amparo Leonor Jiménez Pallares,Female,"""QAP"" and ""En Vivo""","August 11, 1998",Colombia,Murder,Killed,Journalist,All other countries,1998,Staff
4,Anastasiya Baburova,Female,Novaya Gazeta,"January 19, 2009",Russia,Murder,Killed,Journalist,All other countries,2009,Staff
5,Anja Niedringhaus,Female,The Associated Press,"April 4, 2014",Afghanistan,Dangerous Assignment,Killed,Journalist,All other countries,2014,Staff
6,Anna Politkovskaya,Female,Novaya Gazeta,"October 7, 2006",Russia,Murder,Killed,Journalist,All other countries,2006,Staff
7,Asiya Jeelani,Female,Freelancer,"April 20, 2004",India,Dangerous Assignment,Killed,Journalist,All other countries,2004,Freelancer
8,Atwar Bahjat,Female,Al-Arabiya,"February 23, 2006",Iraq,Murder,Killed,Journalist,Middle East,2006,Staff
9,Audrey Gaid Estrada,Female,101.3 Grace Covenant FM,"March 17, 2022",Philippines,Murder,Killed,Journalist,All other countries,2022,Staff


In [3]:
# Add a new column, 'staff_freelancer':
#   'Freelancer' where the organization is exactly 'Freelancer',
#   'Staff' for every other row (a missing organization never compares
#   equal, so it is labelled 'Staff' as well).
# The comparison is vectorised over the whole column instead of visiting
# each row with iterrows() and writing cells one at a time.
is_freelancer = df['organization'].eq('Freelancer')
df['staff_freelancer'] = is_freelancer.map({True: 'Freelancer', False: 'Staff'})

In [4]:
# Update the file and save. Note: this writes to 'major_dataset.csv', the
# same path the data was read from, so the file on disk is overwritten.
# The index is left out of the file.
df.to_csv(path_or_buf='major_dataset.csv', index=False)

In [5]:
# Add a constant helper column for the analysis: every row gets number = 1.
df = df.assign(number=1)

In [6]:
# Group the data by location and total the 'number' column for the visualisation.
cases_per_location = df.groupby(['location'])['number'].sum()
# to_frame().unstack().reset_index() yields the columns
# 'level_0', 'location' and 0 (the totals).
location = cases_per_location.to_frame().unstack().reset_index()

In [7]:
# Rename columns: 'level_0' becomes 'cases', 'location' becomes 'countries'.
new_column_names = {'level_0': 'cases', 'location': 'countries'}
location = location.rename(columns=new_column_names)

In [8]:
# Drop the unnecessary 'cases' column and display the result.
location = location.drop(columns=['cases'])
location

Unnamed: 0,countries,0
0,Afghanistan,75
1,Albania,1
2,Algeria,64
3,Angola,10
4,Argentina,2
5,Azerbaijan,9
6,Bahrain,8
7,Bangladesh,27
8,Barbados,1
9,Belarus,28


In [9]:
# Save the per-location totals to CSV, leaving the index out of the file.
location_csv = 'location_visualisation.csv'
location.to_csv(location_csv, index=False)

In [10]:
# Replace the 'Unknown' placeholder in the year column with NaN.
df['year'] = df['year'].replace(to_replace='Unknown', value=float('nan'))

In [11]:
# Cast the year column to float.
df['year'] = df['year'].astype('float64')

In [13]:
# Group the data by year and region and total 'number' for the visualisation.
# as_index=False keeps 'year' and 'region' as ordinary columns.
attacks = df.groupby(['year', 'region'], as_index=False)['number'].sum()
attacks

Unnamed: 0,year,region,number
0,1982.0,Middle East,1
1,1992.0,All other countries,34
2,1992.0,Middle East,12
3,1993.0,All other countries,51
4,1993.0,Middle East,5
5,1994.0,All other countries,64
6,1994.0,Middle East,3
7,1995.0,All other countries,55
8,1995.0,Middle East,1
9,1996.0,All other countries,28


In [14]:
# Keep only the rows whose year lies in 2000..2023 (both ends included).
year_in_range = (attacks['year'] >= 2000) & (attacks['year'] <= 2023)
attacks = attacks.loc[year_in_range]
attacks

Unnamed: 0,year,region,number
16,2000.0,All other countries,25
17,2001.0,All other countries,49
18,2001.0,Middle East,1
19,2002.0,All other countries,21
20,2002.0,Middle East,3
21,2003.0,All other countries,27
22,2003.0,Middle East,27
23,2004.0,All other countries,36
24,2004.0,Middle East,40
25,2005.0,All other countries,28


In [15]:
# Save the year/region totals to CSV, leaving the index out of the file.
attacks.to_csv(path_or_buf='attacks_visualisation.csv', index=False)

In [16]:
# From df, keep only the rows where region is "Middle East".
is_middle_east = df['region'] == 'Middle East'
kind_of_attacks = df[is_middle_east]

In [17]:
# Turn the attack column into percentages: the relative frequency of each
# value (missing values are counted too), scaled to 0-100.
attack_shares = kind_of_attacks['attack'].value_counts(normalize=True, dropna=False)
kind_of_attacks = attack_shares.mul(100)

In [18]:
# Round the percentages to one decimal place and display them.
kind_of_attacks = kind_of_attacks.round(1)
kind_of_attacks

attack
Killed        76.8
Imprisoned    20.5
Missing        2.8
Name: proportion, dtype: float64

In [23]:
# Turn the results into a two-column DataFrame for the visualisation.
# The attack labels are held in the Series index, so they are moved into a
# real 'attack' column and the values are named 'percentage'. Wrapping the
# Series in pd.DataFrame() alone would leave the labels in the index, and
# they would be lost when the frame is written with index=False.
df_koa = (
    kind_of_attacks
    .rename('percentage')     # name of the value column
    .rename_axis('attack')    # name of the label column after reset_index
    .reset_index()
)
df_koa

Unnamed: 0,attack,percentage
0,Killed,76.8
1,Imprisoned,20.5
2,Missing,2.8


In [24]:
# Save the attack percentages to CSV, leaving the index out of the file.
koa_csv = 'df_koa_visualisation.csv'
df_koa.to_csv(koa_csv, index=False)

In [45]:
# Group the data by gender and region and total 'number' for the visualisation.
gender_region_totals = df.groupby(['gender', 'region'])['number'].sum()
gender = gender_region_totals.reset_index()
gender

Unnamed: 0,gender,region,number
0,Female,All other countries,120
1,Female,Middle East,75
2,Male,All other countries,1298
3,Male,Middle East,572


In [46]:
# Save the gender/region totals to CSV, leaving the index out of the file.
gender.to_csv(path_or_buf='gender_visualisation.csv', index=False)

In [47]:
# Group the data by staff/freelancer status and region and total 'number'
# for the visualisation.
by_status_and_region = df.groupby(['staff_freelancer', 'region'])
fr_staff = by_status_and_region['number'].sum().reset_index()
fr_staff

Unnamed: 0,staff_freelancer,region,number
0,Freelancer,All other countries,166
1,Freelancer,Middle East,145
2,Staff,All other countries,1297
3,Staff,Middle East,582


In [48]:
# Save the staff/freelancer totals to CSV, leaving the index out of the file.
fr_staff_csv = 'fr_staff_visualisation.csv'
fr_staff.to_csv(fr_staff_csv, index=False)