In [2]:
# Import libraries

# Data wrangling
import pandas as pd
import numpy as np
import re
import pycountry
from shapely import wkt
import fiona

# Visualization 
import missingno as msno
import matplotlib
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import geopandas as gpd

# Machine learning 
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the encoded dataset from its Parquet file into a DataFrame
df_encoded = pd.read_parquet(path='df_encoded.parquet')

# Leave the frame as the cell's last expression so the notebook renders it
# (the rendered view is truncated to the first/last rows and columns)
df_encoded

Unnamed: 0,year,countries,region,pf_ss_disappearances_disap,pf_ss_disappearances_violent,pf_ss_disappearances_violent_data,pf_ss_disappearances_organized,pf_ss_disappearances_fatalities,pf_ss_disappearances_fatalities_data,pf_ss_disappearances_injuries,...,region_Caucasus & Central Asia,region_East Asia,region_Eastern Europe,region_Latin America & the Caribbean,region_Middle East & North Africa,region_North America,region_Oceania,region_South Asia,region_Sub-Saharan Africa,region_Western Europe
0,2020,Albania,Eastern Europe,10.0,10.000000,0.0,7.5,10.000000,0.0,10.000000,...,0,0,1,0,0,0,0,0,0,0
1,2020,Algeria,Middle East & North Africa,10.0,9.687083,25.0,5.0,10.000000,0.0,10.000000,...,0,0,0,0,1,0,0,0,0,0
2,2020,Angola,Sub-Saharan Africa,10.0,9.582498,25.0,7.5,9.736578,5.0,9.971733,...,0,0,0,0,0,0,0,0,1,0
3,2020,Argentina,Latin America & the Caribbean,5.0,10.000000,0.0,7.5,9.925379,2.0,10.000000,...,0,0,0,1,0,0,0,0,0,0
4,2020,Armenia,Caucasus & Central Asia,10.0,10.000000,0.0,7.5,10.000000,0.0,10.000000,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3481,2000,Venezuela,Latin America & the Caribbean,10.0,10.000000,0.0,,10.000000,0.0,10.000000,...,0,0,0,1,0,0,0,0,0,0
3482,2000,Vietnam,South Asia,10.0,10.000000,0.0,,10.000000,0.0,10.000000,...,0,0,0,0,0,0,0,1,0,0
3483,2000,Yemen,Middle East & North Africa,5.0,10.000000,0.0,,8.170079,19.0,7.932996,...,0,0,0,0,1,0,0,0,0,0
3484,2000,Zambia,Sub-Saharan Africa,10.0,10.000000,0.0,,10.000000,0.0,9.827262,...,0,0,0,0,0,0,0,0,1,0


### 1) Statistics

In [4]:
# Print a concise structural summary of the frame: index range, column count,
# per-dtype column counts, and memory usage. As the output below shows, the
# summary is in condensed form (first/last column name only, no per-column listing).
df_encoded.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3486 entries, 0 to 3485
Columns: 247 entries, year to region_Western Europe
dtypes: float64(62), int64(3), object(7), uint8(175)
memory usage: 2.5+ MB


In [5]:
# Descriptive statistics for the numeric columns: count, mean, std, min,
# the quartiles (25% / 50% / 75%, spelled out explicitly here), and max
df_encoded.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,year,pf_ss_disappearances_disap,pf_ss_disappearances_violent,pf_ss_disappearances_violent_data,pf_ss_disappearances_organized,pf_ss_disappearances_fatalities,pf_ss_disappearances_fatalities_data,pf_ss_disappearances_injuries,pf_ss_disappearances_injuries_data,pf_ss_killings,...,region_Caucasus & Central Asia,region_East Asia,region_Eastern Europe,region_Latin America & the Caribbean,region_Middle East & North Africa,region_North America,region_Oceania,region_South Asia,region_Sub-Saharan Africa,region_Western Europe
count,3486.0,3436.0,3486.0,3486.0,1984.0,3486.0,3486.0,3486.0,3486.0,3423.0,...,3486.0,3486.0,3486.0,3486.0,3486.0,3486.0,3486.0,3486.0,3486.0,3486.0
mean,2010.0,8.442957,9.218442,214.990964,6.602193,8.97516,75.381727,9.072712,108.331216,7.352308,...,0.036145,0.036145,0.13253,0.156627,0.114458,0.012048,0.024096,0.10241,0.271084,0.114458
std,6.056169,3.105136,2.380573,2130.057043,2.840456,2.607188,497.217838,2.412915,796.563634,2.570794,...,0.186677,0.186677,0.339115,0.3635,0.318412,0.109117,0.15337,0.30323,0.444583,0.318412
min,2000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1775,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2005.0,10.0,10.0,0.0,5.0,9.71383,0.0,9.762162,0.0,5.89,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2010.0,10.0,10.0,0.0,7.5,10.0,0.0,10.0,0.0,8.2375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2015.0,10.0,10.0,0.0,10.0,10.0,3.0,10.0,6.0,9.4725,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,2020.0,10.0,10.0,69089.0,10.0,10.0,13075.99,10.0,21894.0,9.9175,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### 2) Visualisations