In [103]:
import pandas as pd
import warnings

warnings.filterwarnings("ignore")

# Set option to show all columns
pd.set_option('display.max_columns', None)

In [104]:
# Loading the global terrorism
globalterrorism_df = pd.read_csv("../datasets/globalterrorism.csv", encoding="ISO-8859-1", lineterminator="\n")

# Verify it's columns
globalterrorism_df.head()

Unnamed: 0,eventid,iyear,imonth,iday,approxdate,extended,resolution,country,country_txt,region,region_txt,provstate,city,latitude,longitude,specificity,vicinity,location,summary,crit1,crit2,crit3,doubtterr,alternative,alternative_txt,multiple,success,suicide,attacktype1,attacktype1_txt,attacktype2,attacktype2_txt,attacktype3,attacktype3_txt,targtype1,targtype1_txt,targsubtype1,targsubtype1_txt,corp1,target1,natlty1,natlty1_txt,targtype2,targtype2_txt,targsubtype2,targsubtype2_txt,corp2,target2,natlty2,natlty2_txt,targtype3,targtype3_txt,targsubtype3,targsubtype3_txt,corp3,target3,natlty3,natlty3_txt,gname,gsubname,gname2,gsubname2,gname3,gsubname3,motive,guncertain1,guncertain2,guncertain3,individual,nperps,nperpcap,claimed,claimmode,claimmode_txt,claim2,claimmode2,claimmode2_txt,claim3,claimmode3,claimmode3_txt,compclaim,weaptype1,weaptype1_txt,weapsubtype1,weapsubtype1_txt,weaptype2,weaptype2_txt,weapsubtype2,weapsubtype2_txt,weaptype3,weaptype3_txt,weapsubtype3,weapsubtype3_txt,weaptype4,weaptype4_txt,weapsubtype4,weapsubtype4_txt,weapdetail,nkill,nkillus,nkillter,nwound,nwoundus,nwoundte,property,propextent,propextent_txt,propvalue,propcomment,ishostkid,nhostkid,nhostkidus,nhours,ndays,divert,kidhijcountry,ransom,ransomamt,ransomamtus,ransompaid,ransompaidus,ransomnote,hostkidoutcome,hostkidoutcome_txt,nreleased,addnotes,scite1,scite2,scite3,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY,related\r
0,197000000001,1970,7,2,,0,,58,Dominican Republic,2,Central America & Caribbean,,Santo Domingo,18.456792,-69.951164,1.0,0,,,1,1,1,0.0,,,0.0,1,0,1,Assassination,,,,,14,Private Citizens & Property,68.0,Named Civilian,,Julio Guzman,58.0,Dominican Republic,,,,,,,,,,,,,,,,,MANO-D,,,,,,,0.0,,,0,,,,,,,,,,,,,13,Unknown,,,,,,,,,,,,,,,,1.0,,,0.0,,,0,,,,,0.0,,,,,,,0.0,,,,,,,,,,,,,PGIS,0,0,0,0,\r
1,197000000002,1970,0,0,,0,,130,Mexico,1,North America,Federal,Mexico city,19.371887,-99.086624,1.0,0,,,1,1,1,0.0,,,0.0,1,0,6,Hostage Taking (Kidnapping),,,,,7,Government (Diplomatic),45.0,"Diplomatic Personnel (outside of embassy, cons...",Belgian Ambassador Daughter,"Nadine Chaval, daughter",21.0,Belgium,,,,,,,,,,,,,,,,,23rd of September Communist League,,,,,,,0.0,,,0,7.0,,,,,,,,,,,,13,Unknown,,,,,,,,,,,,,,,,0.0,,,0.0,,,0,,,,,1.0,1.0,0.0,,,,Mexico,1.0,800000.0,,,,,,,,,,,,PGIS,0,1,1,1,\r
2,197001000001,1970,1,0,,0,,160,Philippines,5,Southeast Asia,Tarlac,Unknown,15.478598,120.599741,4.0,0,,,1,1,1,0.0,,,0.0,1,0,1,Assassination,,,,,10,Journalists & Media,54.0,Radio Journalist/Staff/Facility,Voice of America,Employee,217.0,United States,,,,,,,,,,,,,,,,,Unknown,,,,,,,0.0,,,0,,,,,,,,,,,,,13,Unknown,,,,,,,,,,,,,,,,1.0,,,0.0,,,0,,,,,0.0,,,,,,,0.0,,,,,,,,,,,,,PGIS,-9,-9,1,1,\r
3,197001000002,1970,1,0,,0,,78,Greece,8,Western Europe,Attica,Athens,37.99749,23.762728,1.0,0,,,1,1,1,0.0,,,0.0,1,0,3,Bombing/Explosion,,,,,7,Government (Diplomatic),46.0,Embassy/Consulate,,U.S. Embassy,217.0,United States,,,,,,,,,,,,,,,,,Unknown,,,,,,,0.0,,,0,,,,,,,,,,,,,6,Explosives,16.0,Unknown Explosive Type,,,,,,,,,,,,,Explosive,,,,,,,1,,,,,0.0,,,,,,,0.0,,,,,,,,,,,,,PGIS,-9,-9,1,1,\r
4,197001000003,1970,1,0,,0,,101,Japan,4,East Asia,Fukouka,Fukouka,33.580412,130.396361,1.0,0,,,1,1,1,-9.0,,,0.0,1,0,7,Facility/Infrastructure Attack,,,,,7,Government (Diplomatic),46.0,Embassy/Consulate,,U.S. Consulate,217.0,United States,,,,,,,,,,,,,,,,,Unknown,,,,,,,0.0,,,0,,,,,,,,,,,,,8,Incendiary,,,,,,,,,,,,,,,Incendiary,,,,,,,1,,,,,0.0,,,,,,,0.0,,,,,,,,,,,,,PGIS,-9,-9,1,1,\r


In [105]:
# Check for null values in the dataset

pd.set_option("display.max_rows", None)  # Show all rows
pd.set_option("display.max_columns", None)  # Show all columns


# Calculate the percentage of null (NaN) values in each column
null_percentage = globalterrorism_df.isnull().mean() * 100

# Display the percentage of null values in each column
print(null_percentage)



# The following columns will be dropped due to redundancy, missing values, or low relevance:
# - approxdate: Not essential (attack date is already covered by iyear, imonth, iday)
# - resolution: Too many missing values, not useful for most analyses
# - location: Redundant (province/state & city already exist)
# - summary: Optional text description, can be removed for efficiency
# - alternative, alternative_txt: Rarely used, unclear relevance
# - related: Often missing, not crucial for most analyses
# - gsubname, gsubname2, gsubname3: Secondary group names, mostly empty
# - weaptype2, weaptype3, weaptype4: Most attacks involve a single weapon type
# - weapsubtype1, weapsubtype2, weapsubtype3, weapsubtype4: Detailed weapon breakdown, not always needed
# - weapdetail: Often missing, very specific information
# - claimmode, claimmode2, claimmode3: Claim details, frequently unavailable
# - propextent, propvalue, propcomment: Property damage details, not essential
# - nhostkid, nhostkidus, nhours, ndays: Hostage data, often incomplete
# - ransom, ransomamt, ransompaid, ransompaidus: Ransom information, rare cases
# - hostkidoutcome, nreleased: Hostage outcomes, too many missing values
# - scite1, scite2, scite3: Citation sources, not useful for analysis



eventid                0.000000
iyear                  0.000000
imonth                 0.000000
iday                   0.000000
approxdate            94.914993
extended               0.000000
resolution            98.778145
country                0.000000
country_txt            0.000000
region                 0.000000
region_txt             0.000000
provstate              0.231712
city                   0.239417
latitude               2.507554
longitude              2.508104
specificity            0.003302
vicinity               0.000000
location              69.456385
summary               36.396409
crit1                  0.000000
crit2                  0.000000
crit3                  0.000000
doubtterr              0.000550
alternative           84.032781
alternative_txt       84.032781
multiple               0.000550
success                0.000000
suicide                0.000000
attacktype1            0.000000
attacktype1_txt        0.000000
attacktype2           96.524869
attackty

In [106]:
columns_to_drop = [
    "approxdate", "resolution", "location", "summary", "alternative", "alternative_txt",
    "gsubname", "gsubname2", "gsubname3", "weaptype2", "weaptype3", "weaptype4",
    "weapsubtype1", "weapsubtype2", "weapsubtype3", "weapsubtype4", "weapdetail",
    "claimmode", "claimmode2", "claimmode3", "propextent", "propvalue", "propcomment",
    "nhostkid", "nhostkidus", "nhours", "ndays", "ransom", "ransomamt", "ransompaid", "ransompaidus",
    "ransomnote", "hostkidoutcome", "nreleased", "scite1", "scite2", "scite3", "addnotes","INT_LOG",
    "INT_IDEO",	"INT_MISC",	"INT_ANY","country", "region",
    
    # Additional columns with >60% missing data
    "targtype2", "targtype2_txt", "targsubtype2", "targsubtype2_txt", "corp2", "target2", "natlty2", "natlty2_txt",
    "targtype3", "targtype3_txt", "targsubtype3", "targsubtype3_txt", "corp3", "target3", "natlty3", "natlty3_txt",
    "guncertain2", "guncertain3", "claim2", "claimmode2", "claimmode2_txt", "claim3", "claimmode3", "claimmode3_txt",
    "weaptype2", "weaptype2_txt", "weapsubtype2", "weapsubtype2_txt", "weaptype3", "weaptype3_txt", "weapsubtype3", "weapsubtype3_txt",
    "weaptype4", "weaptype4_txt", "weapsubtype4", "weapsubtype4_txt", "ransomamtus", "ransomnote"
]


globalterrorism_df.drop(columns=columns_to_drop, inplace=True)


In [107]:
# List of columns with missing values (excluding those in columns_to_drop)
columns_with_missing = [
    'provstate', 'city', 'latitude', 'longitude', 'specificity', 'doubtterr', 
    'targsubtype1', 'corp1', 'target1', 'natlty1', 'natlty1_txt', 'targsubtype1_txt',
    'nperps', 'nperpcap', 'claimed', 'nkill', 'nkillus', 'nkillter', 'nwound', 'nwoundus', 
    'nwoundte', 'property', 'ishostkid', 'guncertain1', 'individual', 'attacktype2', 'attacktype2_txt', 'attacktype3', 'attacktype3_txt',
    'gname2', 'gname3', 'motive', 'claimmode_txt', 'compclaim', 
    'divert', 'kidhijcountry', 'hostkidoutcome_txt'
]

# Fill numerical columns with the median or mean
numerical_columns = [
    'latitude', 'longitude', 'specificity', 'doubtterr', 'targsubtype1', 'nperps', 'nperpcap', 
    'claimed', 'nkill', 'nkillus', 'nkillter', 'nwound', 'nwoundus', 'nwoundte'
]

# Fill categorical columns with the mode (most frequent value)
categorical_columns = [
    'provstate', 'city', 'targsubtype1_txt', 'corp1', 'target1', 'natlty1', 'natlty1_txt', 
    'targsubtype1_txt', 'property', 'ishostkid', 'guncertain1', 'individual'
]

# Fill missing values in numerical columns with median (or mean if you prefer)
for col in numerical_columns:
    if col in globalterrorism_df.columns:
        globalterrorism_df[col] = globalterrorism_df[col].fillna(globalterrorism_df[col].median())

# Fill missing values in categorical columns with mode
for col in categorical_columns:
    if col in globalterrorism_df.columns:
        globalterrorism_df[col] = globalterrorism_df[col].fillna(globalterrorism_df[col].mode()[0])

# Check the filled columns
print(globalterrorism_df.isnull().sum())


eventid                    0
iyear                      0
imonth                     0
iday                       0
extended                   0
country_txt                0
region_txt                 0
provstate                  0
city                       0
latitude                   0
longitude                  0
specificity                0
vicinity                   0
crit1                      0
crit2                      0
crit3                      0
doubtterr                  0
multiple                   1
success                    0
suicide                    0
attacktype1                0
attacktype1_txt            0
attacktype2           175377
attacktype2_txt       175377
attacktype3           181263
attacktype3_txt       181263
targtype1                  0
targtype1_txt              0
targsubtype1               0
targsubtype1_txt           0
corp1                      0
target1                    0
natlty1                    0
natlty1_txt                0
gname         

In [108]:
# Calculate the percentage of NaN values in each column
nan_percentage = globalterrorism_df.isna().mean() * 100

# Display the percentage of NaN values in each column
print(nan_percentage)


eventid                0.000000
iyear                  0.000000
imonth                 0.000000
iday                   0.000000
extended               0.000000
country_txt            0.000000
region_txt             0.000000
provstate              0.000000
city                   0.000000
latitude               0.000000
longitude              0.000000
specificity            0.000000
vicinity               0.000000
crit1                  0.000000
crit2                  0.000000
crit3                  0.000000
doubtterr              0.000000
multiple               0.000550
success                0.000000
suicide                0.000000
attacktype1            0.000000
attacktype1_txt        0.000000
attacktype2           96.524869
attacktype2_txt       96.524869
attacktype3           99.764435
attacktype3_txt       99.764435
targtype1              0.000000
targtype1_txt          0.000000
targsubtype1           0.000000
targsubtype1_txt       0.000000
corp1                  0.000000
target1 

In [109]:
# Calculate the percentage of missing values for each column
missing_percentage = globalterrorism_df.isnull().mean()

# Identify columns with more than 60% missing values
columns_to_drop = missing_percentage[missing_percentage > 0.60].index

# Drop the identified columns
globalterrorism_df = globalterrorism_df.drop(columns=columns_to_drop)

# Verify the changes
print(globalterrorism_df.isnull().sum())


eventid                 0
iyear                   0
imonth                  0
iday                    0
extended                0
country_txt             0
region_txt              0
provstate               0
city                    0
latitude                0
longitude               0
specificity             0
vicinity                0
crit1                   0
crit2                   0
crit3                   0
doubtterr               0
multiple                1
success                 0
suicide                 0
attacktype1             0
attacktype1_txt         0
targtype1               0
targtype1_txt           0
targsubtype1            0
targsubtype1_txt        0
corp1                   0
target1                 0
natlty1                 0
natlty1_txt             0
gname                   0
guncertain1             0
individual              0
nperps                  0
nperpcap                0
claimed                 0
weaptype1               0
weaptype1_txt           0
weapsubtype1

In [110]:
# Fill missing values for numeric columns with 0 (or use another method as appropriate)
globalterrorism_df['weapsubtype1_txt'].fillna('Unknown', inplace=True)

# Alternatively, for other columns, you can fill missing values based on the column type
globalterrorism_df.fillna({
    'weapsubtype1_txt': 'Unknown',  # for categorical columns
    # You can add more columns if needed, with different strategies
}, inplace=True)

# Or you can use forward or backward filling for other columns
globalterrorism_df.fillna(method='ffill', inplace=True)

# Verify that all missing data is handled
print(globalterrorism_df.isnull().sum())


eventid             0
iyear               0
imonth              0
iday                0
extended            0
country_txt         0
region_txt          0
provstate           0
city                0
latitude            0
longitude           0
specificity         0
vicinity            0
crit1               0
crit2               0
crit3               0
doubtterr           0
multiple            0
success             0
suicide             0
attacktype1         0
attacktype1_txt     0
targtype1           0
targtype1_txt       0
targsubtype1        0
targsubtype1_txt    0
corp1               0
target1             0
natlty1             0
natlty1_txt         0
gname               0
guncertain1         0
individual          0
nperps              0
nperpcap            0
claimed             0
weaptype1           0
weaptype1_txt       0
weapsubtype1_txt    0
nkill               0
nkillus             0
nkillter            0
nwound              0
nwoundus            0
nwoundte            0
property  

In [111]:
globalterrorism_df.head()

Unnamed: 0,eventid,iyear,imonth,iday,extended,country_txt,region_txt,provstate,city,latitude,longitude,specificity,vicinity,crit1,crit2,crit3,doubtterr,multiple,success,suicide,attacktype1,attacktype1_txt,targtype1,targtype1_txt,targsubtype1,targsubtype1_txt,corp1,target1,natlty1,natlty1_txt,gname,guncertain1,individual,nperps,nperpcap,claimed,weaptype1,weaptype1_txt,weapsubtype1_txt,nkill,nkillus,nkillter,nwound,nwoundus,nwoundte,property,ishostkid,dbsource,related\r
0,197000000001,1970,7,2,0,Dominican Republic,Central America & Caribbean,Baghdad,Santo Domingo,18.456792,-69.951164,1.0,0,1,1,1,0.0,0.0,1,0,1,Assassination,14,Private Citizens & Property,68.0,Named Civilian,Unknown,Julio Guzman,58.0,Dominican Republic,MANO-D,0.0,0,-99.0,0.0,0.0,13,Unknown,Unknown,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,PGIS,\r
1,197000000002,1970,0,0,0,Mexico,North America,Federal,Mexico city,19.371887,-99.086624,1.0,0,1,1,1,0.0,0.0,1,0,6,Hostage Taking (Kidnapping),7,Government (Diplomatic),45.0,"Diplomatic Personnel (outside of embassy, cons...",Belgian Ambassador Daughter,"Nadine Chaval, daughter",21.0,Belgium,23rd of September Communist League,0.0,0,7.0,0.0,0.0,13,Unknown,Unknown,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,PGIS,\r
2,197001000001,1970,1,0,0,Philippines,Southeast Asia,Tarlac,Unknown,15.478598,120.599741,4.0,0,1,1,1,0.0,0.0,1,0,1,Assassination,10,Journalists & Media,54.0,Radio Journalist/Staff/Facility,Voice of America,Employee,217.0,United States,Unknown,0.0,0,-99.0,0.0,0.0,13,Unknown,Unknown,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,PGIS,\r
3,197001000002,1970,1,0,0,Greece,Western Europe,Attica,Athens,37.99749,23.762728,1.0,0,1,1,1,0.0,0.0,1,0,3,Bombing/Explosion,7,Government (Diplomatic),46.0,Embassy/Consulate,Unknown,U.S. Embassy,217.0,United States,Unknown,0.0,0,-99.0,0.0,0.0,6,Explosives,Unknown Explosive Type,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,PGIS,\r
4,197001000003,1970,1,0,0,Japan,East Asia,Fukouka,Fukouka,33.580412,130.396361,1.0,0,1,1,1,-9.0,0.0,1,0,7,Facility/Infrastructure Attack,7,Government (Diplomatic),46.0,Embassy/Consulate,Unknown,U.S. Consulate,217.0,United States,Unknown,0.0,0,-99.0,0.0,0.0,8,Incendiary,Unknown,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,PGIS,\r


In [None]:

# Load country dataset
country_df = pd.read_csv('../datasets/world-data-2023.csv')

print('Country datset columns')
country_df.columns


# Start by removing some of the redudant
columns_to_drop = [
    'Abbreviation', 'Calling Code', 'Capital/Major City', 'Largest city',
    'Population: Labor force participation (%)', 'Urban_population', 'Currency-Code',
    'Gasoline Price', 'Fertility Rate'
]

# Drop the specified columns
country_df = country_df.drop(columns=columns_to_drop)

# Display the first few rows to confirm the columns were dropped
country_df.head()




Country datset columns


Unnamed: 0,Country,Density\n(P/Km2),Agricultural Land( %),Land Area(Km2),Armed Forces size,Birth Rate,Co2-Emissions,CPI,CPI Change (%),Forested Area (%),GDP,Gross primary education enrollment (%),Gross tertiary education enrollment (%),Infant mortality,Life expectancy,Maternal mortality ratio,Minimum wage,Official language,Out of pocket health expenditure,Physicians per thousand,Population,Tax revenue (%),Total tax rate,Unemployment rate,Latitude,Longitude
0,Afghanistan,60,58.10%,652230,323000.0,32.49,8672,149.9,2.30%,2.10%,"$19,101,353,833",104.00%,9.70%,47.9,64.5,638.0,$0.43,Pashto,78.40%,0.28,38041754,9.30%,71.40%,11.12%,33.93911,67.709953
1,Albania,105,43.10%,28748,9000.0,11.78,4536,119.05,1.40%,28.10%,"$15,278,077,447",107.00%,55.00%,7.8,78.5,15.0,$1.12,Albanian,56.90%,1.2,2854191,18.60%,36.60%,12.33%,41.153332,20.168331
2,Algeria,18,17.40%,2381741,317000.0,24.28,150006,151.36,2.00%,0.80%,"$169,988,236,398",109.90%,51.40%,20.1,76.7,112.0,$0.95,Arabic,28.10%,1.72,43053054,37.20%,66.10%,11.70%,28.033886,1.659626
3,Andorra,164,40.00%,468,,7.2,469,,,34.00%,"$3,154,057,987",106.40%,,2.7,,,$6.63,Catalan,36.40%,3.33,77142,,,,42.506285,1.521801
4,Angola,26,47.50%,1246700,117000.0,40.73,34693,261.73,17.10%,46.30%,"$94,635,415,870",113.50%,9.30%,51.6,60.8,241.0,$0.71,Portuguese,33.40%,0.21,31825295,9.20%,49.10%,6.89%,-11.202692,17.873887


In [None]:
# Check for null values
country_df.isnull().mean() * 100

Country                                     0.000000
Density\n(P/Km2)                            0.000000
Agricultural Land( %)                       3.589744
Land Area(Km2)                              0.512821
Armed Forces size                          12.307692
Birth Rate                                  3.076923
Co2-Emissions                               3.589744
CPI                                         8.717949
CPI Change (%)                              8.205128
Forested Area (%)                           3.589744
GDP                                         1.025641
Gross primary education enrollment (%)      3.589744
Gross tertiary education enrollment (%)     6.153846
Infant mortality                            3.076923
Life expectancy                             4.102564
Maternal mortality ratio                    7.179487
Minimum wage                               23.076923
Official language                           2.564103
Out of pocket health expenditure            3.

In [135]:
# Rename the 'country' column in the country dataset to match the terrorism dataset
globalterrorism_df.rename(columns={'country_txt': 'Country'}, inplace=True)

# Merge the datasets on the 'country_txt' column
merged_df = pd.merge(globalterrorism_df, country_df, on='Country', how='left')

# Now you can inspect the merged dataset
merged_df.describe()

Unnamed: 0,eventid,iyear,imonth,iday,extended,latitude,longitude,specificity,vicinity,crit1,crit2,crit3,doubtterr,multiple,success,suicide,attacktype1,targtype1,targsubtype1,natlty1,guncertain1,individual,nperps,nperpcap,claimed,weaptype1,nkill,nkillus,nkillter,nwound,nwoundus,nwoundte,property,ishostkid,Birth Rate,Infant mortality,Life expectancy,Maternal mortality ratio,Physicians per thousand,Latitude,Longitude
count,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,177385.0,177385.0,177385.0,177385.0,177385.0,177385.0,177385.0
mean,200270500000.0,2002.638997,6.467277,15.505644,0.045346,23.698173,-446.1064,1.451437,0.068297,0.98853,0.993093,0.875668,-0.523168,0.137772,0.889598,0.036507,3.247547,8.439719,46.288005,127.405975,0.08127,0.00295,-78.52761,-0.937262,0.031592,6.447325,2.26686,0.029671,0.320825,2.883296,0.025076,0.066382,-0.544556,0.058996,22.430731,25.736236,71.616453,170.212306,1.346686,22.859725,27.376788
std,1325957000.0,13.25943,3.388303,8.814045,0.208063,18.377236,202194.6,0.995416,0.284553,0.106483,0.082823,0.329961,2.455813,0.344662,0.313391,0.187549,1.915772,6.653838,30.18485,88.966272,0.27325,0.054234,169.721125,10.109514,0.872204,2.173435,11.227057,4.564308,3.346474,34.309747,2.453378,1.172976,3.122889,0.461022,8.666184,20.20331,6.562958,230.459136,1.161462,18.434684,59.216634
min,197000000000.0,1970.0,0.0,0.0,0.0,-53.154613,-86185900.0,1.0,-9.0,0.0,0.0,0.0,-9.0,0.0,0.0,0.0,1.0,1.0,1.0,4.0,0.0,0.0,-99.0,-99.0,-9.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-9.0,-9.0,6.4,1.4,52.8,2.0,0.01,-40.900557,-106.346771
25%,199102100000.0,1991.0,4.0,8.0,0.0,11.84962,6.655,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,2.0,3.0,23.0,83.0,0.0,0.0,-99.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.03,10.2,67.1,37.0,0.6,12.879721,1.659626
50%,200902200000.0,2009.0,6.0,15.0,0.0,31.467463,43.24651,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,3.0,4.0,35.0,98.0,0.0,0.0,-99.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,20.55,22.5,71.1,83.0,0.91,28.394857,43.679291
75%,201408100000.0,2014.0,9.0,23.0,0.0,34.538561,68.35734,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,3.0,14.0,73.0,168.0,0.0,0.0,-99.0,0.0,0.0,6.0,2.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,29.08,38.0,76.8,145.0,1.95,33.854721,69.345116
max,201712300000.0,2017.0,12.0,31.0,1.0,74.633553,179.3667,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,9.0,22.0,113.0,1004.0,1.0,1.0,25000.0,406.0,1.0,13.0,1570.0,1360.0,500.0,8191.0,751.0,200.0,1.0,1.0,46.08,84.5,84.2,1150.0,8.42,64.963051,178.065032


In [None]:
# Separate numeric and categorical columns
numeric_columns = merged_df.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = merged_df.select_dtypes(include=['object']).columns

# Fill missing values for numeric columns using mean
merged_df[numeric_columns] = merged_df[numeric_columns].fillna(merged_df[numeric_columns].mean())

# Alternatively, you can fill numeric columns with median if preferred:
# merged_df[numeric_columns] = merged_df[numeric_columns].fillna(merged_df[numeric_columns].median())

# Fill missing values for categorical columns using mode
merged_df[categorical_columns] = merged_df[categorical_columns].apply(lambda x: x.fillna(x.mode()[0]), axis=0)

# Verify that there are no missing values left
print(merged_df.isnull().sum())


eventid                                    0
iyear                                      0
imonth                                     0
iday                                       0
extended                                   0
Country                                    0
region_txt                                 0
provstate                                  0
city                                       0
latitude                                   0
longitude                                  0
specificity                                0
vicinity                                   0
crit1                                      0
crit2                                      0
crit3                                      0
doubtterr                                  0
multiple                                   0
success                                    0
suicide                                    0
attacktype1                                0
attacktype1_txt                            0
targtype1 

In [139]:
# Check for duplicate rows in merged_df
duplicates = merged_df.duplicated().sum()

# Print the number of duplicate rows
print(f"Number of duplicate rows: {duplicates}")

Number of duplicate rows: 0
