In [None]:
import pandas as pd
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Optional: make plots prettier
sns.set(style="whitegrid")

# Location of the dataset, defined once so both read attempts below stay in sync.
# NOTE(review): this is an absolute, machine-specific path; consider making it configurable.
DATA_PATH = r"C:\Users\BOB\Documents\Data Analysis\.data\Global_Terrorism\globalterrorismdb_0718dist.csv"

# Load the dataset: try UTF-8 first and fall back to Latin-1 only when decoding fails
try:
    df = pd.read_csv(DATA_PATH, encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv(DATA_PATH, encoding='latin-1')
print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")

# Show the first few rows
df.head()

In [None]:
# Collect every column label of the loaded frame into a plain Python list
field_names = df.columns.to_list()

# Show the labels in the cell output
print(field_names)

# Put the labels on the system clipboard, one per line, without index or header
names_series = pd.Series(field_names)
names_series.to_clipboard(index=False, header=False)

print("Field names copied to clipboard!")


In [None]:
# Tally nulls per column, then rank columns from most to least incomplete
null_counts = df.isnull().sum()
missing_values = null_counts.sort_values(ascending=False)

# Report only the 20 columns with the most missing values
print(missing_values.head(20))


In [None]:
# Casualty counts: treat a missing value as zero
for casualty_col in ('nkill', 'nwound'):
    df[casualty_col] = df[casualty_col].fillna(0)

# Text labels: replace missing entries with the placeholder 'Unknown'
categorical_cols = ['gname', 'attacktype1_txt', 'target1', 'weaptype1_txt']
filled_labels = df[categorical_cols].fillna('Unknown')
df[categorical_cols] = filled_labels

# Rows lacking any of these key fields are discarded
required_fields = ['iyear', 'country', 'city']
df = df.dropna(subset=required_fields)


In [None]:
# Columns that must hold numbers; values that cannot be parsed become NaN
numeric_cols = ['iyear','imonth','iday','latitude','longitude','nkill','nwound','ransomamt']
coerced_numeric = df[numeric_cols].apply(lambda series: pd.to_numeric(series, errors='coerce'))
df[numeric_cols] = coerced_numeric

# Label columns stored with pandas' 'category' dtype
categorical_cols = ['country_txt','region_txt','city','gname','attacktype1_txt']
as_category = df[categorical_cols].astype('category')
df[categorical_cols] = as_category


In [None]:
# Date sanity checks (bounds inclusive); rows with NaN in a checked column fail the check
year_ok = df['iyear'].between(1970, 2017)  # globalterrorismdb_0718dist ends in 2017
month_ok = df['imonth'].between(1, 12)
day_ok = df['iday'].between(1, 31)

# Coordinate sanity checks
lat_ok = df['latitude'].between(-90, 90)
lon_ok = df['longitude'].between(-180, 180)

# Keep only rows that pass every check
df = df[year_ok & month_ok & day_ok & lat_ok & lon_ok]

# Keep the first row for each event id
df = df.drop_duplicates(subset=['eventid'])


In [None]:
# Print, in order: remaining nulls per column, column dtypes, and a sample of the cleaned rows
post_clean_reports = (
    df.isnull().sum(),
    df.dtypes,
    df.head(),
)
for report in post_clean_reports:
    print(report)


In [None]:
# Categorical Transformation: prepare categorical data for analysis (e.g. one-hot encoding).
# Gather the names of every object- or category-typed column
label_frame = df.select_dtypes(include=['object', 'category'])
categorical_cols = label_frame.columns.to_list()
print(categorical_cols)

In [None]:
# Columns removed before encoding; names absent from the frame are skipped silently
drop_cols = [
    'scite1', 'scite2', 'scite3', 'dbsource',
    'INT_LOG', 'INT_IDEO', 'INT_MISC', 'INT_ANY',
]
df = df.drop(drop_cols, axis='columns', errors='ignore')

In [None]:
# One-hot encode these label columns; the first level of each is dropped
low_card_cols = [
    'attacktype1_txt',
    'region_txt',
    'weaptype1_txt',
    'country_txt',
]
df_encoded = pd.get_dummies(data=df, columns=low_card_cols, drop_first=True)

In [None]:
# Columns replaced by integer codes instead of being one-hot encoded
high_card_cols = ['gname', 'target1']

for col in high_card_cols:
    # factorize returns (codes, uniques); only the integer codes are kept
    codes = pd.factorize(df_encoded[col])[0]
    df_encoded[col] = codes

In [None]:
# Rebuild the encoded frame with sparse dummy columns.
df_encoded = pd.get_dummies(df, columns=low_card_cols, drop_first=True, sparse=True)

# Rebuilding from `df` discards the integer codes written by the label-encoding cell,
# so re-apply them here; otherwise the high-cardinality columns stay as raw labels.
for col in high_card_cols:
    df_encoded[col] = pd.factorize(df_encoded[col])[0]

In [None]:
# Numerical Normalization: normalize or standardize numerical features as required.
# Names of all int64/float64 columns in the encoded frame
numeric_frame = df_encoded.select_dtypes(include=['int64', 'float64'])
numeric_cols = numeric_frame.columns.tolist()
print(numeric_cols)


In [None]:
# Standardize each numeric column: subtract its mean and divide by its standard deviation.
col_mean = df_encoded[numeric_cols].mean()
# A constant column has std == 0, which would turn every value into NaN (0 / 0);
# dividing by 1 instead leaves such a column at 0.
col_std = df_encoded[numeric_cols].std().replace(0, 1)
df_encoded[numeric_cols] = (df_encoded[numeric_cols] - col_mean) / col_std


In [None]:
# Min-max scale each numeric column into [0, 1].
# NOTE(review): the preceding cell already z-scores these columns, and min-max scaling
# gives the same result with or without that step; confirm both are wanted.
col_min = df_encoded[numeric_cols].min()
col_max = df_encoded[numeric_cols].max()
# A constant column has max == min, which would turn every value into NaN (0 / 0);
# dividing by 1 instead leaves such a column at 0.
col_range = (col_max - col_min).replace(0, 1)
df_encoded[numeric_cols] = (df_encoded[numeric_cols] - col_min) / col_range


In [None]:
# Replace any NaN left in the numeric columns with 0
numeric_filled = df_encoded[numeric_cols].fillna(value=0)
df_encoded[numeric_cols] = numeric_filled


In [None]:
# Data Exploration: Conduct exploratory data analysis on the cleaned and integrated dataset.
# Shape and column info
print("Rows, Columns:", df_encoded.shape)
# info() writes its report to stdout itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
df_encoded.info()

# First few rows
print(df_encoded.head())

# Summary statistics for numerical features
print(df_encoded.describe())

# Check missing values
print(df_encoded.isnull().sum())


In [None]:
# Refresh the list of int64/float64 columns from the current encoded frame
numeric_cols = df_encoded.select_dtypes(include=['int64','float64']).columns.tolist()

# One 30-bin histogram per numeric column, drawn on a shared 15x10 figure
numeric_frame = df_encoded[numeric_cols]
numeric_frame.hist(bins=30, figsize=(15,10))
plt.tight_layout()
plt.show()


In [None]:
# Build the per-attack-type counts this plot reads. Nothing earlier in the notebook
# defines `attack_df`, so without this the cell fails on a fresh kernel.
attack_df = df['attacktype1_txt'].value_counts().reset_index()
attack_df.columns = ['attack_type', 'count']

# Horizontal bars: one per attack type, length = number of attacks
sns.barplot(
    x='count',
    y='attack_type',
    data=attack_df,
    hue='attack_type',  # assign hue so the palette is applied per bar
    palette="magma",
    dodge=False,
    legend=False
)
plt.title("Number of Attacks by Type")
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Number of recorded rows (attacks) for each year
attacks_per_year = df.groupby('iyear').size()

# Line chart of the yearly totals, with a marker at each year
fig, ax = plt.subplots(figsize=(12,6))
sns.lineplot(
    x=attacks_per_year.index,
    y=attacks_per_year.values,
    marker="o",
    ax=ax,
)
ax.set_title("Number of Terrorist Attacks per Year")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Attacks")
ax.grid(True)
plt.show()


In [None]:
# Number of attacks recorded for each region
region_counts = df['region_txt'].value_counts()

# Two-column frame (region, count) for plotting
region_df = region_counts.reset_index()
region_df.columns = ['region', 'count']

plt.figure(figsize=(10,6))
# Passing `palette` without `hue` is deprecated in seaborn; assign the y variable
# to `hue` and hide the redundant legend, as the attack-type plot does.
sns.barplot(
    x='count',
    y='region',
    data=region_df,
    hue='region',
    palette='viridis',
    legend=False
)
plt.title("Number of Attacks by Region")
plt.xlabel("Number of Attacks")
plt.ylabel("Region")
plt.show()


In [None]:
# Distribution of deaths per incident, one box per attack type
fig, ax = plt.subplots(figsize=(12,6))
sns.boxplot(data=df, x='attacktype1_txt', y='nkill', ax=ax)
plt.xticks(rotation=45)  # tilt the attack-type tick labels
ax.set_title("Number of Deaths by Attack Type")
ax.set_ylabel("Deaths")
ax.set_xlabel("Attack Type")
plt.show()


In [None]:
# The ten countries with the most recorded attacks
top_countries = df['country_txt'].value_counts().head(10)
top_countries_df = top_countries.reset_index()
top_countries_df.columns = ['country', 'count']
# `country_txt` is cast to 'category' in the type-conversion cell, so this column would
# still carry every country as a category. Casting to str limits the axis to these ten.
# NOTE(review): seaborn is expected to take axis order from the categories; confirm.
top_countries_df['country'] = top_countries_df['country'].astype(str)

plt.figure(figsize=(10,6))
# Passing `palette` without `hue` is deprecated in seaborn; assign the y variable
# to `hue` and hide the redundant legend, as the attack-type plot does.
sns.barplot(
    x='count',
    y='country',
    data=top_countries_df,
    hue='country',
    palette='magma',
    legend=False
)
plt.title("Top 10 Countries by Number of Attacks")
plt.xlabel("Number of Attacks")
plt.ylabel("Country")
plt.show()


In [None]:
# The ten most frequently recorded weapon types
weapon_counts = df['weaptype1_txt'].value_counts().head(10)

# Two-column frame (weapon_type, count) for plotting
weapon_df = weapon_counts.reset_index()
weapon_df.columns = ['weapon_type', 'count']

plt.figure(figsize=(10,6))
# Passing `palette` without `hue` is deprecated in seaborn; assign the y variable
# to `hue` and hide the redundant legend, as the attack-type plot does.
sns.barplot(
    x='count',
    y='weapon_type',
    data=weapon_df,
    hue='weapon_type',
    palette='cubehelix',
    legend=False
)
plt.title("Top 10 Weapon Types Used")
plt.xlabel("Number of Attacks")
plt.ylabel("Weapon Type")
plt.show()


In [None]:
# Scatter of attack coordinates, coloured by region
fig, ax = plt.subplots(figsize=(15,8))
sns.scatterplot(
    data=df,
    x='longitude',
    y='latitude',
    hue='region_txt',
    palette='tab10',
    alpha=0.5,
    ax=ax,
)
ax.set_title("Global Terrorism Attack Locations")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
# Anchor the legend's upper-left corner just outside the right edge of the axes
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
