In [1]:
import numpy as np
import pandas as pd
import os
import networkx as nx
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.metrics import classification_report
from collections import Counter
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [2]:
path = '../../../data/top30groups/engineered_dfs/df_top30_100.csv'
data = pd.read_csv(path, encoding='ISO-8859-1')

In [3]:
def create_unique_geo_data(df):
    # Convert to datetime and sort
    df['attack_date'] = pd.to_datetime({'year': df['iyear'], 'month': df['imonth'], 'day': df['iday']})
    df.sort_values(by=['longitude', 'latitude', 'attack_date'], inplace=True)

    # Keep only relevant columns
    df = df.drop(columns=['Unnamed: 0', 'country', 'city', 'region', 'provstate', 'natlty1', 'specificity', 'iyear', 'imonth', 'iday'])
    

    df.sort_values(by=['gname', 'attack_date'], inplace=True)

    df = df.drop(columns='attack_date')
    print("KOLUMNER!!!!!!!!!!!!!!!!!!!!!!!!!!!!!", df.columns)
    # Drop duplicates based on location, keep the earliest attack
    #df_unique = df.drop_duplicates(subset=['longitude', 'latitude'], keep='first').reset_index(drop=True)

    return df

In [4]:
def encode_longlat(df):
    # Step 1: Create new 'longlat' feature as tuple
    df['longlat'] = list(zip(df['longitude'], df['latitude']))

    # Step 2: One-hot encode the new 'longlat' feature
    longlat_encoded = pd.get_dummies(df['longlat'], prefix='loc')

    # Step 3: Drop original longitude and latitude
    df = df.drop(columns=['longitude', 'latitude', 'longlat'])

    # Step 4: Concatenate the one-hot encoded features
    df = pd.concat([df, longlat_encoded], axis=1)

    return df


In [5]:
# Filter dataset to only contain unique coordinates
print("Entries before dropping long/lat duplicates: ", data.shape)
df_unique_geo = create_unique_geo_data(data)
print("Entries after dropping long/lat duplicates (#Nodes): ", df_unique_geo.shape)


Entries before dropping long/lat duplicates:  (3000, 26)
KOLUMNER!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Index(['extended', 'latitude', 'longitude', 'vicinity', 'multiple', 'success',
       'suicide', 'attacktype1', 'targtype1', 'target1', 'individual',
       'weaptype1', 'nkill', 'property', 'ishostkid', 'gname'],
      dtype='object')
Entries after dropping long/lat duplicates (#Nodes):  (3000, 16)


In [6]:
df_unique_geo = encode_longlat(df_unique_geo)
df_unique_geo.shape

(3000, 1804)

In [7]:
from sklearn.preprocessing import StandardScaler

# creates train and test data, first 70% of each group is added to train and remaining 30% to test
def handle_leakage(df):
    train_frames = []
    test_frames = []

    split_point = int(len(df) * 0.7)  # 70% for training
    train_df = df[:split_point]
    test_df = df[split_point:]          

    # Shuffle each DataFrame separately
    train_df = shuffle(train_df)
    test_df = shuffle(test_df)

       # Identify numeric columns (excluding label columns like 'gname')
    numeric_cols = df.select_dtypes(include=['number']).columns

    # Scale train data
    scaler = StandardScaler()
    train_df[numeric_cols] = scaler.fit_transform(train_df[numeric_cols])

    # Apply same scaler to test data
    test_df[numeric_cols] = scaler.transform(test_df[numeric_cols])

    print(train_df.shape)

    return train_df, test_df

In [8]:
train_df, test_df = handle_leakage(df_unique_geo)

(2100, 1804)


In [9]:
test_df.shape

(900, 1804)

In [10]:
non_numeric_cols = train_df.select_dtypes(exclude=['number', 'bool']).columns
print(non_numeric_cols)

Index(['gname'], dtype='object')


In [11]:
non_numeric_cols = test_df.select_dtypes(exclude=['number', 'bool']).columns
print(non_numeric_cols)

Index(['gname'], dtype='object')


In [12]:
train_df.to_csv("LongLatCombined/train100.csv")
test_df.to_csv("LongLatCombined/test100.csv")