In [1]:
import pandas as pd

In [2]:
# Load the pre-split event tables, using the first CSV column as the row index.
# The combined file ./data/rev_gtd4.csv is deliberately not loaded here.
testing, training = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("./data/testing.csv", "./data/training.csv")
)

In [3]:
def clean(df):
    """Return a cleaned copy of a raw event frame, leaving `df` itself untouched.

    The returned frame has:
      * `date` parsed with `pd.to_datetime`, and a `year` column derived from it;
      * `city` and any incoming `loc_id` removed;
      * a fresh integer `loc_id`: the category code of the "latitude_longitude"
        string, so rows with identical coordinates share an id;
      * `unique_id`, the `(loc_id, year)` tuple used as the grouping key;
      * the geographic / economic / population / climate columns cast to float.

    `loc_id` codes are derived from the coordinates present in *this* frame
    only, so the same code in two different frames does not identify the
    same place.

    Calling `clean` on an already-cleaned frame is safe: columns that are
    already gone are skipped and the derived columns are recomputed.
    """
    float_cols = [
        'elevation', 'DIS_LAKE', 'DIS_MAJOR_RIVER', 'DIS_OCEAN', 'DIS_RIVER',
        'MER1990_40', 'MER1995_40', 'MER2000_40', 'MER2005_40',
        'POPGPW_1990_40', 'POPGPW_1995_40', 'POPGPW_2000_40', 'POPGPW_2005_40',
        'PRECAVNEW80_08', 'TEMPAV_8008',
    ]

    # drop() hands back a new frame, so every assignment below happens on the
    # copy; errors='ignore' keeps a second clean() of the same frame from failing.
    cleaned = df.drop(columns=['city', 'loc_id'], errors='ignore')

    cleaned['date'] = pd.to_datetime(cleaned['date'])
    cleaned['year'] = cleaned['date'].dt.year

    coords = cleaned['latitude'].astype(str) + '_' + cleaned['longitude'].astype(str)
    cleaned['loc_id'] = coords.astype('category').cat.codes

    # zip builds the tuples directly instead of a row-wise DataFrame.apply.
    cleaned['unique_id'] = list(zip(cleaned['loc_id'], cleaned['year']))

    return cleaned.astype({col: float for col in float_cols})
    

In [4]:
def aggregate(df):
    """Collapse raw event rows into one feature row per `unique_id`.

    `df` is passed through `clean` first, then grouped by `unique_id`
    (the `(loc_id, year)` tuple). The returned frame is indexed by
    `unique_id` and holds, in this column order:

      * `attacktype`, `targettype`, `group_name`: number of distinct values;
      * `nkill`, `nwound`: sum over the group's rows;
      * the geographic, climate, ethnic/religious columns plus `year` and
        `loc_id`: the first value seen in the group;
      * `MER_40`, `POPGPW_40`: the first value seen of the `MER<yyyy>_40` /
        `POPGPW_<yyyy>_40` snapshot column selected below;
      * `attacked`: the constant 1.
    """
    df = clean(df)

    # Group once and reuse; the original regrouped the frame for every column.
    grouped = df.groupby('unique_id')

    def first_value(col):
        # First value of `col` in each group, in order of appearance.
        return grouped[col].unique().apply(lambda values: values[0])

    # Insertion order of this dict is the column order of the result.
    features = {}

    for col in ['attacktype', 'targettype', 'group_name']:
        features[col] = grouped[col].nunique()

    for col in ['nkill', 'nwound']:
        features[col] = grouped[col].sum()

    for col in ['elevation', 'DIS_LAKE',
                'DIS_MAJOR_RIVER', 'DIS_OCEAN', 'DIS_RIVER', 'PRECAVNEW80_08', 'TEMPAV_8008',
                'ethin_div', 'HighRelig', 'ChrCatP', 'ReligCatP', 'year', 'loc_id']:
        features[col] = first_value(col)

    # Pick which 5-yearly snapshot column to read. The choice is made once from
    # the latest year in the whole frame and applied to every group.
    # NOTE(review): a group from an earlier year therefore still gets the
    # snapshot matching the frame's latest year -- confirm this is intended.
    latest_year = df.year.max()
    if latest_year < 1995:
        snapshot_year = 1990
    elif latest_year < 2000:
        snapshot_year = 1995
    elif latest_year < 2005:
        snapshot_year = 2000
    else:
        snapshot_year = 2005

    features['MER_40'] = first_value('MER{}_40'.format(snapshot_year))
    features['POPGPW_40'] = first_value('POPGPW_{}_40'.format(snapshot_year))

    # A mapping passed to concat uses its keys as the resulting column names.
    final_df = pd.concat(features, axis=1)

    final_df['attacked'] = 1

    return final_df

In [5]:
# Build the per-(loc_id, year) feature frames for each split, test first.
test_df, train_df = (aggregate(split) for split in (testing, training))

In [6]:
# Swap the raw event frames for their cleaned versions so the cells below can
# rely on the derived `loc_id` and `year` columns.
testing, training = clean(testing), clean(training)

In [7]:
# Preview the first rows of the aggregated test frame (one row per unique_id).
test_df.head()

Unnamed: 0_level_0,attacktype,targettype,group_name,nkill,nwound,elevation,DIS_LAKE,DIS_MAJOR_RIVER,DIS_OCEAN,DIS_RIVER,...,TEMPAV_8008,ethin_div,HighRelig,ChrCatP,ReligCatP,year,loc_id,MER_40,POPGPW_40,attacked
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"(0, 2017)",1,1,1,0.0,0.0,407.6225,653280.6,756490.4,407622.5,511623.0,...,27.221982,1,Christians,95-100%,95-100%,2017,0,0.038232,19524.49031,1
"(1, 2017)",1,1,1,0.0,3.0,373.0561,136567.2,1543883.0,1239130.0,373056.1,...,21.768104,1,Christians,95-100%,95-100%,2017,1,0.013878,143241.4035,1
"(2, 2017)",1,2,2,19.0,10.0,19.1244,557674.3,2830911.0,19124.4,1464731.0,...,26.140661,1,Muslims,10-40%,40-60%,2017,2,0.01459,47934.83824,1
"(3, 2017)",1,1,1,0.0,0.0,400.3782,413194.3,960858.1,400378.2,754241.3,...,20.193967,1,Christians,95-100%,95-100%,2017,3,0.114008,95943.85463,1
"(4, 2017)",1,2,2,0.0,0.0,825.0164,219463.5,1829711.0,847180.8,825016.4,...,20.927083,2,Christians,75-85%,75-85%,2017,4,0.186762,368985.4073,1


In [8]:
# Preview the first rows of the aggregated training frame (one row per unique_id).
train_df.head()

Unnamed: 0_level_0,attacktype,targettype,group_name,nkill,nwound,elevation,DIS_LAKE,DIS_MAJOR_RIVER,DIS_OCEAN,DIS_RIVER,...,TEMPAV_8008,ethin_div,HighRelig,ChrCatP,ReligCatP,year,loc_id,MER_40,POPGPW_40,attacked
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"(0, 2015)",1,1,1,0.0,0.0,453.2507,645551.1,1004545.0,855846.2,453250.7,...,23.611782,4,Christians,90-95%,90-95%,2015,0,0.000702,1114.13758,1
"(1, 2016)",1,1,1,0.0,1.0,275.3147,393721.6,1215217.0,1110387.0,275314.7,...,22.97658,1,Christians,95-100%,95-100%,2016,1,0.008882,91672.75451,1
"(2, 2012)",1,1,1,5.0,5.0,407.6225,653280.6,756490.4,407622.5,511623.0,...,27.221982,1,Christians,95-100%,95-100%,2012,2,0.038232,19524.49031,1
"(2, 2013)",1,1,1,1.0,0.0,407.6225,653280.6,756490.4,407622.5,511623.0,...,27.221982,1,Christians,95-100%,95-100%,2013,2,0.038232,19524.49031,1
"(2, 2015)",1,1,1,1.0,2.0,407.6225,653280.6,756490.4,407622.5,511623.0,...,27.221982,2,Christians,95-100%,95-100%,2015,2,0.038232,19524.49031,1


In [None]:
#df = clean(df)

In [9]:
def sum_past_k_year_data(df, col, loc_id, year, k):
    """Sum `col` over the rows of `df` at `loc_id` within a k-year window.

    The window is inclusive on both ends and covers the years
    `year - k + 1` through `year`, so it includes `year` itself.

    Parameters
    ----------
    df : DataFrame with `loc_id`, `year` and `col` columns.
    col : name of the numeric column to total.
    loc_id : location code to select.
    year : last year of the window.
    k : window length in years.

    Returns
    -------
    The total of `col` over the selected rows (missing values are skipped);
    0 when no row matches, including when `loc_id` does not occur in `df`.
    """
    # Select the rows directly instead of grouping and summing the entire
    # frame on every call: only `col` is touched, so non-numeric columns such
    # as dates or strings can no longer break the sum.
    in_window = (df['loc_id'] == loc_id) & df['year'].between(year - k + 1, year)

    return df.loc[in_window, col].sum()


def count_past_k_year_data(df, col, loc_id, year, k):
    """Count the distinct values of `col` at `loc_id` within a k-year window.

    The window is inclusive on both ends and covers the years
    `year - k + 1` through `year`, so it includes `year` itself.

    Parameters
    ----------
    df : DataFrame with `loc_id`, `year` and `col` columns.
    col : name of the column whose distinct values are counted.
    loc_id : location code to select.
    year : last year of the window.
    k : window length in years.

    Returns
    -------
    int: number of distinct non-missing values of `col` in the selected
    rows; 0 when no row matches, including when `loc_id` does not occur
    in `df`.
    """
    # Filter to the relevant rows first rather than grouping and counting the
    # whole frame on every call.
    in_window = (df['loc_id'] == loc_id) & df['year'].between(year - k + 1, year)

    # nunique() ignores missing values.
    return int(df.loc[in_window, col].nunique())

In [None]:
# Spot check: total `nkill` at loc_id 0 over the 100 years ending in 2015.
# The call used to pass `df`, whose load is commented out in the visible cells,
# so it failed with NameError on a fresh run; the cleaned training frame is
# used instead.
sum_past_k_year_data(training, 'nkill', 0, 2015, 100)

In [None]:
#count_past_k_year_data(df, 'targsubtype1_txt', 0, 2015, 100)

In [None]:
#k = 5
#arg = [df, 'nkill', row['loc_id'], row['year'], 5]
#final_df['nkill_testing'] = final_df.apply(lambda row : sum_past_k_year_data(df, 'nkill', row['loc_id'], row['year'], 5), axis=1)

In [10]:
# Length of the look-back window in years; the window ends at, and includes,
# each row's own year.
k = 5

# Pair each windowed helper with the raw columns it summarises:
# totals for the casualty counts, distinct-value counts for the categoricals.
past_k_features = [
    (sum_past_k_year_data, ['nkill', 'nwound']),
    (count_past_k_year_data, ['attacktype', 'targettype', 'group_name']),
]

for summarise, cols in past_k_features:
    for col in cols:
        # `k` drives both the column name and the window, so changing it in
        # one place keeps them consistent.
        col_name = col + '_past_{}'.format(k)
        test_df[col_name] = test_df.apply(
            lambda row: summarise(testing, col, row['loc_id'], row['year'], k),
            axis=1,
        )

In [None]:
# Write the aggregated test features to disk. The original line wrote
# `final_df`, whose assignment is commented out in the visible cells, so it
# failed with NameError on a fresh run.
test_df.to_csv('./test.csv')