In [1]:
import pandas as pd

In [2]:
# Load the source table from ./data/rev_gtd4.csv, using the file's first column as the index.
df = pd.read_csv("./data/rev_gtd4.csv", index_col=0)

In [6]:
def clean(df):
    """Return a cleaned copy of the event table; the input frame is not modified.

    Steps applied to the copy:

    * drop ``city`` and any existing ``loc_id`` (absent columns are ignored,
      so a frame that has already been cleaned can be passed again);
    * parse ``date`` with ``pd.to_datetime`` and derive ``year`` from it;
    * build a new integer ``loc_id``: the category code of the string
      ``"<latitude>_<longitude>"``;
    * add ``unique_id``, the ``(loc_id, year)`` tuple of each row;
    * cast every column named in ``float_cols`` to float.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date``, ``latitude``, ``longitude`` and all columns
        named in ``float_cols``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the changes above.

    Raises
    ------
    KeyError
        If a required column is missing.
    """
    float_cols = [
        'elevation', 'DIS_LAKE',
        'DIS_MAJOR_RIVER', 'DIS_OCEAN', 'DIS_RIVER', 'MER1990_40', 'MER1995_40',
        'MER2000_40', 'MER2005_40', 'POPGPW_1990_40', 'POPGPW_1995_40',
        'POPGPW_2000_40', 'POPGPW_2005_40', 'PRECAVNEW80_08', 'TEMPAV_8008',
    ]

    # drop() hands back a new frame, and assign() copies again, so nothing
    # below writes into the caller's object.
    out = df.drop(columns=['city', 'loc_id'], errors='ignore')
    out = out.assign(date=pd.to_datetime(out['date']))
    out['year'] = out['date'].dt.year

    # One integer code per distinct latitude/longitude pair.
    coord_key = out['latitude'].astype(str) + '_' + out['longitude'].astype(str)
    out = out.assign(loc_id=coord_key.astype('category').cat.codes)
    out['unique_id'] = out[['loc_id', 'year']].apply(tuple, axis=1)

    for col in float_cols:
        out[col] = out[col].astype(float)

    return out
    

In [11]:
def aggregate(df):
    """Summarise the event table into one row per ``unique_id``.

    The frame is first passed through ``clean``. For every ``unique_id``
    group the result then holds:

    * the number of distinct values of ``attacktype``, ``targettype`` and
      ``group_name``;
    * the sum of ``nkill`` and of ``nwound``;
    * the first value seen in the group for each column in ``constant_cols``;
    * ``MER_40`` and ``POPGPW_40``: the first value in the group of the
      ``MER<year>_40`` / ``POPGPW_<year>_40`` column, where ``<year>`` is one
      of 1990, 1995, 2000 or 2005, picked from the largest ``year`` in the
      whole cleaned frame (not per group);
    * ``attacked``, set to 1 on every row.

    Returns a DataFrame indexed by ``unique_id``.
    """
    events = clean(df)
    by_uid = events.groupby('unique_id')

    distinct_cols = ['attacktype', 'targettype', 'group_name']
    total_cols = ['nkill', 'nwound']
    constant_cols = ['elevation', 'DIS_LAKE',
                     'DIS_MAJOR_RIVER', 'DIS_OCEAN', 'DIS_RIVER', 'PRECAVNEW80_08', 'TEMPAV_8008',
                     'ethin_div', 'HighRelig', 'ChrCatP', 'ReligCatP', 'year', 'loc_id']

    def first_seen(column):
        # First element of the group's unique values (kept even if it is NaN).
        return by_uid[column].unique().apply(lambda values: values[0])

    # Pick the column vintage from the latest year present anywhere in the data.
    latest_year = events.year.max()
    vintage = 2005
    for candidate, cutoff in ((1990, 1995), (1995, 2000), (2000, 2005)):
        if latest_year < cutoff:
            vintage = candidate
            break

    # (output column name, per-group series), in final column order.
    pieces = [(name, by_uid[name].nunique()) for name in distinct_cols]
    pieces.extend((name, by_uid[name].sum()) for name in total_cols)
    pieces.extend((name, first_seen(name)) for name in constant_cols)
    pieces.append(('MER_40', first_seen('MER{}_40'.format(vintage))))
    pieces.append(('POPGPW_40', first_seen('POPGPW_{}_40'.format(vintage))))

    labels = [label for label, _ in pieces]
    columns = [series for _, series in pieces]
    final_df = pd.concat(columns, axis=1, keys=labels)

    final_df['attacked'] = 1

    return final_df

In [12]:
# Build the per-unique_id summary table from the loaded frame.
final_df = aggregate(df)

In [13]:
# Preview the first 100 rows of the aggregated table.
final_df.head(100)

Unnamed: 0_level_0,attacktype,targettype,group_name,nkill,nwound,elevation,DIS_LAKE,DIS_MAJOR_RIVER,DIS_OCEAN,DIS_RIVER,...,TEMPAV_8008,ethin_div,HighRelig,ChrCatP,ReligCatP,year,loc_id,MER_40,POPGPW_40,attacked
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"(0, 1988)",1,1,1,0.0,0.0,406.5024,741037.0,625854.1,510545.1,406502.4,...,25.683477,1,Christians,95-100%,95-100%,1988,0,0.000628,354.955445,1
"(0, 1989)",1,1,1,53.0,0.0,406.5024,741037.0,625854.1,510545.1,406502.4,...,25.683477,1,Christians,95-100%,95-100%,1989,0,0.000628,354.955445,1
"(0, 1990)",1,1,1,8.0,0.0,406.5024,741037.0,625854.1,510545.1,406502.4,...,25.683477,1,Christians,95-100%,95-100%,1990,0,0.000628,354.955445,1
"(0, 1992)",1,1,1,0.0,0.0,406.5024,741037.0,625854.1,510545.1,406502.4,...,25.683477,1,Christians,95-100%,95-100%,1992,0,0.000628,354.955445,1
"(0, 1994)",1,1,1,8.0,10.0,406.5024,741037.0,625854.1,510545.1,406502.4,...,25.683477,1,Christians,95-100%,95-100%,1994,0,0.000628,354.955445,1
"(1, 1984)",1,1,1,0.0,3.0,373.0099,805545.3,664347.9,373009.8,380116.9,...,27.755316,1,Christians,95-100%,95-100%,1984,1,0.041284,31009.549450,1
"(1, 1985)",1,1,1,0.0,0.0,373.0099,805545.3,664347.9,373009.8,380116.9,...,27.755316,2,Christians,95-100%,95-100%,1985,1,0.041284,31009.549450,1
"(1, 1987)",1,2,1,7.0,0.0,373.0099,805545.3,664347.9,373009.8,380116.9,...,27.755316,1,Christians,95-100%,95-100%,1987,1,0.041284,31009.549450,1
"(1, 1988)",1,1,1,7.0,0.0,373.0099,805545.3,664347.9,373009.8,380116.9,...,27.755316,2,Christians,95-100%,95-100%,1988,1,0.041284,31009.549450,1
"(1, 1989)",2,2,1,16.0,0.0,373.0099,805545.3,664347.9,373009.8,380116.9,...,27.755316,2,Christians,95-100%,95-100%,1989,1,0.041284,31009.549450,1


In [14]:
# Rebind df to its cleaned form; the helper functions below read its loc_id and year columns.
df = clean(df)

In [15]:
def sum_past_k_year_data(df, col, loc_id, year, k):
    """Total of ``col`` at one location over the ``k`` years ending at ``year``.

    Parameters
    ----------
    df : pandas.DataFrame
        Event rows with ``loc_id``, ``year`` and ``col`` columns.
    col : str
        Name of the column to add up.
    loc_id
        Location code to select.
    year : int
        Last year of the window (inclusive).
    k : int
        Window length in years; the window is ``year - k + 1 .. year``.

    Returns
    -------
    scalar
        Sum of ``col`` over the matching rows, missing values skipped.
        0 when no row matches, including a ``loc_id`` that never occurs.
    """
    first_year = year - k + 1
    # Filter the rows directly and sum only `col`. Summing every column of the
    # frame through a groupby is wasted work and fails on datetime columns in
    # pandas versions where groupby sum no longer drops non-numeric columns.
    in_window = (
        (df['loc_id'] == loc_id)
        & (df['year'] >= first_year)
        & (df['year'] <= year)
    )
    return df.loc[in_window, col].sum()


def count_past_k_year_data(df, col, loc_id, year, k):
    """Number of distinct ``col`` values at one location over the ``k`` years ending at ``year``.

    Parameters
    ----------
    df : pandas.DataFrame
        Event rows with ``loc_id``, ``year`` and ``col`` columns.
    col : str
        Name of the column whose distinct values are counted.
    loc_id
        Location code to select.
    year : int
        Last year of the window (inclusive).
    k : int
        Window length in years; the window is ``year - k + 1 .. year``.

    Returns
    -------
    int
        Count of distinct non-missing values of ``col`` in the window.
        0 when no row matches, including a ``loc_id`` that never occurs.
    """
    first_year = year - k + 1
    # A plain row filter replaces the three-level groupby: same distinct
    # values, no dependence on MultiIndex label slicing, and no KeyError for a
    # location that is absent from the frame.
    in_window = (
        (df['loc_id'] == loc_id)
        & (df['year'] >= first_year)
        & (df['year'] <= year)
    )
    # nunique() skips missing values, matching groupby's dropping of NaN keys.
    return df.loc[in_window, col].nunique()

In [16]:
# Spot check: total nkill for loc_id 0 over the 100 years ending in 2015 (1916-2015).
sum_past_k_year_data(df, 'nkill', 0, 2015, 100)

69.0

In [None]:
#count_past_k_year_data(df, 'targsubtype1_txt', 0, 2015, 100)

In [None]:
#k = 5
#arg = [df, 'nkill', row['loc_id'], row['year'], 5]
#final_df['nkill_testing'] = final_df.apply(lambda row : sum_past_k_year_data(df, 'nkill', row['loc_id'], row['year'], 5), axis=1)

In [None]:
# Window length in years, used for both the lookback and the new column names.
k = 5

# Totals over the k years ending at each row's year, per location.
for col in ['nkill', 'nwound']:
    col_name = col + '_past_{}'.format(k)
    final_df[col_name] = final_df.apply(
        lambda row: sum_past_k_year_data(df, col, row['loc_id'], row['year'], k), axis=1)

# Distinct-value counts over the same window.
for col in ['attacktype', 'targettype', 'group_name']:
    col_name = col + '_past_{}'.format(k)
    final_df[col_name] = final_df.apply(
        lambda row: count_past_k_year_data(df, col, row['loc_id'], row['year'], k), axis=1)

In [None]:
# Write the feature table to ./test.csv.
final_df.to_csv('./test.csv')

In [None]:
# Inspect the row at position 86 of df.
df.iloc[86]

In [None]:
# Distinct values of `col` at loc_id 86 for the 5 years ending in 1991 (1987-1991).
# NOTE(review): `col` is not assigned in this cell; it is whatever an earlier cell left bound - confirm the intended column.
count_past_k_year_data(df, col, 86, 1991, 5)

In [None]:
# Row counts per (loc_id, year, col) group, sliced by label.
# NOTE(review): both .loc slices act on the first index level (loc_id), so .loc[:1991] does not restrict the year - confirm intent.
# NOTE(review): `col` is not assigned in this cell; it is whatever an earlier cell left bound.
df.groupby(['loc_id', 'year', col]).count().loc[:87].loc[:1991]

In [None]:
def check(attacked, default):
    """Return ``default``, or ``None`` when ``attacked`` equals 0."""
    return None if attacked == 0 else default
    
# Replace the `col` value with None on rows where attacked == 0; other rows keep their value.
# NOTE(review): `col` is not assigned in this cell; it is whatever an earlier cell left bound - confirm the intended column.
final_df[col] = final_df.apply(lambda row : check(row['attacked'], row[col]), axis = 1)