In [1]:
import numpy as np
import os
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
# import seaborn as sns
import pickle
import time
import gc
from tqdm import tqdm, tqdm_notebook

%matplotlib inline

#每次可以输出多个变量
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from pylab import rcParams
rcParams['figure.figsize'] = 14, 6

import warnings
warnings.filterwarnings("ignore")

#中文字体
import matplotlib
matplotlib.use('qt4agg')
#指定默认字体
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
#解决负号'-'显示为方块的问题
matplotlib.rcParams['axes.unicode_minus'] = False

In [2]:
df = pd.read_csv('./feature/df_preprocess.csv', encoding='gbk')
df.shape

(114183, 2734)

In [3]:
# 列分类
zeroone_col = ['extended', 'crit1', 'crit2', 'crit3', 'success', 'suicide', 'individual']
onehot_col = ['specificity', 'country', 'region', 'vicinity', 'doubtterr', 'multiple',
              'attacktype1', 'targtype1', 'targsubtype1', 'guncertain1', 'weaptype1', 'weapsubtype1', 'property', 'propextent', 
              'ishostkid', 'dbsource', 'natlty1', 'INT_LOG', 'INT_MISC', 'INT_ANY', 'INT_IDEO', 'claimed']
many_onehot_col = ['corp1', 'target1', 'provstate', 'city', 'weapdetail']
txt_col = ['location', 'summary', 'scite1', 'scite2', 'scite3', 'motive', 'propcomment']
cont_col = ['iyear', 'imonth', 'iday', 'latitude', 'longitude', 'nperps', 'nperpcap', 'nkill', 'nkillus', 'nkillter', 'nwound', 
            'nwoundus', 'nwoundte']

full_col = zeroone_col+onehot_col+many_onehot_col+txt_col+cont_col
len(full_col)

54

In [4]:
from collections import Counter

In [5]:
target_most = [i[0] for i in sorted(Counter(df.gname).items(), key=lambda x:x[1], reverse=True)[:20]]

In [6]:
df.gname = df.gname.apply(lambda x: x if x in target_most else 'Small')

In [7]:
def encode_feature(values):
    uniq = values.unique()
    mapping = dict(zip(uniq,range(1,len(uniq) + 1)))
    return values.map(mapping), mapping

df['gname'], mapping = encode_feature(df['gname'])

for i in tqdm_notebook(many_onehot_col):
    df[i], _ = encode_feature(df[i])

A Jupyter Widget




In [8]:
xlsx = [201701090031,201702210037,201703120023,201705050009,201705050010,201707010028,201707020006,201708110018,201711010006,201712010003]

In [9]:
flag = (((df.iyear == 2015) | (df.iyear == 2016)) & (df.gname == mapping['Unknown'])) | (df.eventid.isin(xlsx))

train = df[~flag]
test = df[flag]

train.shape
test.shape

(101807, 2734)

(12376, 2734)

## lgb

In [10]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_val, y_train, y_val = train_test_split(train, train.gname, test_size=0.2, random_state=2018)

X_train.shape
X_val.shape

(81445, 2734)

(20362, 2734)

In [12]:
len(X_train.gname.unique())
len(X_val.gname.unique())

21

21

In [13]:
train_data = lgb.Dataset(X_train.drop(['gname'], axis=1), X_train['gname'], categorical_feature=many_onehot_col)
val_data = lgb.Dataset(X_val.drop(['gname'], axis=1), X_val['gname'], categorical_feature=many_onehot_col)

cv_params =  {
        'objective': 'multiclass',
        'num_class': 22,
        'min_data_in_leaf': 30,
        'num_leaves': 127,
        'max_depth': 7,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.9,
        'seed': 2018
        }

bst = lgb.train(cv_params, train_data, early_stopping_rounds=5, verbose_eval=10, valid_sets=[train_data, val_data], 
                valid_names=['train', 'val'], num_boost_round=10000)

Training until validation scores don't improve for 5 rounds.
[10]	train's multi_logloss: 1.26275	val's multi_logloss: 1.27981
[20]	train's multi_logloss: 0.754536	val's multi_logloss: 0.779803
[30]	train's multi_logloss: 0.483503	val's multi_logloss: 0.514144
[40]	train's multi_logloss: 0.322462	val's multi_logloss: 0.35714
[50]	train's multi_logloss: 0.2218	val's multi_logloss: 0.260637
[60]	train's multi_logloss: 0.157085	val's multi_logloss: 0.199736
[70]	train's multi_logloss: 0.114458	val's multi_logloss: 0.160666
[80]	train's multi_logloss: 0.0853311	val's multi_logloss: 0.134975
[90]	train's multi_logloss: 0.0652822	val's multi_logloss: 0.117923
[100]	train's multi_logloss: 0.050858	val's multi_logloss: 0.10579
[110]	train's multi_logloss: 0.0403591	val's multi_logloss: 0.0973977
[120]	train's multi_logloss: 0.0324974	val's multi_logloss: 0.0913989
[130]	train's multi_logloss: 0.0263911	val's multi_logloss: 0.0870002
[140]	train's multi_logloss: 0.0216304	val's multi_logloss: 0.

In [14]:
pred = bst.predict(test.drop(['gname'], axis=1))

In [15]:
import heapq

In [16]:
res = []

for i in tqdm_notebook(range(test.shape[0])):
    tmp_list = []
    tmp = heapq.nlargest(7, pred[i])
    for a in tmp:
        tmp_list.append((np.where(pred[i] == a))[0][0])
    res.append(tmp_list)

A Jupyter Widget




In [17]:
res2 = []
for i in res:
    i.remove(2)
    i.remove(1)
    res2.append(i)

In [18]:
mapping

{'Abu Sayyaf Group (ASG)': 5,
 'Al-Qaida in Iraq': 13,
 'Al-Qaida in the Arabian Peninsula (AQAP)': 14,
 'Al-Shabaab': 18,
 'Boko Haram': 19,
 'Communist Party of India - Maoist (CPI-Maoist)': 15,
 "Donetsk People's Republic": 21,
 'Fulani extremists': 12,
 'Houthi extremists (Ansar Allah)': 16,
 'Islamic State of Iraq and the Levant (ISIL)': 20,
 "Kurdistan Workers' Party (PKK)": 6,
 'Liberation Tigers of Tamil Eelam (LTTE)': 3,
 'Maoists': 10,
 'Muslim extremists': 8,
 'National Liberation Army of Colombia (ELN)': 7,
 "New People's Army (NPA)": 9,
 'Revolutionary Armed Forces of Colombia (FARC)': 4,
 'Small': 1,
 'Taliban': 11,
 'Tehrik-i-Taliban Pakistan (TTP)': 17,
 'Unknown': 2}

In [19]:
result = test[['eventid']]

for idx in range(5):
    result['pred' + str(idx)] = [i[idx] for i in res2]
    result['pred_name_' + str(idx)] = result['pred' + str(idx)].map(dict(zip(mapping.values(), mapping.keys())))
    result = result.drop(['pred' + str(idx)], axis=1)

In [22]:
result.tail(10)

Unnamed: 0,eventid,pred_name_0,pred_name_1,pred_name_2,pred_name_3,pred_name_4
103533,201701090031,Muslim extremists,Al-Qaida in Iraq,Islamic State of Iraq and the Levant (ISIL),Taliban,Kurdistan Workers' Party (PKK)
104775,201702210037,Muslim extremists,Taliban,Maoists,Islamic State of Iraq and the Levant (ISIL),Boko Haram
105390,201703120023,Muslim extremists,Al-Shabaab,Boko Haram,Communist Party of India - Maoist (CPI-Maoist),Islamic State of Iraq and the Levant (ISIL)
106967,201705050009,Muslim extremists,Al-Shabaab,Boko Haram,Houthi extremists (Ansar Allah),Islamic State of Iraq and the Levant (ISIL)
106968,201705050010,Muslim extremists,Al-Shabaab,Boko Haram,Houthi extremists (Ansar Allah),Islamic State of Iraq and the Levant (ISIL)
109039,201707010028,Muslim extremists,Fulani extremists,Al-Shabaab,Boko Haram,Islamic State of Iraq and the Levant (ISIL)
109049,201707020006,Muslim extremists,Islamic State of Iraq and the Levant (ISIL),Taliban,New People's Army (NPA),Donetsk People's Republic
110333,201708110018,Muslim extremists,Taliban,Maoists,Kurdistan Workers' Party (PKK),Islamic State of Iraq and the Levant (ISIL)
112632,201711010006,Taliban,Muslim extremists,Al-Qaida in Iraq,Maoists,Islamic State of Iraq and the Levant (ISIL)
113434,201712010003,Muslim extremists,Al-Qaida in Iraq,Islamic State of Iraq and the Levant (ISIL),Maoists,Kurdistan Workers' Party (PKK)


In [21]:
result.to_csv('./result/result_new.csv', index=None)

## KMeans

In [10]:
xlsx = [200108110012,200511180002,200901170021,201402110015,201405010071,201411070002,201412160041,201508010015,201705080012]

In [11]:
from sklearn.cluster import KMeans

In [12]:
df = df.fillna(1)

In [13]:
df = df.iloc[:, :641]

In [14]:
kmeans = KMeans(n_clusters=5, random_state=2018).fit(df.iloc[:, 2:])

In [20]:
kmeans.cluster_centers_

array([[6.60411136e+00, 1.56465863e+01, 1.13879246e-01, ...,
        5.23886490e-16, 8.21999864e-01, 1.78000136e-01],
       [6.33641312e+00, 1.54249522e+01, 4.01647786e-02, ...,
        2.31572753e-02, 8.18625864e-01, 1.58216860e-01],
       [6.41054969e+00, 1.55215351e+01, 5.26646556e-02, ...,
        2.66465561e-02, 8.46279537e-01, 1.27073906e-01],
       [6.57817526e+00, 1.55350671e+01, 6.71138307e-02, ...,
        4.31474072e-04, 8.33176434e-01, 1.66392092e-01],
       [6.53973366e+00, 1.59069062e+01, 5.75410344e-02, ...,
        1.19348975e-15, 7.90151750e-01, 2.09848250e-01]])

In [29]:
distance = []

for i in tqdm_notebook(range(df.shape[0])):
    distance.append(np.linalg.norm(kmeans.cluster_centers_[int(df.iloc[0].cluster)] - np.array(df.iloc[i, 2:-1].tolist())))

A Jupyter Widget




In [32]:
distance[:100]

[6308.96390733411,
 6303.2649098793445,
 6301.790440831164,
 6299.845069141208,
 6299.256082956245,
 6297.0838119275295,
 6295.946237416021,
 6294.309420802653,
 6293.424098862097,
 6290.61758959156,
 6290.538317875247,
 6292.994700337289,
 6289.810936066146,
 6286.396934366976,
 6285.327832489607,
 6284.470600385504,
 6284.21568434819,
 6283.058005301587,
 6279.135118050135,
 6280.842786409109,
 6278.821714509279,
 6276.487863124484,
 6276.396521359455,
 6273.51581962813,
 6285.166802285402,
 6274.520303580547,
 6276.117361100859,
 6278.003314762325,
 6267.892891896886,
 6266.718736393361,
 6266.628846119954,
 6269.179140316924,
 6266.668925303042,
 6266.248833747433,
 6261.174516062015,
 6258.258768443418,
 6257.761112841103,
 6271.958468971917,
 6254.952603784353,
 6259.0719506084615,
 6250.75490717021,
 6250.452504636571,
 6262.274897399308,
 6248.1779523295045,
 6258.280187650244,
 6246.372303408061,
 6248.042975428373,
 6253.713586799941,
 6252.057638660578,
 6250.632555273392,
 

In [34]:
df['cluster'] = kmeans.labels_
df['distance'] = distance

In [17]:
Counter(kmeans.labels_)

Counter({0: 14688, 1: 33986, 2: 23869, 3: 25495, 4: 16145})

In [39]:
df[['eventid', 'distance']][df.cluster == 2].sort_values(by=['distance']).head()

Unnamed: 0,eventid,distance
11337,200508240003,6703.531972
11277,200508150010,6705.647316
11324,200508210010,6706.483977
11242,200508110010,6784.298931
11412,200509070004,6793.094695


In [18]:
df[df.eventid.isin(xlsx)][['eventid', 'cluster']]

Unnamed: 0,eventid,cluster
5139,200108110012,1
11700,200511180002,2
22894,200901170021,2
59671,201402110015,1
63638,201405010071,4
72622,201411070002,3
74130,201412160041,3
83767,201508010015,1
107031,201705080012,3


In [25]:
df[['eventid', 'cluster']].to_csv('./result/result_cluster_sample.csv', index=None)