In [1]:
import pandas as pd
import numpy as np


# stage1 feature merging

In [2]:
# read in features for stage1 (au score)
au_df = pd.read_csv("TrainStage1_AU.csv")
## filter self-reference
au_df = au_df.loc[au_df['Test'] != au_df['Reference']]
## filter 887 ugly article
au_df = au_df.loc[au_df['Test'] != 887]
au_df = au_df.loc[au_df['Reference'] != 887]
au_df

Unnamed: 0,Test,Reference,Similarity
1,1,10,0.000000
2,1,1000,0.000000
3,1,1005,0.000000
4,1,1007,0.000000
5,1,1010,0.142857
...,...,...,...
313594,998,984,0.000000
313595,998,986,0.000000
313596,998,988,0.000000
313597,998,992,0.333333


In [3]:
# read in features for stage1 (jaccard)
jcd_df1 = pd.read_csv("new_data/stage1_keyword_jaccard_similarity_yanyu.csv")
jcd_df2 = pd.read_csv("new_data/stage1_only_keyword_jaccard_similarity_yanyu.csv")
jcd_df = pd.merge(jcd_df1, jcd_df2, on=["Test", "Reference"], how="left")
# rename columns
jcd_df = jcd_df.rename(
    {'Jaccard_Similarity(%)_x': 'jaccard_once_keyword', 'Jaccard_Similarity(%)_y': 'jaccard_multi_keyword'},
    axis=1
)
jcd_df

Unnamed: 0,Test,Reference,jaccard_once_keyword,jaccard_multi_keyword
0,1,1,1.0,1.000000
1,1,10,0.0,0.000000
2,1,1000,0.0,0.000000
3,1,1005,0.0,0.000000
4,1,1007,0.0,0.000000
...,...,...,...,...
313595,998,986,0.0,0.000000
313596,998,988,0.0,0.000000
313597,998,992,1.0,0.181818
313598,998,997,0.0,0.714286


In [4]:
# read in features for stage1 (cosine similarities)
cos_df = pd.read_csv("new_data/results_cos_stage1.csv")
cos_df

Unnamed: 0,Test,Reference,tf_cosine_appearance,tfidf_cosine_appearance,tf_cosine_times,tfidf_cosine_times
0,1,10,0.000000,0.000000,0.000000,0.000000
1,1,1000,0.000000,0.000000,0.000000,0.000000
2,1,1005,0.000000,0.000000,0.000000,0.000000
3,1,1007,0.000000,0.000000,0.000000,0.000000
4,1,1010,0.188982,0.184381,0.158624,0.195430
...,...,...,...,...,...,...
313035,998,984,0.000000,0.000000,0.000000,0.000000
313036,998,986,0.000000,0.000000,0.000000,0.000000
313037,998,988,0.000000,0.000000,0.000000,0.000000
313038,998,992,0.308607,0.238700,0.099049,0.077671


In [5]:
# read in features for stage1 (heimin)
hemin_df = pd.read_csv("new_data/Sim_Hash_Results_stage1_Filtered.csv")
hemin_df

Unnamed: 0,Test,Reference,Sim_Hash_Similarity
0,1,10,0.30
1,1,1000,0.32
2,1,1005,0.27
3,1,1007,0.33
4,1,1010,0.20
...,...,...,...
311917,998,984,0.16
311918,998,986,0.25
311919,998,988,0.22
311920,998,992,0.26


In [6]:
# read in features for stage1 (heimin)
min_hemin_df = pd.read_csv("new_data/Min_Hash_Results_stage1_Filtered.csv")
min_hemin_df

Unnamed: 0,Test,Reference,Min_Similarity
0,1,10,0.109375
1,1,1000,0.000000
2,1,1005,0.046875
3,1,1007,0.046875
4,1,1010,0.109375
...,...,...,...
311917,998,984,0.085938
311918,998,986,0.000000
311919,998,988,0.031250
311920,998,992,0.062500


## merge all together
- **order**: similarity(au_score), jaccard_once, jaccard_multi, tf_cosine_once_keyword, tfidf_cosine_once_keyword, tf_cosine_multi_keyword, tfidf_cosine_multi_keyword, sim_hash

In [7]:
# merge all together
features1_df = pd.merge(au_df, jcd_df, on=["Test", "Reference"], how="left")
features1_df = pd.merge(features1_df, cos_df, on=["Test", "Reference"], how="left")
features1_df = pd.merge(features1_df, hemin_df, on=["Test", "Reference"], how="left")
features1_df = pd.merge(features1_df, min_hemin_df, on=["Test", "Reference"], how="left")
features1_df

Unnamed: 0,Test,Reference,Similarity,jaccard_once_keyword,jaccard_multi_keyword,tf_cosine_appearance,tfidf_cosine_appearance,tf_cosine_times,tfidf_cosine_times,Sim_Hash_Similarity,Min_Similarity
0,1,10,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.30,0.109375
1,1,1000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.32,0.000000
2,1,1005,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.27,0.046875
3,1,1007,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.33,0.046875
4,1,1010,0.142857,0.0,0.100000,0.188982,0.184381,0.158624,0.195430,0.20,0.109375
...,...,...,...,...,...,...,...,...,...,...,...
311917,998,984,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.16,0.085938
311918,998,986,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.25,0.000000
311919,998,988,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.22,0.031250
311920,998,992,0.333333,1.0,0.181818,0.308607,0.238700,0.099049,0.077671,0.26,0.062500


In [43]:
# output only appearance
stage1_app_df = features1_df[["Similarity", "jaccard_once_keyword",
              "tf_cosine_appearance", "tfidf_cosine_appearance",
              "Sim_Hash_Similarity", "Min_Similarity"]]
stage1_app_np = stage1_app_df.to_numpy()
with open("stage1_similarity_of_only_appearance.npy", "wb") as f:
    np.save(f, stage1_app_np)

In [46]:
len(stage1_app_np[0])

6

In [47]:
# output only times
stage1_times_df = features1_df[["Similarity", "jaccard_multi_keyword",
              "tf_cosine_times", "tfidf_cosine_times",
              "Sim_Hash_Similarity", "Min_Similarity"]]
stage1_times_np = stage1_times_df.to_numpy()
with open("stage1_similarity_of_only_times.npy", "wb") as f:
    np.save(f, stage1_times_np)

In [49]:
len(stage1_times_np[0])

6

In [68]:
# output all
all_df = features1_df[['Similarity', 'jaccard_once_keyword',
       'jaccard_multi_keyword', 'tf_cosine_appearance',
       'tfidf_cosine_appearance', 'tf_cosine_times', 'tfidf_cosine_times',
       'Sim_Hash_Similarity', 'Min_Similarity']]
all_np = all_df.to_numpy()
with open("stage1_similarity_feature.npy", "wb") as f:
    np.save(f, all_np)

In [92]:
len(all_np[0])

9

In [8]:
# output all to a csv include test, reference
features1_df.to_csv("stage1_similarity_features.csv", index=False)

In [41]:
# add answer column
ans_df = pd.read_csv("TrainLabel_AU.csv")
ans_df["Relevance"] = 1
stage1_df = pd.merge(features1_df, ans_df[["Test", "Reference", "Relevance"]], how="left", on=["Test","Reference"])
stage1_df["Relevance"] = stage1_df["Relevance"].fillna(0)
stage1_df

Unnamed: 0,Test,Reference,Similarity,jaccard_once_keyword,jaccard_multi_keyword,tf_cosine_once_keyword,tfidf_cosine_once_keyword,tf_cosine_multi_keyword,tfidf_cosine_multi_keyword,Sim_Hash_Similarity,Relevance
0,1,10,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.30,0.0
1,1,1000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.32,0.0
2,1,1005,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.27,0.0
3,1,1007,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.33,0.0
4,1,1010,0.142857,0.071429,0.071429,0.188982,0.000000,0.158624,0.000000,0.20,0.0
...,...,...,...,...,...,...,...,...,...,...,...
311917,998,984,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.16,0.0
311918,998,986,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.25,0.0
311919,998,988,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.22,0.0
311920,998,992,0.333333,0.142857,0.142857,0.308607,0.096121,0.099049,0.024938,0.26,0.0


In [43]:
# store compare csv
stage1_comp_df = stage1_df[["Test", "Reference", "Similarity", "Relevance"]]
#stage1_comp_df.to_csv("Stage1CompareAns.csv", index=False)
stage1_comp_df

Unnamed: 0,Test,Reference,Similarity,Relevance
0,1,10,0.000000,0.0
1,1,1000,0.000000,0.0
2,1,1005,0.000000,0.0
3,1,1007,0.000000,0.0
4,1,1010,0.142857,0.0
...,...,...,...,...
311917,998,984,0.000000,0.0
311918,998,986,0.000000,0.0
311919,998,988,0.000000,0.0
311920,998,992,0.333333,0.0


In [44]:
# extract features
extracted_df = stage1_df.copy()
del extracted_df['Test']
del extracted_df['Reference']
del extracted_df['Relevance']
X1 = extracted_df.to_numpy()
X1

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.3       ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.32      ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.27      ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.22      ],
       [0.33333333, 0.14285714, 0.14285714, ..., 0.09904924, 0.02493781,
        0.26      ],
       [0.83333333, 0.7       , 0.7       , ..., 0.93274356, 0.98369463,
        0.18      ]])

In [46]:
# output features
with open("X1_only_similarities.npy", "wb") as f:
    np.save(f, X1)

In [48]:
# output answers
Y1 = stage1_comp_df.Relevance.values
with open("Y1_only_similarities.npy", "wb") as f:
    np.save(f, Y1)

# stage3 feature merging

In [9]:
# read in features for stage1 (au score)
au2_df = pd.read_csv("TestStage3_AU.csv")
## filter self-reference
au2_df = au2_df.loc[au2_df['Test'] != au2_df['Reference']]
## filter 887 ugly article
au2_df = au2_df.loc[au2_df['Test'] != 887]
au2_df = au2_df.loc[au2_df['Reference'] != 887]
au2_df

Unnamed: 0,Test,Reference,Similarity
0,100,1002,0.133333
1,100,1003,0.066667
2,100,101,0.133333
3,100,1013,0.000000
4,100,1017,0.000000
...,...,...,...
175975,995,981,0.142857
175976,995,982,0.285714
175977,995,983,0.142857
175978,995,989,0.000000


In [10]:
# read in features for stage1 (jaccard)
jcd2_df1 = pd.read_csv("new_data/stage3_keyword_jaccard_similarity_yanyu.csv")
jcd2_df2 = pd.read_csv("new_data/stage3_only_keyword_jaccard_similarity_yanyu.csv")
jcd2_df = pd.merge(jcd2_df1, jcd2_df2, on=["Test", "Reference"], how="left")
# rename columns
jcd2_df = jcd2_df.rename(
    {'Jaccard_Similarity(%)_x': 'jaccard_once_keyword', 'Jaccard_Similarity(%)_y': 'jaccard_multi_keyword'},
    axis=1
)
jcd2_df

Unnamed: 0,Test,Reference,jaccard_once_keyword,jaccard_multi_keyword
0,100,100,1.0,1.000000
1,100,1002,0.1,0.071429
2,100,1003,0.0,0.052632
3,100,101,1.0,0.105263
4,100,1013,0.0,0.000000
...,...,...,...,...
176395,995,982,0.1,0.100000
176396,995,983,0.0,0.100000
176397,995,989,0.0,0.000000
176398,995,99,0.5,0.222222


In [11]:
# read in features for stage1 (cosine similarities)
cos2_df = pd.read_csv("new_data/results_cos_stage3.csv")
cos2_df

Unnamed: 0,Test,Reference,Similarity,tf_cosine_appearance,tfidf_cosine_appearance,tf_cosine_times,tfidf_cosine_times
0,100,1002,0.133333,0.133333,0.038855,0.123603,0.047808
1,100,1003,0.066667,0.115470,0.019133,0.099177,0.018642
2,100,101,0.133333,0.210819,0.106684,0.364363,0.144300
3,100,1013,0.000000,0.000000,0.000000,0.000000,0.000000
4,100,1017,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...
175975,995,981,0.142857,0.094491,0.022252,0.552917,0.210549
175976,995,982,0.285714,0.195180,0.100620,0.222871,0.072412
175977,995,983,0.142857,0.188982,0.055576,0.386444,0.129999
175978,995,989,0.000000,0.000000,0.000000,0.000000,0.000000


In [12]:
# read in features for stage1 (heimin)
hemin2_df = pd.read_csv("new_data/Sim_Hash_Results_stage3_Filtered.csv")
hemin2_df['Sim_Hash_Similarity'] = hemin2_df['Sim_Hash_Similarity']/100
hemin2_df

Unnamed: 0,Test,Reference,Sim_Hash_Similarity
0,100,1002,0.33
1,100,1003,0.26
2,100,101,0.15
3,100,1013,0.32
4,100,1017,0.35
...,...,...,...
175975,995,981,0.14
175976,995,982,0.26
175977,995,983,0.33
175978,995,989,0.22


In [13]:
# read in features for stage1 (min heimin)
min_hemin2_df = pd.read_csv("new_data/Min_Hash_Results_stage3_Filtered.csv")
min_hemin2_df

Unnamed: 0,Test,Reference,Min_Similarity
0,100,1002,0.000000
1,100,1003,0.164062
2,100,101,0.132812
3,100,1013,0.000000
4,100,1017,0.000000
...,...,...,...
175975,995,981,0.445312
175976,995,982,0.000000
175977,995,983,0.093750
175978,995,989,0.078125


## merge all together
- **order**: similarity(au_score), jaccard_once, jaccard_multi, tf_cosine_once_keyword, tfidf_cosine_once_keyword, tf_cosine_multi_keyword, tfidf_cosine_multi_keyword, sim_hash

In [14]:
# merge all together
features2_df = pd.merge(au2_df, jcd2_df, on=["Test", "Reference"], how="left")
features2_df = pd.merge(features2_df, cos2_df, on=["Test", "Reference", "Similarity"], how="left")
features2_df = pd.merge(features2_df, hemin2_df, on=["Test", "Reference"], how="left")
features2_df = pd.merge(features2_df, min_hemin2_df, on=["Test", "Reference"], how="left")
features2_df

Unnamed: 0,Test,Reference,Similarity,jaccard_once_keyword,jaccard_multi_keyword,tf_cosine_appearance,tfidf_cosine_appearance,tf_cosine_times,tfidf_cosine_times,Sim_Hash_Similarity,Min_Similarity
0,100,1002,0.133333,0.1,0.071429,0.133333,0.038855,0.123603,0.047808,0.33,0.000000
1,100,1003,0.066667,0.0,0.052632,0.115470,0.019133,0.099177,0.018642,0.26,0.164062
2,100,101,0.133333,1.0,0.105263,0.210819,0.106684,0.364363,0.144300,0.15,0.132812
3,100,1013,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.32,0.000000
4,100,1017,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.35,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
175975,995,981,0.142857,0.0,0.045455,0.094491,0.022252,0.552917,0.210549,0.14,0.445312
175976,995,982,0.285714,0.1,0.100000,0.195180,0.100620,0.222871,0.072412,0.26,0.000000
175977,995,983,0.142857,0.0,0.100000,0.188982,0.055576,0.386444,0.129999,0.33,0.093750
175978,995,989,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.22,0.078125


In [61]:
# output only appearance
stage2_app_df = features2_df[["Similarity", "jaccard_once_keyword",
              "tf_cosine_appearance", "tfidf_cosine_appearance",
              "Sim_Hash_Similarity", "Min_Similarity"]]
stage2_app_np = stage2_app_df.to_numpy()
with open("stage3_similarity_of_only_appearance.npy", "wb") as f:
    np.save(f, stage2_app_np)

In [96]:
len(stage2_app_np)

175980

In [63]:
# output only times
stage2_times_df = features2_df[["Similarity", "jaccard_multi_keyword",
              "tf_cosine_times", "tfidf_cosine_times",
              "Sim_Hash_Similarity", "Min_Similarity"]]
stage2_times_np = stage2_times_df.to_numpy()
with open("stage3_similarity_of_only_times.npy", "wb") as f:
    np.save(f, stage2_times_np)

In [93]:
len(stage2_times_np[0])

6

In [65]:
features2_df.columns.values

array(['Test', 'Reference', 'Similarity', 'jaccard_once_keyword',
       'jaccard_multi_keyword', 'tf_cosine_appearance',
       'tfidf_cosine_appearance', 'tf_cosine_times', 'tfidf_cosine_times',
       'Sim_Hash_Similarity', 'Min_Similarity'], dtype=object)

In [66]:
# output all
all3_df = features2_df[['Similarity', 'jaccard_once_keyword',
       'jaccard_multi_keyword', 'tf_cosine_appearance',
       'tfidf_cosine_appearance', 'tf_cosine_times', 'tfidf_cosine_times',
       'Sim_Hash_Similarity', 'Min_Similarity']]
all3_np = all3_df.to_numpy()
with open("stage3_similarity_feature.npy", "wb") as f:
    np.save(f, all3_np)

In [95]:
len(all3_np[0])

9

In [15]:
# output all to a csv include test, reference
features2_df.to_csv("stage3_similarity_features.csv", index=False)

In [None]:
# output only appearance
stage1_app_df = features1_df[["Similarity", "jaccard_once_keyword",
              "tf_cosine_appearance", "tfidf_cosine_appearance",
              "Sim_Hash_Similarity", "Min_Similarity"]]
stage1_app_np = stage1_app_df.to_numpy()
with open("stage1_similarity_of_only_appearance.npy", "wb") as f:
    np.save(f, stage1_app_np)

len(stage1_app_np[0])

# output only times
stage1_times_df = features1_df[["Similarity", "jaccard_multi_keyword",
              "tf_cosine_times", "tfidf_cosine_times",
              "Sim_Hash_Similarity", "Min_Similarity"]]
stage1_times_np = stage1_times_df.to_numpy()
with open("stage1_similarity_of_only_times.npy", "wb") as f:
    np.save(f, stage1_times_np)

len(stage1_times_np[0])

In [57]:
# store compare csv
stage2_df = features2_df
stage2_comp_df = stage2_df[["Test", "Reference", "Similarity"]]
#stage2_comp_df.to_csv("Stage2CompareAns.csv", index=False)
stage2_comp_df

Unnamed: 0,Test,Reference,Similarity
0,1001,1004,0.00
1,1001,1006,0.00
2,1001,1008,0.25
3,1001,1009,0.25
4,1001,1012,0.00
...,...,...,...
176815,999,990,0.00
176816,999,991,0.00
176817,999,993,0.00
176818,999,994,0.00


In [81]:
# extract features
extracted2_df = stage2_df.copy()
del extracted2_df['Test']
del extracted2_df['Reference']
X2 = extracted2_df.to_numpy()
X2

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.3       ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.31      ],
       [0.25      , 0.09090909, 0.09090909, ..., 0.1790513 , 0.10848065,
        0.27      ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.31      ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.21      ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.3       ]])

In [82]:
# output features
with open("X2_only_similarities.npy", "wb") as f:
    np.save(f, X2)

## Add on article tf-vectors (both appearance and times)

### stage 1

In [72]:
appearance_vects = np.load("../Labeling/result_train_appearance.npy")
times_vects = np.load("../Labeling/result_train_times.npy")
train_vects = np.concatenate((appearance_vects, times_vects), axis=1)
train_vects

array([[1, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [73]:
# check the length to see concatenate works or not
len(train_vects[0])

1528

In [74]:
# delete data 887
train_vects = np.delete(train_vects, 514, 0)

In [75]:
# read in article_index
article_index = pd.read_csv("stage1.csv").Num
article_index = article_index[article_index != 887]
article_index = article_index.reset_index()
del article_index['index']
article_index

Unnamed: 0,Num
0,1
1,10
2,1000
3,1005
4,1007
...,...
554,986
555,988
556,992
557,997


In [76]:
# transfer to pd dataframe & add test labels on
train_vect_df = pd.DataFrame(train_vects)
train_vect_df = train_vect_df.reset_index()
del train_vect_df['index']
train_vect_df['Test'] = article_index
train_vect_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1519,1520,1521,1522,1523,1524,1525,1526,1527,Test
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,10
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1000
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1005
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
554,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,986
555,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,988
556,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,992
557,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,997


In [77]:
# ref df
ref_vect_df = train_vect_df.copy()
train_vect_df['key'] = 0
ref_vect_df['key'] = 0
stage1_vector_df = train_vect_df.merge(ref_vect_df, on="key", how="outer")
del stage1_vector_df['key']
stage1_vector_df = stage1_vector_df.rename({"Test_x": "Test", "Test_y": "Reference"}, axis=1)
# remove self-reference
stage1_vector_df = stage1_vector_df.loc[stage1_vector_df['Test'] != stage1_vector_df['Reference']]
stage1_vector_df

Unnamed: 0,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,9_x,...,1519_y,1520_y,1521_y,1522_y,1523_y,1524_y,1525_y,1526_y,1527_y,Reference
1,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,10
2,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1000
3,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1005
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1007
5,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
312475,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,984
312476,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,986
312477,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,988
312478,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,992


In [None]:
stage1_vector_df

In [30]:
# join similarity features with article vectors
all_feat_df = pd.merge(features1_df, stage1_vector_df, on=["Test", "Reference"], how="inner")
all_feat_df

Unnamed: 0,Test,Reference,Similarity,jaccard_once_keyword,jaccard_multi_keyword,tf_cosine_once_keyword,tfidf_cosine_once_keyword,tf_cosine_multi_keyword,tfidf_cosine_multi_keyword,Sim_Hash_Similarity,...,1518_y,1519_y,1520_y,1521_y,1522_y,1523_y,1524_y,1525_y,1526_y,1527_y
0,1,10,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.30,...,0,0,0,0,0,0,0,0,0,0
1,1,1000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.32,...,0,0,0,0,0,0,0,0,0,0
2,1,1005,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.27,...,0,0,0,0,0,0,0,0,0,0
3,1,1007,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.33,...,0,0,0,0,0,0,0,0,0,0
4,1,1010,0.142857,0.071429,0.071429,0.188982,0.000000,0.158624,0.000000,0.20,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
311917,998,984,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.16,...,0,0,0,0,0,0,0,0,0,0
311918,998,986,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.25,...,0,0,0,0,0,0,0,0,0,0
311919,998,988,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.22,...,0,0,0,0,0,0,0,0,0,0
311920,998,992,0.333333,0.142857,0.142857,0.308607,0.096121,0.099049,0.024938,0.26,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# change to numpy array
del all_feat_df["Test"]
del all_feat_df["Reference"]
stage1_all_arr = all_feat_df.to_numpy()
stage1_all_arr

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.33333333, 0.14285714, 0.14285714, ..., 0.        , 0.        ,
        0.        ],
       [0.83333333, 0.7       , 0.7       , ..., 0.        , 0.        ,
        0.        ]])

In [71]:
stage1_all_arr

NameError: name 'stage1_all_arr' is not defined

In [32]:
# output to file
X1 = stage1_all_arr
with open("X1_appearance_times.npy", "wb") as f:
    np.save(f, X1)

In [33]:
len(X1)

311922

### stage2

In [34]:
appearance2_vects = np.load("../Labeling/result_test_appearance.npy")
times2_vects = np.load("../Labeling/result_test_times.npy")
test_vects = np.concatenate((appearance2_vects, times2_vects), axis=1)
test_vects

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [43]:
# check the length to see concatenate works or not
len(test_vects[0])

1528

In [36]:
# read in article_index
article2_index = pd.read_csv("stage2.csv").Num
article2_index

0      1001
1      1004
2      1006
3      1008
4      1009
       ... 
416     991
417     993
418     994
419     996
420     999
Name: Num, Length: 421, dtype: int64

In [37]:
# transfer to pd dataframe & add test labels on
test_vect_df = pd.DataFrame(test_vects)
test_vect_df = test_vect_df.reset_index()
del test_vect_df['index']
test_vect_df['Test'] = article2_index
test_vect_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1519,1520,1521,1522,1523,1524,1525,1526,1527,Test
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1001
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1004
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1006
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1008
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,991
417,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,993
418,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,994
419,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,996


In [38]:
# ref df
ref_vect2_df = test_vect_df.copy()
test_vect_df['key'] = 0
ref_vect2_df['key'] = 0
stage2_vector_df = test_vect_df.merge(ref_vect2_df, on="key", how="outer")
del stage2_vector_df['key']
stage2_vector_df = stage2_vector_df.rename({"Test_x": "Test", "Test_y": "Reference"}, axis=1)
# remove self-reference
stage2_vector_df = stage2_vector_df.loc[stage2_vector_df['Test'] != stage2_vector_df['Reference']]
stage2_vector_df

Unnamed: 0,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,9_x,...,1519_y,1520_y,1521_y,1522_y,1523_y,1524_y,1525_y,1526_y,1527_y,Reference
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1004
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1006
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1008
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1009
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177235,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,990
177236,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,991
177237,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,993
177238,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,994


In [39]:
# join similarity features with article vectors
all_feat2_df = pd.merge(features2_df, stage2_vector_df, on=["Test", "Reference"], how="inner")
all_feat2_df

Unnamed: 0,Test,Reference,Similarity,jaccard_once_keyword,jaccard_multi_keyword,tf_cosine_once_keyword,tfidf_cosine_once_keyword,tf_cosine_multi_keyword,tfidf_cosine_multi_keyword,Sim_Hash_Similarity,...,1518_y,1519_y,1520_y,1521_y,1522_y,1523_y,1524_y,1525_y,1526_y,1527_y
0,1001,1004,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.30,...,0,0,0,0,0,0,0,0,0,0
1,1001,1006,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.31,...,0,0,0,0,0,0,0,0,0,0
2,1001,1008,0.25,0.090909,0.090909,0.250000,0.061200,0.179051,0.108481,0.27,...,0,0,0,0,0,0,0,0,0,0
3,1001,1009,0.25,0.166667,0.166667,0.288675,0.119242,0.231800,0.106533,0.27,...,0,0,0,0,0,0,0,0,0,0
4,1001,1012,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.29,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176815,999,990,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.34,...,0,0,0,0,0,0,0,0,0,0
176816,999,991,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.34,...,0,0,0,0,0,0,0,0,0,0
176817,999,993,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.31,...,0,0,0,0,0,0,0,0,0,0
176818,999,994,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.21,...,0,0,0,0,0,0,0,0,0,0


In [40]:
# change to numpy array
del all_feat2_df["Test"]
del all_feat2_df["Reference"]
stage2_all_arr = all_feat2_df.to_numpy()
stage2_all_arr

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.25      , 0.09090909, 0.09090909, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [41]:
# output to file
X2 = stage2_all_arr
with open("X2_appearance_times.npy", "wb") as f:
    np.save(f, X2)

## Only appearance

### stage 1

In [18]:
train_vects = np.load("../Labeling/result_train_times.npy")
train_vects

array([[1, 2, 4, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [19]:
# delete data 887
train_vects = np.delete(train_vects, 514, 0)

In [20]:
# read in article_index
article_index = pd.read_csv("stage1.csv").Num
article_index = article_index[article_index != 887]
article_index = article_index.reset_index()
del article_index['index']
article_index

Unnamed: 0,Num
0,1
1,10
2,1000
3,1005
4,1007
...,...
554,986
555,988
556,992
557,997


In [21]:
# transfer to pd dataframe & add test labels on
train_vect_df = pd.DataFrame(train_vects)
train_vect_df = train_vect_df.reset_index()
del train_vect_df['index']
train_vect_df['Test'] = article_index
train_vect_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,755,756,757,758,759,760,761,762,763,Test
0,1,2,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,10
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1000
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1005
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
554,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,986
555,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,988
556,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,992
557,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,997


In [22]:
# ref df
ref_vect_df = train_vect_df.copy()
train_vect_df['key'] = 0
ref_vect_df['key'] = 0
stage1_vector_df = train_vect_df.merge(ref_vect_df, on="key", how="outer")
del stage1_vector_df['key']
stage1_vector_df = stage1_vector_df.rename({"Test_x": "Test", "Test_y": "Reference"}, axis=1)
# remove self-reference
stage1_vector_df = stage1_vector_df.loc[stage1_vector_df['Test'] != stage1_vector_df['Reference']]
stage1_vector_df

Unnamed: 0,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,9_x,...,755_y,756_y,757_y,758_y,759_y,760_y,761_y,762_y,763_y,Reference
1,1,2,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,10
2,1,2,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1000
3,1,2,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1005
4,1,2,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1007
5,1,2,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
312475,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,984
312476,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,986
312477,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,988
312478,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,992


In [23]:
# output the order_df
order_df = stage1_vector_df[["Test", "Reference"]]
order_df.to_csv("stage1_order.csv", index=False)

In [24]:
# delete test & reference  and save the article vectors into numpy
del order_df["Test"]
del order_df["Reference"]
stage1_vector_np = stage1_vector_df.to_numpy()
with open("stage1_times_feature.npy", "wb") as f:
    np.save(f, stage1_vector_np)

### stage 3

In [26]:
stage3_vects = np.load("../Labeling/result_stage3_times.npy")
stage3_vects

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [27]:
# read in article_index
article_index = pd.read_csv("../Labeling/stage3_keyword.csv").Num
article_index

0       100
1      1002
2      1003
3       101
4      1013
       ... 
415     982
416     983
417     989
418      99
419     995
Name: Num, Length: 420, dtype: int64

In [28]:
# transfer to pd dataframe & add test labels on
stage3_vect_df = pd.DataFrame(stage3_vects)
stage3_vect_df['Test'] = article_index
stage3_vect_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,755,756,757,758,759,760,761,762,763,Test
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,100
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1002
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1003
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,101
4,0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
415,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,982
416,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,983
417,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,989
418,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,99


In [29]:
# ref df
ref3_vect_df = stage3_vect_df.copy()
stage3_vect_df['key'] = 0
ref3_vect_df['key'] = 0
stage3_vector_df = stage3_vect_df.merge(ref3_vect_df, on="key", how="outer")
del stage3_vector_df['key']
stage3_vector_df = stage3_vector_df.rename({"Test_x": "Test", "Test_y": "Reference"}, axis=1)
# remove self-reference
stage3_vector_df = stage3_vector_df.loc[stage3_vector_df['Test'] != stage3_vector_df['Reference']]
stage3_vector_df

Unnamed: 0,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,9_x,...,755_y,756_y,757_y,758_y,759_y,760_y,761_y,762_y,763_y,Reference
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1002
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1003
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,101
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1013
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176394,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,981
176395,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,982
176396,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,983
176397,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,989


In [30]:
# output the order_df
order3_df = stage3_vector_df[["Test", "Reference"]]
order3_df.to_csv("stage3_order.csv", index=False)

In [31]:
# delete test & reference  and save the article vectors into numpy
del order3_df["Test"]
del order3_df["Reference"]
stage3_vector_np = stage3_vector_df.to_numpy()
with open("stage3_times_feature.npy", "wb") as f:
    np.save(f, stage3_vector_np)

In [88]:
gan = np.load("stage1_times_feature.npy")

In [89]:
gan[:, 1529:1530]

array([[  10],
       [1000],
       [1005],
       ...,
       [ 988],
       [ 992],
       [ 997]], dtype=int64)

In [90]:
gan[:, 764:765]

array([[  1],
       [  1],
       [  1],
       ...,
       [998],
       [998],
       [998]], dtype=int64)