# Using Clustering Features

using output of 2020_10_31_clustering_on_unique_word_embedings and based on 2020_10_19_further_svm_tuning

## Imports and preprocessing

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
from tqdm import tqdm
train_dset_df = pd.read_csv("2020_10_19_train_dset_df_nostem_nostoprem.csv")
test_dset_df = pd.read_csv("2020_10_19_test_dset_df_nostem_nostoprem.csv")
train_dset_df["preprocessed_joined"].fillna("", inplace=True)
test_dset_df["preprocessed_joined"].fillna("", inplace=True)

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorizer.fit(train_dset_df["preprocessed_joined"])
sparse_train_x = vectorizer.transform(train_dset_df["preprocessed_joined"])
sparse_test_x  = vectorizer.transform(test_dset_df["preprocessed_joined"])
train_dset_y = train_dset_df["target"].to_numpy()



## Adding the clustering features

In [2]:
N_CLUSTERS = 100 
TRAIN_N_DATA_POINTS = sparse_train_x.shape[0]

In [3]:
CLUSTERS_FOLDER = "./outputs/2020_11_3_agglo_groups/"
FILE_STRING = "2020_11_3_agglo_group_"

In [4]:
group_strings = []
for i in range(100):
    with open(CLUSTERS_FOLDER + FILE_STRING + str(i) +".txt") as file_handler:
        group_strings.append(set([word[:-1] for word in file_handler.readlines()]))


In [5]:
print(group_strings[-1])

{'honest', 'interpreting', 'inform', 'clarify', 'circumstantial', 'contrary', 'regard', 'reprint', 'telephone', 'advises', 'associated', 'assertion', 'contradict', 'detail', 'guardian', 'proof', 'understood', 'interpol', 'conversations', 'headline', 'pointed', 'press', 'mistake', 'journalists', 'page', 'corroboration', 'confidential', 'daily', 'documentation', 'explanation', 'legal', 'please', 'bulletin', 'reported', 'strongly', 'agency', 'published', 'asking', 'corroborated', 'advice', 'scrutiny', 'understand', 'observer', 'commentary', 'arguments', 'consulate', 'cited', 'memo', 'call', 'mail', 'newspaper', 'incontrovertible', 'message', 'briefing', 'opinion', 'showing', 'disclosure', 'talked', 'contacted', 'wrong', 'changing', 'text', 'verify', 'authenticity', 'listen', 'observed', 'subject', 'quoting', 'columns', 'magazine', 'confirm', 'hears', 'relating', 'ask', 'exculpatory', 'interpretation', 'liked', 'opinions', 'thank', 'dossier', 'truly', 'attributed', 'interview', 'speakers',

In [6]:
extra_features = np.zeros((TRAIN_N_DATA_POINTS, N_CLUSTERS))

In [7]:
train_dset_df.head()

Unnamed: 0,qid,target,preprocessed_joined
0,6f47b0f60633c2056455,0,how can i reply to this comment india be poor ...
1,d49b3966070b27bf07fc,0,what do they use for transportation in ancient...
2,6d5faa49380557c8ca7b,0,what be the most important provision of obama ...
3,cebea75faa47388edcf5,0,at what age do most finns master english today
4,2a7b76a679cadb0a016e,0,what be cheap place to live in india for one m...


In [8]:
for i in tqdm(range(100)):
    for j in (range(len(train_dset_df))):
        sentence_list = train_dset_df["preprocessed_joined"][j].split()
        for word in sentence_list:
            if word in group_strings[i]:
                extra_features[j][i] += 1


100%|██████████| 100/100 [08:27<00:00,  5.07s/it]


In [14]:
from scipy.sparse import csr

In [15]:
sparse_extra_features = csr.csr_matrix(extra_features)

In [16]:
sparse_train_x

<783673x54972 sparse matrix of type '<class 'numpy.int64'>'
	with 8974499 stored elements in Compressed Sparse Row format>

In [17]:
from scipy.sparse import hstack

In [18]:
new_sparse_train_x = hstack((sparse_train_x, sparse_extra_features))

In [19]:
new_sparse_train_x

<783673x55072 sparse matrix of type '<class 'numpy.float64'>'
	with 12556371 stored elements in COOrdinate format>

In [44]:
sparse_train_x=new_sparse_train_x

## LinearSVC

In [45]:
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score


In [46]:
svm = LinearSVC(penalty="l2",dual=True,class_weight={0:1,1:3.35}, C=0.0125)
svm.fit(sparse_train_x, train_dset_y)

LinearSVC(C=0.0125, class_weight={0: 1, 1: 3.35})

In [47]:
train_dset_yhat = svm.predict(sparse_train_x)
f1_score(train_dset_y, train_dset_yhat)

0.6510702983833189

In [48]:
train_dset_df["yhat"] = train_dset_yhat
wrongs = train_dset_df[train_dset_df["target"] != train_dset_df["yhat"]]
print(len(wrongs.groupby(by="target").get_group(0)),len(wrongs.groupby(by="target").get_group(1)))
print(len(train_dset_df.groupby(by="target").get_group(0)), len(train_dset_df.groupby(by="target").get_group(1)))

22096 14401
735222 48451


## Cross validation

In [49]:
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score
import gc

In [50]:
gc.collect()

15219

In [51]:
kfcv = KFold(n_splits=10, shuffle=True)
train_f1_scores = []
test_f1_scores = []

In [52]:
i=0
for train_index, test_index in kfcv.split(sparse_train_x):
    print("TRAINING AGAIN.", i)
    i+=1
    # print("TRAIN:", train_index, "TEST:", test_index)
    x_train, x_test = sparse_train_x[train_index], sparse_train_x[test_index]
    y_train, y_test = train_dset_y[train_index], train_dset_y[test_index]
    svm.fit(x_train, y_train)
    train_yhat = svm.predict(x_train)
    train_f1_score = f1_score(y_train, train_yhat)
    test_yhat = svm.predict(x_test)
    test_f1_score = f1_score(y_test, test_yhat)
    n_y_one_and_yhat_zero = np.sum(np.logical_and(y_test == 1, test_yhat == 0).astype(int))
    n_y_zero_and_yhat_one = np.sum(np.logical_and(y_test == 0, test_yhat == 1).astype(int))
    print("Zeros wrong, ones wrong in test = ", n_y_zero_and_yhat_one,n_y_one_and_yhat_zero)
    print("Test f1 score:", test_f1_score)
    print("Test precision score:", precision_score(y_test, test_yhat))
    print("Recall score:", recall_score(y_test, test_yhat))
    train_f1_scores.append(train_f1_score)
    test_f1_scores.append(test_f1_score)
    train_yhat = None 
    test_yhat = None 
    x_train = None 
    x_test = None 
    y_train = None 
    y_test = None 
    print(gc.collect())
    print(gc.collect())
    print(gc.collect())

TRAINING AGAIN. 0


TypeError: only integer scalar arrays can be converted to a scalar index

In [332]:
train_f1_scores


[0.6523289074840969,
 0.6516767560295954,
 0.6506225131562059,
 0.6518062048448788,
 0.650933804248943,
 0.6507430884781336,
 0.6511568288149716,
 0.6511399538895055,
 0.6521511634115348,
 0.6512646797961126]

In [333]:
test_f1_scores                                                                  

[0.6165752351097179,
 0.6120565330256706,
 0.6215414073719483,
 0.6137766275134381,
 0.6193930139339568,
 0.6165015206514274,
 0.6122527737578389,
 0.6165020337013365,
 0.6074116965836712,
 0.616070563148202]

## Testset write

In [335]:
sparse_test_x = vectorizer.transform(test_dset_df["preprocessed_joined"])
test_yhat = svm.predict(sparse_test_x)
output_df = test_dset_df.copy()
output_df.info()
output_df["preprocessed_joined"] = test_yhat
output_df = output_df.rename(columns={"qid":"qid", "preprocessed_joined":"target"})
output_df.target = output_df.target.apply(round)
output_df.to_csv("./outputs/2020_10_28_a_testset_output.csv", index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522449 entries, 0 to 522448
Data columns (total 2 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   qid                  522449 non-null  object
 1   preprocessed_joined  522449 non-null  object
dtypes: object(2)
memory usage: 8.0+ MB
