# Notebook to extract negative labels for the energy patent classifier.
The process chosen is to create 100 clusters and use data from clusters without any postive data.  
There are another methods more precise and reliables (hierarchical cluster from positive data and using bert embedings) that have not been used because of time constraint

In [0]:

names = [
    "US07820139B2.xml", "US20130002962A1.xml", "US20130018608A1.xml", "US20130060981A1.xml", "US20130064229A1.xml",
    "US20130094447A1.xml", "KR101894541B1.xml", "KR20180111181A.xml", "US20150219357A1.xml", "US20150241078A1.xml",
    "US20150277467A1.xml", "US20150319689A1.xml", "US20160007280A1.xml", "US20160070249A1.xml", "DE102008040440A1.xml",
    "DE102008042273A1.xml", "DE102009012488A1.xml", "DE102013202978A1.xml", "DE102017009546A1.xml", "DE19841922A1.xml",
    "DE2942932A1.xml", "DE3136273A1.xml", "FR3023438A1.xml", "FR3023438B1.xml", "FR3060930A1.xml", "IN2014CN07341A.xml",
    "JP2015056104A.xml", "JP2015527862A.xml", "US20010036222A1.xml", "US20030133031A1.xml", "US20030171851A1.xml",
    "US20050038571A1.xml", "US20050043862A1.xml", "US20160334030A1.xml", "US20170108905A1.xml", "US20170189889A1.xml",
    "US20170222437A1.xml", "US09810442B2.xml", "US09854515B2.xml", "US09867654B2.xml", "US09989949B2.xml",
    "US10054918B2.xml", "US20170288401A1.xml", "US20170324908A1.xml", "US20170332750A1.xml", "US20170341942A1.xml",
    "US20170343229A1.xml", "US20180031449A1.xml", "US20180173024A1.xml", "US20180206131A1.xml", "US20180274334A1.xml",
    "US20180289120A1.xml", "US20180291886A1.xml", "US20180294248A1.xml", "US20180310858A1.xml", "US20180326173A1.xml",
    "US10136669B2.xml", "US10139845B2.xml", "US10144586B2.xml"]
print(f"Hay {len(names)} registros")

In [0]:
import pyspark.sql.functions as sf
FILTERED_STORAGE_NAME = "challengebasf"
FILTERED_CONTAINER_NAME = "filtereddata"
FILTERED_OUTPUT_FOLDER = "output_data"
input_container = f"wasbs://{FILTERED_CONTAINER_NAME}@{FILTERED_STORAGE_NAME}.blob.core.windows.net"
input_path = f"{input_container}/{FILTERED_OUTPUT_FOLDER}/"

In [0]:
df = spark.read.parquet(input_path)

In [0]:
# Clean and tokenize -> move to spark nlp
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.ml.feature import CountVectorizer
OUTPUT_COL_ENGLISH_TEXT = "english_text"
df_clean = df.select('_file', OUTPUT_COL_ENGLISH_TEXT)
tokenizer = Tokenizer(inputCol=OUTPUT_COL_ENGLISH_TEXT, outputCol="text_token")
df_words_token = tokenizer.transform(df_clean)
remover = StopWordsRemover(inputCol="text_token", outputCol="text_clean")
df_words_no_stopw = remover.transform(df_words_token)

In [0]:
minDF = 5
cv = CountVectorizer(inputCol="text_clean", outputCol="features", minDF=minDF)
cv_model = cv.fit(df_words_no_stopw)
vectorized_tokens = cv_model.transform(df_words_no_stopw)

In [0]:
from pyspark.ml.clustering import LDA

num_topics = 50
lda = LDA(k=num_topics, maxIter=20)
model = lda.fit(vectorized_tokens)
ll = model.logLikelihood(vectorized_tokens)
lp = model.logPerplexity(vectorized_tokens)
print(f"The lower bound on the log likelihood of the entire corpus: {ll}")
print(f"The upper bound on perplexity: {lp}")

In [0]:
# extract vocabulary from CountVectorizer
vocab = cv_model.vocabulary
topics = model.describeTopics()   
topics_rdd = topics.rdd
topics_words = topics_rdd\
       .map(lambda row: row['termIndices'])\
       .map(lambda idx_list: [vocab[idx] for idx in idx_list])\
       .collect()
for idx, topic in enumerate(topics_words):
    print("topic: {}".format(idx))
    print("*"*25)
    for word in topic:
       print(word)
    print("*"*25)

In [0]:
result = model.transform(vectorized_tokens)


In [0]:
output_container_path = f"wasbs://otros@{FILTERED_STORAGE_NAME}.blob.core.windows.net/lda_model_{minDF}/"
model.save(output_container_path)

In [0]:
output_container_path = f"wasbs://otros@{FILTERED_STORAGE_NAME}.blob.core.windows.net"
output_blob_folder = f"{output_container_path}/lda_results_{minDF}/"
result.write.mode("overwrite").parquet(output_blob_folder)
print(f"Data saved!")

In [0]:
result.printSchema()

In [0]:
result.show()

# Comprobación de resultados

We are going to check:
- If data has been scattered between all clusters
- Distribution of positive data between clusters
- Identification of clusters to select data as negative registers

In [0]:
from pyspark.sql.types import *
import numpy as np

In [0]:
minDF = 5
output_container_path = f"wasbs://otros@{FILTERED_STORAGE_NAME}.blob.core.windows.net"
output_blob_folder = f"{output_container_path}/lda_results_{minDF}/"
df_raw = spark.read.parquet(output_blob_folder)
df = df_raw

In [0]:
df = df.withColumn("positive_data", sf.when(sf.col("_file").isin(names), 1).otherwise(0))
to_array = sf.udf(lambda v: v.toArray().tolist(), ArrayType(FloatType()))
df = df.withColumn("topicDistribution", to_array("topicDistribution"))

for n in range(0,50):
  df = df.withColumn(f"topic_{n}", sf.col("topicDistribution").getItem(n))
  
@sf.udf(IntegerType())
def get_cluster(values):
  return max(enumerate(values), key=lambda x: x[1])[0]
df = df.withColumn("cluster", get_cluster(sf.col("topicDistribution")))
num_per_cluster = df.groupBy("cluster").agg(
  sf.sum("positive_data").alias("num_positive_data"), 
  sf.count("cluster").alias("num_total_data"))


In [0]:
num_per_cluster_p = num_per_cluster.toPandas()

In [0]:
assert num_per_cluster_p["num_positive_data"].sum() == len(names)
num_per_cluster_p.loc[:, "perc_positives"] = 100 * num_per_cluster_p["num_positive_data"] / num_per_cluster_p["num_positive_data"].sum()
num_per_cluster_p.loc[:, "perc_total"] = 100* num_per_cluster_p["num_total_data"] / num_per_cluster_p["num_total_data"].sum()

In [0]:
num_per_cluster_p.sort_values("num_positive_data", ascending=False)

Unnamed: 0,cluster,num_positive_data,num_total_data,perc_positives,perc_total
40,33,36,221885,61.016949,16.64751
5,44,8,145266,13.559322,10.898966
32,10,4,194177,6.779661,14.568644
43,42,3,37514,5.084746,2.814587
38,21,3,179800,5.084746,13.489971
11,16,2,32570,3.389831,2.44365
49,36,1,20956,1.694915,1.572279
47,0,1,40180,1.694915,3.014611
44,2,1,5201,1.694915,0.390219
35,25,0,1237,0.0,0.092809


In [0]:
num_per_cluster_p.loc[num_per_cluster_p["num_positive_data"] == 0, "perc_total"].sum()

All cluster have registers.  We can see that the LDA has not created a cluster with energy consumption patents (it would have been ideal) but they are scattered between the most populated clusters.  
It is interesting that cluster 33 has 61% of all positive data with only 16% of registers.
Clusters with no positive data have the 34% of all data

In [0]:
positives = df.filter(sf.col("positive_data") == 1)
positives = positives.drop("topicDistribution")
positives = positives.drop("english_text")
positives = positives.drop("text_token")
positives = positives.drop("text_clean")
positives = positives.drop("features")
positives_p = positives.toPandas()

In [0]:
positives_p["max"] = positives_p.drop("cluster", axis=1).max(axis=1)

In [0]:
positives_p.head()

Unnamed: 0,_file,positive_data,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24,topic_25,topic_26,topic_27,topic_28,topic_29,topic_30,topic_31,topic_32,topic_33,topic_34,topic_35,topic_36,topic_37,topic_38,topic_39,topic_40,topic_41,topic_42,topic_43,topic_44,topic_45,topic_46,topic_47,topic_48,topic_49,cluster,max
0,US20050038571A1.xml,1,0.001808,0.001781,0.00178,0.001783,0.001774,0.001779,0.001773,0.001781,0.001772,0.001774,0.002008,0.001778,0.001799,0.001773,0.001786,0.001772,0.001818,0.001783,0.001772,0.001773,0.001797,0.001928,0.001777,0.001801,0.001782,0.001774,0.001778,0.001781,0.001854,0.001774,0.001796,0.001772,0.001894,0.393589,0.001776,0.001774,0.001801,0.001778,0.001828,0.001773,0.001772,0.001773,0.001814,0.001781,0.520232,0.001814,0.001778,0.001791,0.001777,0.001773,44,1.0
1,DE102009012488A1.xml,1,0.001989,0.001959,0.001958,0.001961,0.001951,0.001957,0.001951,0.001959,0.00195,0.001952,0.222771,0.001955,0.001979,0.00195,0.001964,0.00195,0.002,0.001962,0.001949,0.001951,0.001977,0.275846,0.001955,0.001982,0.00196,0.001951,0.001956,0.001959,0.002039,0.001952,0.001976,0.001949,0.002083,0.408732,0.001954,0.001952,0.001982,0.001956,0.002011,0.00195,0.001949,0.001951,0.001996,0.00196,0.002176,0.001995,0.001956,0.00197,0.001954,0.00195,33,1.0
2,DE102013202978A1.xml,1,0.000795,0.000783,0.000783,0.000784,0.00078,0.000782,0.00078,0.000783,0.00078,0.00078,0.000883,0.000782,0.000791,0.00078,0.000785,0.000779,0.0008,0.000784,0.000779,0.00078,0.00079,0.000848,0.000782,0.000792,0.000784,0.00078,0.000782,0.000783,0.000815,0.00078,0.00079,0.000779,0.000833,0.000896,0.000781,0.00078,0.000792,0.000782,0.000804,0.00078,0.000779,0.00078,0.000798,0.000783,0.961202,0.000798,0.000782,0.000788,0.000781,0.00078,44,1.0
3,US20130002962A1.xml,1,0.001808,0.001781,0.00178,0.001783,0.001774,0.001779,0.001773,0.001781,0.001772,0.001774,0.002008,0.001778,0.001799,0.001773,0.001786,0.001772,0.001818,0.001783,0.001772,0.001773,0.001797,0.001928,0.001777,0.001801,0.001782,0.001774,0.001778,0.001781,0.001854,0.001774,0.001796,0.001772,0.001894,0.911842,0.001776,0.001774,0.001801,0.001778,0.001828,0.001773,0.001772,0.001773,0.001814,0.001781,0.001978,0.001814,0.001778,0.001791,0.001777,0.001773,33,1.0
4,US10139845B2.xml,1,0.00153,0.001507,0.001506,0.001508,0.001501,0.001505,0.0015,0.001507,0.0015,0.001501,0.001699,0.001504,0.001522,0.0015,0.001511,0.001499,0.001538,0.001509,0.001499,0.0015,0.001521,0.001631,0.001504,0.001524,0.001507,0.001501,0.001504,0.001507,0.001568,0.001501,0.001519,0.001499,0.001602,0.925414,0.001503,0.001501,0.001524,0.001504,0.001547,0.0015,0.001499,0.0015,0.001535,0.001507,0.001673,0.001535,0.001504,0.001515,0.001503,0.0015,33,1.0


In [0]:
positives_p.drop(["_file", "positive_data"],axis=1).sum().sort_values(ascending=False)

The clusters with positive info are: 33, 44, 10, 42, 21, 16, 36, 0, 2. we are going to use the last 30 to sample negative data

In [0]:
positives_p.drop(["_file", "positive_data"],axis=1).sum().sort_values(ascending=False)[-30:]

In [0]:
negative_topics_cluster = positives_p.drop(["_file", "positive_data"],axis=1).sum().sort_values(ascending=False)[-30:].index.tolist()
negative_clusters = [int(topic.split("topic_")[1]) for topic in negative_topics_cluster]
negative_cluster_data = df.filter(sf.col("cluster").isin(negative_clusters))
num_total_negative_registers = negative_cluster_data.count()

In [0]:
target_num_samples = 600
negatives = negative_cluster_data.sample(False, target_num_samples/num_total_negative_registers, seed=18)

In [0]:
assert negatives.columns == df.columns
total = negatives.union(df.filter(sf.col("positive_data") == 1))
total = total.select("_file", "positive_data")

Check the output container exists

In [0]:
output_container_path = f"wasbs://trainingsamples@{FILTERED_STORAGE_NAME}.blob.core.windows.net"
output_blob_folder = f"{output_container_path}/file_names/"
total.write.mode("overwrite").parquet(output_blob_folder)
print(f"Data saved!")