In [1]:
import os
os.chdir("..")
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/kairosds/speech2text-323505-6f8076112948.json"

In [2]:
import pandas as pd
import plotly.express as px

from google.cloud import translate_v2 as translate
from src.preprocess import preprocess_spacy as spacy_preprocessor
from src.preprocess import preprocess_google as google_preprocessor

In [3]:
claim_df = pd.read_csv("/Users/kairosds/verifiable-phrase-detection/data/ml_test_data.csv")

In [4]:
print("Number of documents:", claim_df.shape[0])

Number of documents: 15000


In [5]:
claim_df.dtypes

text     object
claim     int64
dtype: object

### Missing values

In [6]:
for column in claim_df.columns:
    print("Column \"{0}\" has {1} missing values".format(column, claim_df[column].isna().sum()))

Column "text" has 0 missing values
Column "claim" has 0 missing values


### Target variable

In [None]:
label_count = claim_df["claim"].value_counts().reset_index()
fig = px.bar(label_count, x="index", y="claim")
fig.update_traces(marker_color="#0ebf8c")
fig.update_layout(title="Number of negative/positive claims")
fig.show("svg")

### Word count

In [None]:
claim_pipeline = spacy_preprocessor.load_processing_pipeline()

In [None]:
claim_df["num_words"] = claim_df["text"].apply(
    lambda x: len(spacy_preprocessor.tokenize(claim_pipeline, x, with_punctuation=False))
)

In [43]:
fig = px.violin(claim_df, x="claim", y="num_words", color="claim", box=True)
fig.update_layout(title="Word count distribution", yaxis_title="# of words")
fig.show("svg")

### Word count of normalized text

 <strong>Normalization steps</strong>:
 - Tokenization
 - Removing stop words and punctuation
 - Lemmatization

In [None]:
claim_df["num_significative_words"] = claim_df["text"].apply(
    lambda x: len(set(spacy_preprocessor.normalize(claim_pipeline, x)))
)

In [42]:
fig = px.violin(claim_df, x="claim", y="num_significative_words", color="claim", box=True)
fig.update_layout(title="Significative word count distribution", yaxis_title="# of words")
fig.show("svg")

In [41]:
claim_df[["num_words", "num_unique_words"]].describe().drop(["count"]).apply(lambda x: round(x, 2))

Unnamed: 0,num_words,num_unique_words
mean,19.03,7.12
std,15.27,5.81
min,1.0,0.0
25%,8.0,3.0
50%,15.0,6.0
75%,26.0,10.0
max,136.0,61.0


In [46]:
# Claims with 0 significant words
claim_df.loc[(claim_df["num_significant_words"] == 0) & (claim_df["claim"] == 1)]

Unnamed: 0,text,claim,num_words,num_unique_words,language,language_detection_confidence
10678,Eran cuatro de los siete.,1,5,0,es,1.0


### Language detection and analysis

In [7]:
# client = translate.Client()
# claim_df["language"], claim_df["language_detection_confidence"] = zip(*claim_df["text"].map(
#     lambda x: google_preprocessor.detect_language(client, x)
# ))

In [8]:
# claim_df.to_csv("./data/det_language_data.csv", index=False)

In [9]:
claim_df = pd.read_csv("./data/det_language_data.csv")

In [16]:
language_iso_list = list(claim_df["language"].unique())
language_list = google_preprocessor.get_language_names(language_iso_list)
print("Language list:", ", ".join(language_list))

Language list: Spanish, Catalan, Galician, Portuguese, Somali, Italian, Undetermined, English, Romanian, Polish, Danish, French, Indonesian, Swedish, Latvian, Japanese, Hindi, Finnish, Lithuanian, Malagasy


In [26]:
print("{}% of texts whose language has been perfectly detected".format(
    round(claim_df.loc[claim_df["language_detection_confidence"] == 1].shape[0]/claim_df.shape[0] * 100, 2)
))

96.63% of texts whose language has been perfectly detected


In [27]:
langdet_confidence_distribution = claim_df.loc[claim_df["language_detection_confidence"] != 1]
fig = px.histogram(langdet_confidence_distribution, x="language_detection_confidence")
fig.show()

In [47]:
claim_df.loc[claim_df["language_detection_confidence"] < 0.6].head(10)

Unnamed: 0,text,claim,num_words,num_unique_words,language,language_detection_confidence
251,Deloitte o intentá por demanda deleita donde f...,0,18,9,es,0.584472
779,Ah!,0,1,1,so,0.482353
801,todas.,0,1,0,es,0.589844
1124,"Pronto, pronto, pronto.",0,3,0,it,0.524555
1142,Vamos.,0,1,0,es,0.5
1421,"Tenemos ocupadas camas UCI o 2,3 fronte o 6,4.",1,9,7,gl,0.536432
1593,"Claro, claro.",0,2,0,es,0.506769
1789,que,0,1,0,es,0.501961
1853,"Normal, no?",0,2,1,es,0.446595
2323,B.,0,1,1,en,0.589844
