# Limpieza de datasets de paraphrasing

In [80]:
import pandas as pd
import numpy as np

In [25]:
# Load the english_task_c paraphrasing dataset from its CSV file and preview the first rows.
df_1 = pd.read_csv('datasets/paraphrasing/english_task_c.csv')
df_1.head()

Unnamed: 0,text_0,text_1,sarcastic_id_ref,human_aggregated,human_votes
0,I see that your team played well today!,I'm sorry that your team didn't win yesterday.,0,0,5
1,"Anthony Taylor is such a fair referee, I wish ...",I hope Anthony Taylor is never put in charge o...,0,0,5
2,"the weather is gloomy, just raining and dull.",What a glorious weather today,1,1,5
3,People going out to get there boosters without...,Nice to see the sheep getting their boosters t...,1,1,5
4,"Really great weather we're having, love a bit ...",Really cold January so far - looking forward t...,0,0,5


In [37]:
# Report the number of rows and how many of them have a reference label
# (sarcastic_id_ref) equal to the aggregated human label (human_aggregated).
print(f"Cantidad de datos: {len(df_1)}")
print(f"Cantidad de ejemplos donde coincide la etiqueta con los votos humanos: {(df_1['human_aggregated'] == df_1['sarcastic_id_ref']).sum()}")

Cantidad de datos: 200
Cantidad de ejemplos donde coincide la etiqueta con los votos humanos: 194


In [99]:
# Load train.En and discard the "Unnamed: 0" column that comes with the CSV.
df_2 = pd.read_csv('datasets/paraphrasing/train.En.csv').drop(columns=["Unnamed: 0"])
# Every column from the fourth one onward: missing values become 0 and the
# result is cast to int before being written back over the same columns.
indicadores = df_2.iloc[:, 3:].fillna(0).astype(int)
df_2[indicadores.columns] = indicadores
df_2.head()

Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",0,1,0,0,0,0
1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,1,0,0,0,0,0
2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",0,1,0,0,0,0
3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to...",1,0,0,0,0,0
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...,1,0,0,0,0,0


In [96]:
# Report the number of rows and how many of them are labelled sarcastic (sarcastic == 1).
print(f"Cantidad de datos: {len(df_2)}")
print(f"Cantidad de datos sarcasticos: {(df_2['sarcastic'] == 1).sum()}")

Cantidad de datos: 3468
Cantidad de datos sarcasticos: 867


Medio pelo... Veamos qué podemos hacer con esto.

Una idea a discutir: podríamos hacer que aprenda en parte a reconocer sarcasmo (o distintos tipos de este) para que después pueda reproducirlo. ¿Y si usamos esto en la primera parte del training y después lo bombardeamos con muchísimos datos con estructura de chat?

Formato a llevar:

- text: el texto sarcástico
- is_sarcastic: booleano
- degree_of_sarcasm: un entero del 0 al 10 (puede ser nulo). Ojo: el código de este notebook escribe la clave como `deegree_of_sarcasm`
- paraphrase: el texto parafraseado sin sarcasmo si es que corresponde
- context: el contexto del que proviene si es que es una respuesta a un hilo
- type: uno de "sarcasm", "irony", "satire", "understatement", "overstatement", "rhetorical_question" (puede ser nulo)
- source: la fuente, de que dataset proviene
- task: la tarea para la cual se va a usar

In [46]:
def procesar_fila_dataframe_1(row):
    """Turn one english_task_c row into a record in the unified format.

    Args:
        row: Mapping-like row providing 'sarcastic_id_ref', 'text_0' and
            'text_1'.

    Returns:
        dict: Record with the keys 'text', 'is_sarcastic',
        'deegree_of_sarcasm', 'paraphrase', 'context', 'type', 'source' and
        'task'. 'text' receives 'text_0' when 'sarcastic_id_ref' equals 0 and
        'text_1' otherwise; 'paraphrase' receives the other text.
    """
    # Decide which of the two text columns feeds 'text' and which feeds 'paraphrase'.
    if row['sarcastic_id_ref'] == 0:
        columna_sarcastica, columna_parafrasis = 'text_0', 'text_1'
    else:
        columna_sarcastica, columna_parafrasis = 'text_1', 'text_0'

    registro = {}
    registro['text'] = row[columna_sarcastica]
    # Every record produced here is marked as sarcastic.
    registro['is_sarcastic'] = 1
    # NOTE(review): the key is spelled 'deegree_of_sarcasm' (double "e"), while
    # the notebook's format description says 'degree_of_sarcasm'. It is kept
    # as-is because the key becomes a column name in the resulting DataFrame.
    registro['deegree_of_sarcasm'] = None
    registro['paraphrase'] = row[columna_parafrasis]
    registro['context'] = None
    registro['type'] = None
    registro['source'] = 'paraphrasing/english_task_c'
    registro['task'] = 'paraphrasing'
    return registro

In [47]:
# Convert each row of df_1 into a record dict, then assemble the dicts into a DataFrame.
prueba = df_1.apply(procesar_fila_dataframe_1, axis="columns")
df_procesado_1 = pd.DataFrame(list(prueba))

In [111]:
# Display the records built from df_1 as the cell's output.
df_procesado_1

Unnamed: 0,text,is_sarcastic,deegree_of_sarcasm,paraphrase,context,type,source,task
0,I see that your team played well today!,1,,I'm sorry that your team didn't win yesterday.,,,paraphrasing/english_task_c,paraphrasing
1,"Anthony Taylor is such a fair referee, I wish ...",1,,I hope Anthony Taylor is never put in charge o...,,,paraphrasing/english_task_c,paraphrasing
2,What a glorious weather today,1,,"the weather is gloomy, just raining and dull.",,,paraphrasing/english_task_c,paraphrasing
3,Nice to see the sheep getting their boosters t...,1,,People going out to get there boosters without...,,,paraphrasing/english_task_c,paraphrasing
4,"Really great weather we're having, love a bit ...",1,,Really cold January so far - looking forward t...,,,paraphrasing/english_task_c,paraphrasing
...,...,...,...,...,...,...,...,...
195,"the tories betrayed the nation, what a surprise!",1,,"the tories betrayed the nation, as expected",,,paraphrasing/english_task_c,paraphrasing
196,Cant wait to spend the rest of my life waiting...,1,,Cant believe we have to spend the rest of our ...,,,paraphrasing/english_task_c,paraphrasing
197,Isn't it just amazing how competent the govern...,1,,"Everything is a total mess, how can anyone be ...",,,paraphrasing/english_task_c,paraphrasing
198,Thanks Boris Johnson for restricting travel ab...,1,,The reasoning behind the tightening of travel ...,,,paraphrasing/english_task_c,paraphrasing


In [108]:
def procesar_fila_dataframe_2(row, sep=","):
    """Turn one train.En row into a record in the unified format.

    Args:
        row (pandas.Series): Row providing 'tweet', 'sarcastic' and
            'rephrase'. Every entry from the fourth position onward is treated
            as an indicator named after a sarcasm category; the indicators are
            assumed to be 0/1 integers.
        sep (str): Separator used to join category names when more than one
            indicator is set. Defaults to ",".

    Returns:
        dict: Record with the keys 'text', 'is_sarcastic',
        'deegree_of_sarcasm', 'paraphrase', 'context', 'type', 'source' and
        'task'. 'paraphrase' is None unless 'sarcastic' equals 1, and 'type'
        is None when no indicator is set.
    """
    # Read the category names from the row's own index instead of a global
    # DataFrame, so the function only depends on the row it receives.
    indicadores = row.iloc[3:]
    categorias = [str(nombre) for nombre, marca in indicadores.items() if marca == 1]

    # Join with an explicit separator: plain concatenation would fuse the
    # names of a multi-category row (e.g. "sarcasmrhetorical_question").
    tipo = sep.join(categorias) if categorias else None

    return {
        'text': row['tweet'],
        'is_sarcastic': row['sarcastic'],
        # NOTE(review): key spelled 'deegree_of_sarcasm' (double "e"); kept
        # as-is because it becomes a column name in the resulting DataFrame.
        'deegree_of_sarcasm': None,
        'paraphrase': row['rephrase'] if row['sarcastic'] == 1 else None,
        'context': None,
        'type': tipo,
        'source': 'paraphrasing/train.En',
        'task': 'paraphrasing'
    }

In [109]:
# Convert each row of df_2 into a record dict, then assemble the dicts into a DataFrame.
prueba_2 = df_2.apply(procesar_fila_dataframe_2, axis="columns")
df_procesado_2 = pd.DataFrame(list(prueba_2))

In [110]:
# Display the records built from df_2 as the cell's output.
df_procesado_2

Unnamed: 0,text,is_sarcastic,deegree_of_sarcasm,paraphrase,context,type,source,task
0,The only thing I got from college is a caffein...,1,,"College is really difficult, expensive, tiring...",,irony,paraphrasing/train.En,paraphrasing
1,I love it when professors draw a big question ...,1,,I do not like when professors don’t write out ...,,sarcasm,paraphrasing/train.En,paraphrasing
2,Remember the hundred emails from companies whe...,1,,"I, at the bare minimum, wish companies actuall...",,irony,paraphrasing/train.En,paraphrasing
3,Today my pop-pop told me I was not “forced” to...,1,,"Today my pop-pop told me I was not ""forced"" to...",,sarcasm,paraphrasing/train.En,paraphrasing
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,,I would say Ted Cruz is an asshole and doesn’t...,,sarcasm,paraphrasing/train.En,paraphrasing
...,...,...,...,...,...,...,...,...
3463,The population spike in Chicago in 9 months is...,0,,,,,paraphrasing/train.En,paraphrasing
3464,You'd think in the second to last English clas...,0,,,,,paraphrasing/train.En,paraphrasing
3465,I’m finally surfacing after a holiday to Scotl...,0,,,,,paraphrasing/train.En,paraphrasing
3466,Couldn't be prouder today. Well done to every ...,0,,,,,paraphrasing/train.En,paraphrasing


In [112]:
# Stack both processed frames into one, renumbering the index from 0, and display it.
df_union = pd.concat(
    (df_procesado_1, df_procesado_2),
    ignore_index=True,
)
df_union

Unnamed: 0,text,is_sarcastic,deegree_of_sarcasm,paraphrase,context,type,source,task
0,I see that your team played well today!,1,,I'm sorry that your team didn't win yesterday.,,,paraphrasing/english_task_c,paraphrasing
1,"Anthony Taylor is such a fair referee, I wish ...",1,,I hope Anthony Taylor is never put in charge o...,,,paraphrasing/english_task_c,paraphrasing
2,What a glorious weather today,1,,"the weather is gloomy, just raining and dull.",,,paraphrasing/english_task_c,paraphrasing
3,Nice to see the sheep getting their boosters t...,1,,People going out to get there boosters without...,,,paraphrasing/english_task_c,paraphrasing
4,"Really great weather we're having, love a bit ...",1,,Really cold January so far - looking forward t...,,,paraphrasing/english_task_c,paraphrasing
...,...,...,...,...,...,...,...,...
3663,The population spike in Chicago in 9 months is...,0,,,,,paraphrasing/train.En,paraphrasing
3664,You'd think in the second to last English clas...,0,,,,,paraphrasing/train.En,paraphrasing
3665,I’m finally surfacing after a holiday to Scotl...,0,,,,,paraphrasing/train.En,paraphrasing
3666,Couldn't be prouder today. Well done to every ...,0,,,,,paraphrasing/train.En,paraphrasing


Yo creo que va a servir bastante; falta ver si pongo etiquetas de usuario y asistente o cómo planteo la entrada.