In [199]:
import json
import os
import random

from IPython.display import HTML, display
import pandas as pd

# Export data

In [8]:
# Read the training essays and keep only the rows whose prompt_id is 0.
train_essays = pd.read_csv("train_essays.csv").loc[
    lambda frame: frame["prompt_id"] == 0
]
train_essays.head()

Unnamed: 0,id,prompt_id,text,generated
0,0059830c,0,Cars. Cars have been around since they became ...,0
1,005db917,0,Transportation is a large necessity in most co...,0
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0
3,00940276,0,How often do you ride in a car? Do you drive a...,0
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0


In [38]:
# Parse the JSON file with the generated essays and wrap it in a DataFrame.
with open("generated_essays_0.json") as essays_file:
    synt_texts = json.loads(essays_file.read())
synt_texts_df = pd.DataFrame(data=synt_texts)
synt_texts_df.head()

Unnamed: 0,0
0,"global level, cars are a major contributor to..."
1,To inform fellow citizens about the advantage...
2,Smog. Traffic. Runaway emissions. Our love af...
3,Imagine a city where the streets are free of ...
4,Every day car usage has many negative conseque...


In [32]:
# Row count of the generated-essay frame.
num_texts = synt_texts_df.shape[0]
num_texts

50

In [39]:
# Bring the generated essays to the same column layout as train_essays:
# text, id, prompt_id, generated.
first_column = synt_texts_df.columns[0]
synt_texts_df = synt_texts_df.rename(columns={first_column: "text"}).assign(
    id=lambda frame: frame.index,
    prompt_id=0,
    generated=1,  # every text in this frame is generated
)

In [40]:
# Pick a random essay to inspect. random.randrange excludes its upper bound, so
# the index is always valid for .iloc; random.randint(0, num_texts) may return
# num_texts itself, which is one past the last row and raises IndexError.
i = random.randrange(num_texts)
print(synt_texts_df.iloc[i].text)

Some  advantages for citizens in limiting car usage that can be drawn from the passage set include:

1) Reduced stress and improved quality of life. Residents of Vauban, Germany who chose to live in a car-free community report being "much happier" without cars. With less cars clogging the streets, citizens can walk, bike, and play more freely, reducing stress and creating a positive community atmosphere. 

2) Improved air quality and public health. Fewer cars means less vehicle emissions polluting the air, leading to reduced smog. For example, Bogota, Colombia has seen improvements in air quality after implementing a "Day Without Cars." And Paris had to issue driving bans due to dangerous smog levels caused in part by diesel emissions. Limiting cars can literally save lives and allow the public to breathe easier.  

3) Promotion of alternative sustainable transport. With cars off the streets, citizens are incentivized to use public transport, walk, or bike for their daily commutes. The

In [41]:
# Drop the </essay> tag from every essay, then print the same essay again.
synt_texts_df["text"] = synt_texts_df["text"].map(
    lambda essay: essay.replace("</essay>", "")
)
print(synt_texts_df["text"].iloc[i])

Some  advantages for citizens in limiting car usage that can be drawn from the passage set include:

1) Reduced stress and improved quality of life. Residents of Vauban, Germany who chose to live in a car-free community report being "much happier" without cars. With less cars clogging the streets, citizens can walk, bike, and play more freely, reducing stress and creating a positive community atmosphere. 

2) Improved air quality and public health. Fewer cars means less vehicle emissions polluting the air, leading to reduced smog. For example, Bogota, Colombia has seen improvements in air quality after implementing a "Day Without Cars." And Paris had to issue driving bans due to dangerous smog levels caused in part by diesel emissions. Limiting cars can literally save lives and allow the public to breathe easier.  

3) Promotion of alternative sustainable transport. With cars off the streets, citizens are incentivized to use public transport, walk, or bike for their daily commutes. The

In [42]:
# Preview the first five rows after the format changes.
synt_texts_df.head(n=5)

Unnamed: 0,text,id,prompt_id,generated
0,"global level, cars are a major contributor to...",0,0,1
1,To inform fellow citizens about the advantage...,1,0,1
2,Smog. Traffic. Runaway emissions. Our love af...,2,0,1
3,Imagine a city where the streets are free of ...,3,0,1
4,Every day car usage has many negative conseque...,4,0,1


In [43]:
# Combine the generated essays with an equally sized sample of train_essays and
# shuffle the rows. Both random steps use a fixed seed so that re-running the
# notebook reproduces the same selection and the same order.
SAMPLING_SEED = 42
essays_for_labelling = pd.concat(
    (synt_texts_df, train_essays.sample(num_texts, random_state=SAMPLING_SEED))
)
essays_for_labelling = essays_for_labelling.sample(
    frac=1, random_state=SAMPLING_SEED
)

In [44]:
# Write the combined dataset to disk (the index is written as the first column).
essays_for_labelling.to_csv(path_or_buf="essays_for_labelling.csv")

In [None]:
Так мы получили датасет, в котором 100 текстов: 50, написанных человеком, и 50, сгенерированных нейросетью.

# Annotation

Разметка происходила в Label Studio. Подробности в README.

# Annotation analysis

In [49]:
# Load the annotation export.
annotated_df = pd.read_csv(filepath_or_buffer="annotation.csv")

In [50]:
# Preview the first five annotation rows.
annotated_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,annotation_id,annotator,answer,created_at,generated,id,lead_time,other_features,prompt_id,value,text,updated_at
0,387,11,2,"[{""end"":1902,""text"":""moreover"",""start"":1894,""l...",2023-12-03T09:18:31.609268Z,0,4d2bb193,1807.014,"{""text"":[""lowercase"",""ошибки"",""есть цитаты""]}",0,Human,Over the past years massive car usage has incr...,2023-12-03T09:18:31.609292Z
1,24,27,2,"[{""end"":79,""text"":""several disadvantages"",""sta...",2023-12-03T16:25:23.496489Z,1,24,79.94,"{""text"":[""короткое введение"",""короткий текст"",...",0,AI,When relying solely on cars and other persona...,2023-12-03T16:25:23.496517Z
2,506,17,2,"[{""end"":227,""text"":""Paris,France"",""start"":215,...",2023-12-03T10:22:39.494931Z,0,62b480e1,311.635,"{""text"":[""слишком длинно"",""слишком правильный ...",0,AI,The culture of the car has been coming to an e...,2023-12-03T10:22:39.494956Z
3,12,29,2,,2023-12-03T16:27:46.866362Z,1,12,76.328,абзацы одинаковой длины,0,AI,Todays transportation system revolves around ...,2023-12-03T16:27:46.866389Z
4,44,9,2,"[{""end"":289,""text"":""reduces traffic congestion...",2023-12-03T08:48:27.640778Z,1,44,130.067,"{""text"":[""слишком обычный текст..."",""длинные п...",0,AI,Life without cars has many advantages for cit...,2023-12-03T08:48:27.640800Z


In [53]:
# The dataset was only partially annotated; keep the annotated rows.
# pd.read_csv loads empty cells as NaN, and NaN != "" evaluates to True, so a
# plain comparison with "" keeps unannotated rows. Missing values are therefore
# dropped explicitly. .copy() yields an independent frame, so adding columns
# later does not operate on a slice of the original.
annotated_df = annotated_df[
    annotated_df["value"].notna() & (annotated_df["value"] != "")
].copy()

In [54]:
# Number of rows left after filtering.
annotated_df.shape[0]

25

Всего было размечено 25 текстов.

In [56]:
# Encode the annotator's verdict as 1 for "AI" and 0 for anything else.
# An already-encoded 1 is accepted as well, which makes the cell idempotent:
# previously a second run compared the integers with "AI" and reset every row to 0.
annotated_df["value"] = annotated_df["value"].apply(
    lambda v: 1 if v in ("AI", 1) else 0
)

In [66]:
# Number of texts per annotated label.
annotated_df.groupby("value")[["text"]].count()

Unnamed: 0_level_0,text
value,Unnamed: 1_level_1
0,14
1,11


In [67]:
# Number of texts per ground-truth label.
annotated_df.groupby("generated")[["text"]].count()

Unnamed: 0_level_0,text
generated,Unnamed: 1_level_1
0,14
1,11


In [203]:
# Flag the rows where the annotation matches the ground truth.
annotated_df["correct"] = annotated_df["generated"].eq(annotated_df["value"])

In [204]:
# Rows where the annotation disagrees with the ground truth.
incorrect = annotated_df.loc[annotated_df["correct"].eq(False)]

In [205]:
# Number of misclassified texts.
incorrect.shape[0]

4

In [97]:
# Ids of the misclassified texts.
incorrect["id"]

2     62b480e1
21    52906497
22           3
23           2
Name: id, dtype: object

In [197]:
def display_text(text, spans):
    """Render ``text`` as HTML with the annotated spans highlighted.

    Args:
        text: Essay text to show.
        spans: JSON string holding a list of span objects, each with
            ``start``, ``end`` and ``labels`` keys. Any other value (for
            example NaN or an empty string) is treated as "no spans".

    Every span is wrapped in a tag named after its first label; the embedded
    stylesheet colours ``AI`` red and ``Human`` green. The raw ``spans``
    argument is printed before the rendered text.
    """
    from html import escape  # local import keeps the function self-contained

    print(spans)
    if spans and isinstance(spans, str):
        spans = json.loads(spans)
    else:
        spans = []
    style = "AI{background-color:#FF0000} Human{background-color:#008000}"
    # Escape every character so that "<", ">" and "&" inside the essay are
    # shown literally instead of being parsed as markup.
    colored_text = [escape(char, quote=False) for char in text]
    for s in spans:
        label = s["labels"][0]
        # NOTE(review): offsets are shifted by the number of line breaks that
        # precede the span, as before — presumably the annotation offsets do
        # not count them; confirm against the annotation tool.
        num_linebreaks = text[: s["start"]].count("\n")
        start = s["start"] + num_linebreaks
        # "end" is exclusive (end - start equals the length of the span text),
        # so the last highlighted character sits at end - 1. Clamp to the text
        # length so a span reaching the end of the essay cannot overflow.
        end = min(s["end"] + num_linebreaks, len(colored_text))
        if start < 0 or start >= end:
            continue  # empty or out-of-range span: nothing to highlight
        colored_text[start] = f"<{label}>{colored_text[start]}"
        colored_text[end - 1] = f"{colored_text[end - 1]}</{label}>"
    content = "".join(colored_text).replace("\n", "<br>")
    page = f"<style>{style}</style> <div> {content} </div>"
    display(HTML(page))

In [198]:
# Show every misclassified essay with its highlighted spans, the annotator's
# verdict, the ground truth and the annotator's comments.
for _, essay in incorrect.iterrows():
    print(essay["id"])
    display_text(essay["text"], essay["answer"])
    print(
        f"Annotation: {essay['value']}",
        f"Ground truth: {essay['generated']}",
        f"Comments: {essay['other_features']}",
        "\n-----\n",
        sep="\n",
    )

62b480e1
[{"end":227,"text":"Paris,France","start":215,"labels":["AI"]},{"end":321,"text":"Bogota,colombia","start":306,"labels":["AI"]},{"end":1722,"text":".\"","start":1720,"labels":["AI"]},{"end":4974,"text":"The romantic city of Paris","start":4948,"labels":["AI"]},{"end":5164,"text":"The city of Bogota,Colombia","start":5137,"labels":["AI"]},{"end":5136,"text":"In Germany where they have developed a city where there is no car in sight and has everything within a walking or bike distance.","start":5008,"labels":["Human"]},{"end":1055,"text":"permitted,the","start":1042,"labels":["AI"]},{"end":3244,"text":"Paris will return to the romantic city","start":3206,"labels":["AI"]}]


Annotation: 1
Ground truth: 0
Comments: {"text":["слишком длинно","слишком правильный текст","повторяются The city of Bogota,Colombia, The romantic city of Paris"]}

-----

52906497
[{"end":3042,"text":"carsArticle 4.","start":3028,"labels":["AI"]},{"end":2639,"text":"Article 4.\\n","start":2629,"labels":["AI"]},{"end":3477,"text":"safetyArticle4","start":3463,"labels":["AI"]},{"end":936,"text":"playingArticle 1","start":920,"labels":["AI"]}]


Annotation: 1
Ground truth: 0
Comments: может просто съехал markdown оформление?

-----

3
nan


Annotation: 0
Ground truth: 1
Comments: nan

-----

2
[{"end":34,"text":"Smog. Traffic. Runaway emissions.","start":0,"labels":["Human"]}]


Annotation: 0
Ground truth: 1
Comments: {"text":["абзацы одинаковой длины","короткий текст","скучный текст"]}

-----



In [None]:
Сравним качество разметки у разных разметчиков.

In [209]:
# Per annotator: number of annotated texts and share of correct verdicts.
annotated_df.groupby("annotator").agg(
    text=("text", "count"),
    correct=("correct", "mean"),
)

Unnamed: 0_level_0,text,correct
annotator,Unnamed: 1_level_1,Unnamed: 2_level_1
2,21,0.904762
3,4,0.5
