# Checking the distribution of poem line lengths across LLMs

In [57]:
# Plot the distribution of words-per-line for the poems generated by the
# baseline Vistral model. Produces `fig1`, a bar chart of line length
# (in word tokens) against the percentage of lines having that length.
import json
import re
from collections import Counter

import nltk
import pandas as pd
import plotly.express as px
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

# load the json file; JSON text is UTF-8, so do not depend on the platform
# default encoding when reading it
with open('/content/baseline.json', encoding='utf-8') as f:
    data = json.load(f)

# compiled once: matches every character that is neither a word character
# nor whitespace, i.e. punctuation
punctuation = re.compile(r'[^\w\s]')

# a: number of word tokens on each line, over all poems
a = []
for i in data:
  # remove punctuation so it is not tokenized into separate "words"
  tokens = [punctuation.sub('', line) for line in i["poem"].split("\n")]

  # tokenize; a blank line yields zero tokens and is counted as length 0
  a += [len(word_tokenize(line)) for line in tokens]

# line length -> number of lines with that length
a_dict = dict(Counter(a))
# total number of lines counted
total_sum = sum(a_dict.values())

# percentage of lines having each line length, rounded to two decimals
res_percentage = {key: round(value * 100 / total_sum, 2)
                  for key, value in a_dict.items()}

res_df = pd.DataFrame(res_percentage.items(), columns=['line_length', 'percentage'])

# NOTE(review): these two names are not used by this cell; they are kept
# only in case another cell refers to them.
default_color = "blue"
colors = {"5": '#7FD4C1'}


fig1 = px.bar(res_df, x='line_length', y='percentage',
             title="Distribution of length in poem line with Vistral (baseline)",
             text_auto=True)
fig1.update_layout(xaxis_title="Sentence length", yaxis_title="Probability (in percent)")
# print each bar's value horizontally above the bar
fig1.update_traces(textfont_size=14, textangle=0, textposition="outside")
# show a tick for every integer length instead of letting plotly skip some
fig1.update_xaxes(range=[0, 11], tickvals=list(range(-1, 12)))
# highlight the bar for 5-word lines in a different colour from the rest
fig1["data"][0]["marker"]["color"] = ['#8690FF' if c == 5 else '#7FD4C1' for c in fig1["data"][0]["x"]]
fig1.show()



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [59]:
# Plot the distribution of words-per-line for the poems generated by
# Gemini Pro. Produces `fig2`, a bar chart of line length (in word tokens)
# against the percentage of lines having that length.
import json
import re
from collections import Counter

import nltk
import pandas as pd
import plotly.express as px
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

# load the json file; JSON text is UTF-8, so do not depend on the platform
# default encoding when reading it
with open('/content/gemini_generated.json', encoding='utf-8') as f:
    data = json.load(f)

# compiled once: matches every character that is neither a word character
# nor whitespace, i.e. punctuation
punctuation = re.compile(r'[^\w\s]')

# a: number of word tokens on each line, over all poems
a = []
for i in data:
  # remove punctuation so it is not tokenized into separate "words";
  # this file stores the poem under the "text" key
  tokens = [punctuation.sub('', line) for line in i["text"].split("\n")]

  # tokenize; a blank line yields zero tokens and is counted as length 0
  a += [len(word_tokenize(line)) for line in tokens]

# line length -> number of lines with that length
a_dict = dict(Counter(a))
# total number of lines counted
total_sum = sum(a_dict.values())

# percentage of lines having each line length, rounded to two decimals
res_percentage = {key: round(value * 100 / total_sum, 2)
                  for key, value in a_dict.items()}

res_df = pd.DataFrame(res_percentage.items(), columns=['line_length', 'percentage'])

# NOTE(review): these two names are not used by this cell; they are kept
# only in case another cell refers to them.
default_color = "blue"
colors = {"5": '#7FD4C1'}


fig2 = px.bar(res_df, x='line_length', y='percentage',
             title="Distribution of length in poem line with Gemini Pro",
             text_auto=True)
fig2.update_layout(xaxis_title="Sentence length", yaxis_title="Probability (in percent)")
# print each bar's value horizontally above the bar
fig2.update_traces(textfont_size=14, textangle=0, textposition="outside")
# show a tick for every integer length instead of letting plotly skip some
fig2.update_xaxes(range=[0, 11], tickvals=list(range(-1, 12)))
# highlight the bar for 5-word lines in a different colour from the rest
fig2["data"][0]["marker"]["color"] = ['#8690FF' if c == 5 else '#7FD4C1' for c in fig2["data"][0]["x"]]
fig2.show()



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [60]:
# Plot the distribution of words-per-line for the poems generated by
# VistralPoem. Produces `fig3`, a bar chart of line length (in word tokens)
# against the percentage of lines having that length.
import json
import re
from collections import Counter

import nltk
import pandas as pd
import plotly.express as px
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

# load the json file; JSON text is UTF-8, so do not depend on the platform
# default encoding when reading it
with open('/content/vistralpoem.json', encoding='utf-8') as f:
    data = json.load(f)

# compiled once: matches every character that is neither a word character
# nor whitespace, i.e. punctuation
punctuation = re.compile(r'[^\w\s]')

# a: number of word tokens on each line, over all poems
a = []
for i in data:
  # remove punctuation so it is not tokenized into separate "words"
  tokens = [punctuation.sub('', line) for line in i["poem"].split("\n")]

  # tokenize; a blank line yields zero tokens and is counted as length 0
  a += [len(word_tokenize(line)) for line in tokens]

# line length -> number of lines with that length
a_dict = dict(Counter(a))
# total number of lines counted
total_sum = sum(a_dict.values())

# percentage of lines having each line length, rounded to two decimals
res_percentage = {key: round(value * 100 / total_sum, 2)
                  for key, value in a_dict.items()}

res_df = pd.DataFrame(res_percentage.items(), columns=['line_length', 'percentage'])

# NOTE(review): these two names are not used by this cell; they are kept
# only in case another cell refers to them.
default_color = "blue"
colors = {"5": '#7FD4C1'}


fig3 = px.bar(res_df, x='line_length', y='percentage',
             title="Distribution of length in poem line with VistralPoem (Ours)",
             text_auto=True)
fig3.update_layout(xaxis_title="Sentence length", yaxis_title="Probability (in percent)")
# print each bar's value horizontally above the bar
fig3.update_traces(textfont_size=14, textangle=0, textposition="outside")
# show a tick for every integer length instead of letting plotly skip some
fig3.update_xaxes(range=[0, 11], tickvals=list(range(-1, 12)))
# highlight the bar for 5-word lines in a different colour from the rest
fig3["data"][0]["marker"]["color"] = ['#8690FF' if c == 5 else '#7FD4C1' for c in fig3["data"][0]["x"]]
fig3.show()



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [81]:
# Stack the three per-model bar charts (fig1, fig2, fig3, built in the
# earlier cells) into one figure with three rows for side-by-side comparison.
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Rows and the single column share the figure equally by default, so no
# explicit row_heights/column_widths are needed (plotly normalises them to
# fractions anyway). vertical_spacing is a fraction of the figure height
# placed between consecutive rows; with 3 rows there are 2 gaps, so a value
# of 0.5 would consume the whole height and leave the subplots with zero
# height. 0.1 leaves 80% of the height for the three charts.
fig = make_subplots(rows=3, cols=1,
                    subplot_titles=['Vistral (Baseline)',
                                    "Gemini Pro",
                                    "VistralPoem (Ours)"],
                    vertical_spacing=0.1)

# add_trace replaces the deprecated append_trace; reuse the single bar trace
# of each per-model figure (including its per-bar colours).
fig.add_trace(fig1["data"][0], row=1, col=1)
fig.add_trace(fig2["data"][0], row=2, col=1)
fig.add_trace(fig3["data"][0], row=3, col=1)


fig.update_layout(height=1000, width=800, title_text="Poem length distribution comparison")
# print each bar's value horizontally above the bar
fig.update_traces(textfont_size=14, textangle=0, textposition="outside")
# show a tick for every integer length on all three x-axes, no skipping
fig.update_xaxes(range=[0, 11], tickvals=list(range(-1, 12)))
fig.show()