In [1]:
import os
import textwrap
import re
import google.generativeai as genai

# Read the Gemini API key from the environment rather than hard-coding it.
# Assigning a literal to os.environ here would overwrite a real key that is
# already exported, would make the "missing key" check below unreachable, and
# invites committing a secret to source control.
# Set the key before running, e.g. `export GEMINI_API_KEY=...` in the shell
# (or os.environ["GEMINI_API_KEY"] = "..." in a private, uncommitted cell).
_PLACEHOLDER_API_KEY = "YOUR GEMINI API KEY"  # template value; never a valid key
GEN_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEN_API_KEY or GEN_API_KEY == _PLACEHOLDER_API_KEY:
    raise ValueError("❌ Error: Missing Gemini API Key! Set it using os.environ['GEMINI_API_KEY'] = 'YOUR_KEY'")

# Register the key with the google.generativeai client used below.
genai.configure(api_key=GEN_API_KEY)

# Break a long text into pieces small enough to send one at a time
def split_text(text, max_chunk_size=800):
    """Wrap *text* into a list of chunks of at most *max_chunk_size* characters.

    Breaks happen at whitespace only: a single word longer than the limit is
    kept whole, so that chunk may exceed *max_chunk_size*. Whitespace
    characters such as newlines are not converted to spaces. Empty input
    yields an empty list.
    """
    chunker = textwrap.TextWrapper(
        width=max_chunk_size,
        break_long_words=False,
        replace_whitespace=False,
    )
    return chunker.wrap(text)

# 📌 Ask Gemini for a concise ~3-minute storyboard script split into 5-second timestamps
def generate_3min_storyboard_with_timestamps(text):
    """Request a storyboard script for *text* from the "gemini-pro" model.

    The research text is embedded into a fixed instruction prompt and sent
    with ``generate_content``.

    Returns:
        The response text when the model returns a non-empty answer, or
        ``None`` when the response is empty or any exception is raised.
        Failures are reported with ``print`` rather than propagated.
    """
    try:
        storyboard_request = f"""
        You are an expert in creating engaging, structured, and visually detailed **video storyboard scripts**.
        Your task is to **summarize the provided research into a structured, cinematic storyboard (~3 minutes total length, ~180 seconds)**.

        **📌 Requirements:**
        - **Concise and structured** like the example (Diabetes Detection storyboard).
        - Use **clear scene breakdowns** (e.g., "Scene 1 (0-5s): [Visuals] + [Audio]").
        - **Divide each scene into 5-second intervals**.
        - **6-7 scenes covering 3 minutes** (each scene will be broken into 5-second chunks).
        - Focus on **visual storytelling**, **smooth transitions**, and **clear narration**.
        - **Keep it engaging**, avoiding overly technical language.
        - Provide an **impactful call to action** at the end.

        **📜 Example Storyboard Format with 5-second timestamps:**
        ```
        **Scene 1 (0-5 seconds)**
        - **Visual:** [Describe first 5-second shot]
        - **Audio:** [Narration that introduces the topic concisely]

        **Scene 1 (5-10 seconds)**
        - **Visual:** [Continue next 5-second shot]
        - **Audio:** [Continue narration]

        **Scene 2 (30-35 seconds)**
        - **Visual:** [Describe how to introduce the problem]
        - **Audio:** [Briefly explain why the problem matters]
        ...
        ```
        Now, generate a similar **3-minute storyboard with 5-second timestamps** for the following research text:
        **Research Text:** {text}
        """

        gemini = genai.GenerativeModel("gemini-pro")
        reply = gemini.generate_content(storyboard_request)

        # Guard clause: a missing response or empty text counts as a failure.
        if not reply or not reply.text:
            print("❌ Error: Empty response from Gemini API.")
            return None

        print("✅ 3-Minute Storyboard with Timestamps generated successfully.")
        return reply.text

    except Exception as failure:
        # Best-effort by design: report the problem and let the caller skip this chunk.
        print(f"❌ Error generating storyboard: {failure}")
        return None

# Extract references to tables, figures, and equations
def extract_references(text):
    """Return the distinct table/figure/equation references found in *text*.

    Recognised forms are a label (``Table``, ``Figure``, ``Fig``,
    ``Equation``, ``Eq``) with an optional trailing dot and optional single
    whitespace, followed by either an Arabic number or an upper-case Roman
    numeral, e.g. ``"Table 1"``, ``"Fig. 2"``, ``"Table II"``.

    Returns:
        A list of the full matched strings (label plus number), without
        duplicates, in order of first appearance. Empty when nothing matches.
    """
    # The label group is non-capturing: with a capturing group, re.findall
    # returns only the group ("Table") and drops the reference number.
    # The lookbehind requires a separator before a Roman numeral, so a label
    # glued to capital letters is not mistaken for a reference.
    ref_pattern = (
        r"\b(?:Table|Figure|Equation|Fig|Eq)\.?\s?"
        r"(?:\d+|(?<=[\s.])[IVXLCDM]+\b)"
    )
    matches = (found.group(0) for found in re.finditer(ref_pattern, text))
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(matches))

# Chunk the paper text, request a storyboard for each chunk, and collect detected references
def process_research_paper_with_timestamps(text):
    """Generate storyboards for *text* chunk by chunk.

    The text is split into chunks of up to 800 characters. For every chunk
    the references are extracted and a storyboard is requested; progress,
    each generated storyboard, and the chunk's references are printed.
    Chunks whose storyboard generation yields nothing are skipped in the
    combined result.

    Returns:
        A tuple ``(full_storyboard, all_references)``: the generated
        storyboards concatenated, each preceded by a newline, and the set of
        references collected over all chunks.
    """
    pieces = split_text(text, max_chunk_size=800)
    total = len(pieces)
    storyboard_parts = []
    seen_references = set()

    for position, piece in enumerate(pieces, start=1):
        print(f"\n🔹 Processing Chunk {position}/{total}...\n")
        piece_references = extract_references(piece)

        # One storyboard request is sent per chunk.
        piece_storyboard = generate_3min_storyboard_with_timestamps(piece)

        if piece_storyboard:
            print("\n🎬 **Generated 3-Minute Storyboard with 5-Second Timestamps:**\n")
            print(piece_storyboard)
            storyboard_parts.append("\n" + piece_storyboard)

        print("\n🔷 References Detected:", piece_references)
        seen_references.update(piece_references)

    return "".join(storyboard_parts), seen_references

# Example input: raw text extracted from a research paper (replace with the text you want to storyboard)
research_paper = """
4thMediterranean Conference on Embedded ComputingMECO - 2015Budva, Montenegro

An Intelligent System  for Diabetes Prediction

Zhilbert Tafa
Dep. of  Computer Science
University for Business and Technology
Prishtina, Kosovo
tafaul@t -com.me
 Nerxhivane Pervet ica
Bertran Karahoda
Dep. of Computer Science
University for Business and Technology
Prishtina, Kosovo
np32972@ubt -uni.net
b.karahoda@ubt -uni.net


Abstract—With the emerging increase of diabetes, that recently
affects around 346 million people, of which more than one-third
go undetected in early stage, a strong need for supporting the
medical deci sion-making process  is generated . A number of
researches have focused either in using one of the algorithms or
in the comparisons of the performances of algorithms on a given,
usual ly predefined and static datasets that are accessible through
the Internet.  This paper focuses on the joint implementation of
the support vector machine (SVM) and N aïve Bayes statistical
modeling , in the dataset acquired from the medical examinations
of 40 2 patients , in order to improve the computer -supported
diagnosis reliability . The dataset contains some attributes that
have not been previously used in computer -based evaluations .
The results show that the joint implementation of two algorithms
improves s ignificantly the overall reliability  of the system
outcome , which is crucial in the computer -supported diabetes
diagnostic process .
Keywords - algorithms;  diabetes ; joint implementation,  machine
learning ; Naïve Bayes ; support vector machine .
I.  INTRODUCTION
Diabetes is a chronic disease caused by the increase in
blood sugar , mainly either due to the less production or no
production of insulin in body ( type 1 diabetes ), or due to  the
fact that cells do not respond to the produced insulin  (type 2
diabetes) . In recent years, the number of diabetic patients has
increased drastically , as noted in [1],  mainly due to the aging
population and  irregular western food habits . According to the
World Health Organization , diabetes affects around 346
million people in the wo rld, with the prevalence of diabetes
type 2. Moreover, diabetes is the major cause for heart stroke,
kidney failure, lower -limb amputations and blindness. As
presented in [2 ], the absence of symptoms, or the absence of
recognition of the indicators in the patient’s data , may lead to
the pre -diabetes or diabetes condition that go es undetected
even in more than one-third of people that ar e later diagnosed
with diabetes.
During the clinical examinations of various forms, lots of
data are acquired from patients. The development of the
computer -based methods that would enable the high
probability recognition of pre -diabetic or diabetic condition can
be an efficient support t o the decision making in healthcare.  Machine learning is  the area of artifici al intelligence  that
uses the statistical analyse s, and is recognized to be a
promising area that, based on the given dataset of diabetics,  can
help in patient classification or probability prediction regarding
the patient’s pre-diabetic or diabetic condition.  The main
strength of these methods is contained in the ability of the
algorithms to learn from data and to use that knowledge for
later predictions and decisions. There are a number of machine
learning and statistical modeling approaches that so far have
been involved in various aspects of solving the problem .
According to [3 ], although other classifiers perform well, the
SVM outperforms other  classifiers with respect to accuracy,
sensitivity, specificity , and precision.
This paper presents a  joint Matlab implementation of the
SVM and N aïve Bayes methods in a new dataset acquired from
the patients examined for diabetes in Kosovo. The developed
diagnostic tool enables the in telligent computer -based
prediction on diabetes , based on the previously acquired
values. The statistical analysis shows the high accuracy of data
classification.  Also, the proposed joint implementation of two
algorithms aims to improve the  reliability of the decision by
using the power of both algorithms in minimizing their
individual weakness .
The rest of the paper is structured as follows. Related work
on the topic is presented in Section II . In section III, the
experimental  setup and the  implementation are  presented.
Results are shown and discussed in Section IV  while the
Section V concludes the work .
II. RELATED WORK
A number of research efforts  have  been directed in
involving the machine learning algorithms to the design of
intelligent healthcare applications,  especially in disease
detection.  Most of them have focused on heart disease, cancer
detection, and diabetes.  An analytical study of several
algorithms , focused on classification of d iabetes mellit us data ,
is presented in [1 ]. The algorithms have mostly  focused on
detection of pre -diabetes, which was recognized in [4 ] as a
relatively strong  indication for the future development of
diabetes. A recent study in this direction is given in [5 ], where
two machine learning techniques, namely SVM and ANN

4thMediterranean Conference on Embedded ComputingMECO - 2015Budva, Montenegro

(Artificia l Neural Networks) , are used to predict pre -diabetes
in Korean population.  A similar approach is used in studying
the correlation for hematological parameters and glucose level
for identification of diabetes  [6].
In general, the research focus is to conduc t some of the
supervised learning  algorithms on the given dataset and
extract the knowledge about the prediction of diabetes based
on given values of the appropriate attributes. In [7], the SVM
implementation gives the prediction accuracy of 94 %. Another
implementation of the SVM in detec ting the diabetes is given
in [8 ]. Here, t he SVM classifier , however , performs only 78 %
of accuracy . A method for prediction of diabetes by using
Bayesian network is given in [9 ] while the authors in [10]
separately use Naïve Bayes and k -nearest neighbor algorithm.
Most of the mentioned researches , rely on Pima Indian
database of diabetic, and therefore have the same attributes
and similar conclusions. Furthermore, they treat one or two
algorithms independently to compar e the efficiency of the
algorithms between each other.  Some studies, however,
recommend the hybrid use of a distance -based algorithm and a
statistical based method [11] or the combination of
classification and clustering [1].
In contrast to the most of the  mentioned materials and
methodologies,  this paper treats original dataset , with different
attributes,  extracted from the medical examinations in Kosovo.
Beside the algorithm performance evaluations, t he aim of the
paper  is not only to analyze the dataset and to provide a
supporting tool for diabetes detection, but also to improve the
derived decision reliability  by jointly using two machine
learning algorithms . This creates a more reliable zone (with
the answers yes/no on diabetes) and a “grey zone” that would
direct the decision making process to the further clinical
examinations.
III. MATERIALS AND IMPLEMENTATION METHODS
A. Dataset description
Dataset consists of 402 instances  taken from three different
locations in Kosovo. During the data a cquisition process, the
appropriate importance is given to the patient’s data privacy
and anonymity.  The attributes of the database are : BMI (body
mass index), glucose level before meal and after meal, the
systolic and diastolic blood pressure, the heredit arily factor, the
regular diet, and daily physical activities. The last two
attributes are evaluated as follows. Regarding the issue of
regular diet,  while relying on inputs from the medical
clinicians,  patients  were asked if they took  their meals in
appro ximately same equidistant daily intervals at least three
times a day and also if their meals were  not voluminous. With
these answers being positive , we consider  that a patient is
having  the regular diet. On the other hand , according to the
U.S. Center  for Disease Control and Prevention (CDCP), the
adult person  is considered to be physically active if he/she
conducts the 150 -200 min of physical activities a week.  With
the answers on the family history questions, and i n accordance
to the above given threshold s, the answers of the examinee regarding the  last three questions are mapped into two values: 1
and 0.
The range s of the values  of all attributes  are given in Table I.
TABLE I.  THE RANGES OF THE ATTRIBUTES
Attribute  Value range
From  To
BMI  15 40
Pre meal glucose  3.5 19
Post meal glucose  4.9 22.8
Diastolic blood pressure  55 110
Systolic blood pressure  90 200
Family history of diabetes  No (0)  Yes (1)
Regular diet  No (0)  Yes (1)
Physical activities  No (0)  Yes (1)

After acquiring the given initial data from patients, and
after the extensive laboratory examinations and continuous
monitoring, 80 of patie nts were diagnosed with type 2
Diabetes. The presence of diabetes i n an instance was labeled
with B oolean 1. The rest of patients were not diagnosed with
diabetes , which results in  number of 322 Boolean zeros in the
dataset .
B.  The SVM implementation
The SVM algorithm represents the instances as points in
space, mapped so that separate classes are divided by a clear
gap. The aim is to find the maximum -margin hyper plane – the
one that gives the greatest separation between the classes. The
instances that are closest to the maximum -margin hyper plane
are called support vectors. Support vectors are chosen based on
the po rtion of the dataset that represents the training set.
Support vectors of two classes enable the creation of two
parallel hyper planes. The larger the margin between these two
hyper planes, the better the generalization error of the
classifier.
Training d ata points can be represented in form:
{(X1, Y1), (X2, Y2)…., (Xn, Yn)}

where Xi is a k -dimensional vector and Yi is +1 or -1
denoting the class to which a given point belongs to.
The training data is then divided by a hyper plane of general
form:
𝑊∗𝑋+𝐵=0              (1)
Where W is k -dimensional vector, perpendicular to the hyper
plain and B is scalar. Two parallel hyper planes that belong to
two different classes can be described by equations:

4thMediterranean Conference on Embedded ComputingMECO - 2015Budva, Montenegro

𝑊∗𝑋+𝐵=1           (2)
𝑊∗𝑋+𝐵=−1           (3)
The distance between the hyper planes is 2/|W| so the aim
is to minimize |W|. The semantics behind the
multid imensional formulations  (1)-(3) is given in [12 ] and
further process of minimization of factor 2/|W| is explained in
[13].
C. The N aïve Bayes implementation
The N aïve Bayes classification is based on the probabilistic
Bayes theory. As noted in [11 ], the N aïve Bayes statistical
algorithm is a frequently used method in prediction problem.
The implementation of the statistical modeling is based on
linear function. T heoretically , it usually  means the appliance
of unrealistic assumption that the attributes are equally
important and independent. Th e real life dataset consists  of
attributes that are certainly not equally important or
indep endent, but, a s noted in [12 ] and also shown in this
paper, it leads to a simple scheme that, again, works
surpri singly well in practice.
While keeping in mind the above mentioned independency
assumption, the calculation of the probability tha t a given
record belongs t o class Y=C, can be calculated as the product
of probability that each of value s of the i record’s attributes
belong to class C, i.e.,

𝑃 𝑋 𝑌=𝐶 =  𝑃(𝑞
𝑖=1𝑋𝑖 𝑌=𝐶                                      (4)
The probability that a given value of the attribute belongs
to class 𝑦𝑗, when the dataset contains numerical inputs, can
often generally be calculated by using the Gaussian
distribution function, i.e.,
𝑃 𝑋𝑖=𝑥𝑖 𝑌=𝑦𝑗 =1
 2𝜋𝜎𝑖𝑗𝑒− 𝑥𝑖−𝜇𝑖𝑗 2
2𝜎𝑖𝑗2                               (5)

Finally, the probability that a given record will be classified in
class C, can be formulated with:

𝑃 𝐶 𝑥1,…𝑥𝑛 =𝑃 𝐶 𝑃 𝑥1,…,𝑥𝑛 𝐶
𝑃 𝑥1,…𝑥𝑛                                            (6)
D. The proposed architecture
The system consists of the following elements. Two
machines, namely SVM and Naïve Bayes classifier , build their
classifiers based on the training sets. Afterward, they are ready
to perform the classification of a given record. The output can
belong to one of two classes: class 0 (no diabetes) and class 1
(diabetes). If the outcomes from both classifiers are equal, the
record is classified as belonging to class 0 or class 1. If the
output is different, then the record is considered as still
unclassified (gre y zone).
The SVM algorithm is  implement ed by using
bioinformatics tool  in Matlab  while Naïve Bayes implementation is constructed manually, also in Matlab .
Matlab was  chosen due to its flexibility and ability to work
with various file formats. By using the  Matlab function, the
data are divided into  the training set and the testing set . The
classifier is extracted from the training set while the instances
from the testing set are tested on the derived classifier. The
SVM algorithm uses the polynomial kernel.  In order to
mitigate any bias caused by samples chosen for holdout, the
repeated holdout method  of error estimation is used. In this
direction, the process of classifier performance evaluation is
repeated 100 times with the classifier performance evaluate d
in each of the iteration.
IV. RESULTS  AND DISCUSSIONS
Prior to building the application, analyzing data, and
evaluating the classification performance, sample size is
conducted to analysis in accordance to the methodology
described in [14]. With the given nu mber of population, the
given response distribution of  diabetes in Kosovo, and  the
given confidence level of 98%, the margin of error that occurs
due to the limited number of examinee is expected to be in the
range of 1,62 %.
The developed  Matlab -based application works in two
modes: the assessment mode and the data acquisition mode.
The GUI is given in Fig. 1 .

Fig. 1: Diabetes diagnostic a pplication GUI
The data acquisition  mode enables for the addition of new
records, in order to increas e the sample size and the
classification accuracy. The  assessment mode gives the
prediction of pre -diabetic or diabetic condition on each of the
newly added record.
The prediction on new record is based on the execution of
both SVM and Naïve Bayes classifi cation  on previous
instances with different randomly chosen training sets . If both
classifiers classify the record as being positive or negative, then
we consider the given output of the classifier for a specific
record as having high reliability . Otherwis e, if the outputs  are
different , the patient needs to be further monitored on diabetes.
For the simplicity reasons, by incorporating the
stratification, the data are quasi -randomly split into  two equal
sets - 201 instances for training and 201 instances f or
evaluating the performances . In general, these two sets are only
approximately equal, since  the number s of instances  for


4thMediterranean Conference on Embedded ComputingMECO - 2015Budva, Montenegro

training and testing are  selected automatically by crossvalind
Matlab function. In order to find the average value of the
classifier s’ accuracy , the process of random selection of the
training set and test set  along with the classifier performance
evaluation on each random selection is repeated 100  times .
The results show the mean value of the SVM classifier
performance  - accuracy  of 95, 52 % while for the Naïve Bayes
classifier  the classifier accuracy is 94,  52%. Both values vary
in +/- 1% of classification performance margin during various
iterations.  This also shows for the high stability of classifier.
The average number  of correctly and incorrectly classified
records is calculated  for both classifiers and the results are
given in Table II  along with the structure of the sample
distribution for training and testing the system .
TABLE II.  CLASSIFICATION AND THE AVERAGE ACCURACY
 No. of
record s Train set
/ test set  No. of
correctly
classified  No. of
incorrectly
classified
records  Classifier
performance
(mean value)
SVM  402 201/201  192 9 95.52%
Naïve
Bayes  402 201/201  190 11 94.53 %

As can be noted, and as expected, in terms of the overall
average classification accuracy, SVM over performs the Naïve
Bayes classification, but the difference in classifier
performance is surprisingly small.
The performances are also estimated in terms of other
multi -class classification  measures such as precision and
recall. For a specific class, these two measures are calculated
as follows:

𝑃𝑟𝑒𝑐𝑖𝑠𝑖𝑜𝑛𝑋=𝑇𝑃𝑋
𝑇𝑃𝑋+𝐹𝑃𝑋           (7)

𝑅𝑒𝑐𝑎𝑙𝑙𝑋=𝑇𝑃𝑋
𝑇𝑃𝑋+𝐹𝑁𝑋                         (8)

The notations TP, FP and FN refer to the number of data
classified as true positive, false positive and false negative,
i.e., the data correctly classified in class X, the data that
belong to another class and are incorrectly classified in class
X, and the  data that should have been classified in class X
whereas are classified in another class..
The results regarding the precision and recall of classes
YES and NO are given in Table III.
TABLE III.  PRECISION AND RECALL
 Precision
(Class NO)  Recall
(ClassNO)  Precision
(Class YES)  Recall
(Class YES)
SVM  0.97 0.975  0.892  0.868
Naïve
Bayes  0.981  0.951  0.814  0.921

The joint implementation aims to improve the reliability of
the decision in case when both algorithms give the same outcome, when we consider the outcome to be valid.
Otherwise, when the outcomes are different, we consider the
decision to be unreliable and hence invalid.
After the repeated joint implementation of both algorithms,
the average rate of the valid outcomes turns out to be 94, 77 %
with the expected lower bound of approximately 90, 3 %. This
means that the major number (94, 77%) of new patients that
need to be classified regarding the likeness of diabetic
condition, will be classified the same way by two inde pendent
classifiers. The difference of 5.23 % in the results of
classifiers exists due to their individual weaknesses and can
also be related to the dataset. Among the valid  outcomes, the
derived accuracy of the dec ision is now improved up to 97,6
%, which  significantly over performs both classifiers
individually.
V. CONCLUSIONS AND FUTUR E WORK
The research efforts presented in this paper are focused in
developing and the evaluation of a computer -based support tool
for the diabetes  detection .
The presented approach is based on the joint
implementation of two algorithms in Matlab that have been
executed on the newly acquired dataset with the different
attributes as compared to the previous work in this field . The
algorithms are executed and eval uated independently but the
decision making is based on the joint outcomes from both
algorithms.  The aim of this approach is to make the decision
more reliable.
As shown in the paper, both SVM and naïve Bayes
algorithm have individually shown high overall classifier
performances  of 95 , 52% and 94, 52%, respectively.  The joint
implementation on the same, newly added record leads to one
of the three answers: a) the patient is diagnosed with diabetes
(or pre -diabetic condition), b) the patient is not d iagnosed as
having the mentioned condition, and c) the patient is further
directed to the additional clinical examinations.  If two
algorithms show different results, the answer is classified as
condition c). Otherwise, the accuracy of the answers a) or  b) ,
as shown in the paper, is improve d up to the value of 97,6%.
The presented methodology minimizes the false negative
answers, which is a crucial issue in medical diagnoses.
Finally, the construction approach, the architecture, and the
evaluation of a diab etes classification tool presented in this
paper, should provide an important guideline to further
construction of the similar applications on improving and
helping the decision making process in disease detection.  The
development of a user -friendly and wi dely accessible
application would enable the personal self -screening on
diabetic or pre -diabetic condition which is crucial to the disease
treatment performance.
The future work will focus on further quantitative
evaluation s of the developed tool  regarding the extensive
clinical examinations and results . Also, other methods should
be involved in finding the best fit in the sense of accuracy,
processing time, etc. The influence of cultural -related biases
(such as those related to the nutrition struc ture and habits)
"""

# Run the full pipeline on the sample paper text
final_storyboard, detected_references = process_research_paper_with_timestamps(research_paper)

# Print the combined storyboard built from all chunks
print("\n🎬 **Final 3-Minute Storyboard with 5-Second Timestamps:**\n")
print(final_storyboard)

if detected_references:
    print("\n📌 References Detected:\n")
    # detected_references is a set; sort it so the printed list is
    # deterministic instead of following arbitrary set iteration order.
    print(", ".join(sorted(detected_references)))



🔹 Processing Chunk 1/27...

✅ 3-Minute Storyboard with Timestamps generated successfully.

🎬 **Generated 3-Minute Storyboard with 5-Second Timestamps:**

**Scene 1 (0-5 seconds)**
- **Visual:** Opening shot of a montage of people living with diabetes, showcasing the impact on their lives.
- **Audio:** "Diabetes affects millions worldwide, with many cases going undetected."

**Scene 1 (5-10 seconds)**
- **Visual:** Close-up of a person checking their blood sugar levels.
- **Audio:** "Early detection is crucial for effective management."

**Scene 1 (10-15 seconds)**
- **Visual:** Animated data visualization displaying the prevalence of diabetes globally.
- **Audio:** "Existing methods for detection have limitations in accuracy and timeliness."

**Scene 2 (15-20 seconds)**
- **Visual:** Transition to a laboratory setting, where researchers are developing an intelligent diabetes prediction system.
- **Audio:** "Our system aims to bridge this gap by providing early, accurate detection."

*