In [3]:

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow_hub as hub
from sentence_transformers import SentenceTransformer, util


In [8]:
# Universal Sentence Encoder'ı yükle
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
use_model = hub.load(module_url)

# Sentence-Transformers modellerini yükle
roberta_model = SentenceTransformer('stsb-roberta-large')
bert_model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')


In [11]:

# Process for Question 1
df_q1 = pd.read_excel('removed_stopwords.xlsx', sheet_name='Q1')

# Assuming answers are in the third column
answers_q1 = df_q1.iloc[:, 2].fillna('').tolist()

# Define your query for this question here
query_q1 = 'Different physical features of compounds with the same chemical composition are due to differences in their molecular arrangement or structure, which affect intermolecular forces and crystal structures.'

# USE ile vektörleri hesapla
use_vectors = use_model(answers_q1 + [query_q1])
use_query_vector = use_vectors[-1]
use_cosine_similarities = np.inner(use_vectors[:-1], use_query_vector)

# ST modeli ile vektörleri hesapla
roberta_vectors = roberta_model.encode(answers_q1 + [query_q1], convert_to_tensor=True)
roberta_query_vector = roberta_vectors[-1]
roberta_cosine_similarities = util.pytorch_cos_sim(roberta_vectors[:-1], roberta_query_vector)

# ST modeli 2 ile vektörleri hesapla
bert_vectors = bert_model.encode(answers_q1 + [query_q1], convert_to_tensor=True)
bert_query_vector = bert_vectors[-1]
bert_cosine_similarities = util.pytorch_cos_sim(bert_vectors[:-1], bert_query_vector)

# Notları hesapla ve DataFrame'e ekle
df_q1['USE Score'] = np.round(use_cosine_similarities * 20).astype(int)  # USE skoru, tamsayı olarak
df_q1['Roberta Score'] = np.round(roberta_cosine_similarities.numpy().flatten() * 20).astype(int)  # ST skoru, tamsayı olarak
df_q1['Bert Score'] = np.round(bert_cosine_similarities.numpy().flatten() * 20).astype(int)  # ST2 skoru, tamsayı olarak


# İlk birkaç satırı göster
df_q1.head()


Unnamed: 0,Email,Column1,Same chemical composition but different physical features of compounds are due to differences in what ?,Unnamed: 3,Unnamed: 4,Unnamed: 5,USE Score,Roberta Score,Bert Score
0,20160807006,:::,nuclears theory 3D structure cube added remove...,,20160807006,20,6,9,10
1,20160807009,:::,count elements . Mostly carbon dioxide .,,20160807009,0,4,6,7
2,20170808017,:::,Allotrops diamond graphit chemical composition...,,20170808017,20,11,11,12
3,20170808045,:::,name izonom isonome caused difference connecti...,,20170808045,12,7,11,12
4,20190808008,:::,elements energy pyhsical features different re...,,20190808008,0,8,10,12


In [12]:

# Process for Question 2
df_q2 = pd.read_excel('removed_stopwords.xlsx', sheet_name='Q2')

# Assuming answers are in the third column
answers_q2 = df_q2.iloc[:, 2].fillna('').tolist()

# Define your query for this question here
query_q2 = 'Sensitive and Accurate measurements can be made with Gases in Reaction. Gases are also easy to capture and observe. You could get numbers using the PV=nRT Equation.'

# USE ile vektörleri hesapla
use_vectors2 = use_model(answers_q2 + [query_q2])
use_query_vector2 = use_vectors2[-1]
use_cosine_similarities2 = np.inner(use_vectors2[:-1], use_query_vector2)

# ST modeli ile vektörleri hesapla
roberta_vectors2 = roberta_model.encode(answers_q2 + [query_q2], convert_to_tensor=True)
roberta_query_vector2 = roberta_vectors2[-1]
roberta_cosine_similarities2 = util.pytorch_cos_sim(roberta_vectors2[:-1], roberta_query_vector2)

# ST modeli 2 ile vektörleri hesapla
bert_vectors2 = bert_model.encode(answers_q2 + [query_q2], convert_to_tensor=True)
bert_query_vector2 = bert_vectors2[-1]
bert_cosine_similarities2 = util.pytorch_cos_sim(bert_vectors2[:-1], bert_query_vector2)

# Notları hesapla ve DataFrame'e ekle
df_q2['USE Score'] = np.round(use_cosine_similarities2 * 20).astype(int)  # USE skoru, tamsayı olarak
df_q2['Roberta Score'] = np.round(roberta_cosine_similarities2.numpy().flatten() * 20).astype(int)  # ST skoru, tamsayı olarak
df_q2['Bert Score'] = np.round(bert_cosine_similarities2.numpy().flatten() * 20).astype(int)  # ST2 skoru, tamsayı olarak


# İlk birkaç satırı göster
df_q2.head()


Unnamed: 0,Email,Column1,Why scientist choose to study gases for Chemistry ?,Unnamed: 3,Unnamed: 4,Unnamed: 5,USE Score,Roberta Score,Bert Score
0,20160807006,:::,", volume , weight , size calculasion air easy",,20160807006,20,5,9,13
1,20160807009,:::,allowed measurement gas Analytical Balance . W...,,20160807009,10,9,10,13
2,20170808017,:::,Observing gases easy others . Also barometer t...,,20170808017,20,12,14,17
3,20170808045,:::,easly working gases . expand change volume . v...,,20170808045,10,6,9,10
4,20190808008,:::,easy calculate everything time gas important,,20190808008,20,7,13,13


In [13]:

# Process for Question 3
df_q3 = pd.read_excel('removed_stopwords.xlsx', sheet_name='Q3')

# Assuming answers are in the third column
answers_q3 = df_q3.iloc[:, 2].fillna('').tolist()

# Define your query for this question here
query_q3 = 'Morgan chose the common fruit fly, Drosophila melanogaster, as a model organism to study genetics because fruit flies have a short life cycle, reproduce quickly and in large numbers, and have easily observable physical traits that can be used to study genetic inheritance. His motivation was to understand the basic principles of inheritance and the role of genes in determining physical traits, which he believed could help explain the mechanisms of evolution and human genetics.'

# USE ile vektörleri hesapla
use_vectors3 = use_model(answers_q3 + [query_q3])
use_query_vector3 = use_vectors3[-1]
use_cosine_similarities3 = np.inner(use_vectors3[:-1], use_query_vector3)

# ST modeli ile vektörleri hesapla
roberta_vectors3 = roberta_model.encode(answers_q3 + [query_q3], convert_to_tensor=True)
roberta_query_vector3 = roberta_vectors3[-1]
roberta_cosine_similarities3 = util.pytorch_cos_sim(roberta_vectors3[:-1], roberta_query_vector3)

# ST modeli 2 ile vektörleri hesapla
bert_vectors3 = bert_model.encode(answers_q3 + [query_q3], convert_to_tensor=True)
bert_query_vector3 = bert_vectors3[-1]
bert_cosine_similarities3 = util.pytorch_cos_sim(bert_vectors3[:-1], bert_query_vector3)

# Notları hesapla ve DataFrame'e ekle
df_q3['USE Score'] = np.round(use_cosine_similarities3 * 20).astype(int)  # USE skoru, tamsayı olarak
df_q3['Roberta Score'] = np.round(roberta_cosine_similarities3.numpy().flatten() * 20).astype(int)  # ST skoru, tamsayı olarak
df_q3['Bert Score'] = np.round(bert_cosine_similarities3.numpy().flatten() * 20).astype(int)  # ST2 skoru, tamsayı olarak


# İlk birkaç satırı göster
df_q3.head()


Unnamed: 0,Email,Column1,Morgan Studies Common Fruit Fly to study genetics. What was the motivation for his work and why did he choose Fruit Flies,Unnamed: 3,Unnamed: 4,Unnamed: 5,USE Score,Roberta Score,Bert Score
0,20160807006,:::,division easly atoms control atoms,,20160807006,0,3,7,5
1,20160807009,:::,Fruit Fly nearly genetics like humans . simila...,,20160807009,6,13,14,15
2,20170808017,:::,Morgan choose fruit flies short life cycle . g...,,20170808017,6,8,12,11
3,20170808045,:::,morgan choose fruit fly lifetime 30 days . mak...,,20170808045,0,7,8,10
4,20190808008,:::,want cahnge something fruit fly dont take pole...,,20190808008,0,9,11,11


In [14]:

# Process for Question 4
df_q4 = pd.read_excel('removed_stopwords.xlsx', sheet_name='Q4')

# Assuming answers are in the third column
answers_q4 = df_q4.iloc[:, 2].fillna('').tolist()

# Define your query for this question here
query_q4 = 'Biochemistry isolates the protein of the functional organ and studies the chemical composition of the puree. Genetics study the population and observe how traits are being transmitted from parents to offspring through genes. Molecular Biology isolates the gene (DNA) and arrives at the protein form for biochemistry.'

# USE ile vektörleri hesapla
use_vectors4 = use_model(answers_q4 + [query_q4])
use_query_vector4 = use_vectors4[-1]
use_cosine_similarities4 = np.inner(use_vectors4[:-1], use_query_vector4)

# ST modeli ile vektörleri hesapla
roberta_vectors4 = roberta_model.encode(answers_q4 + [query_q4], convert_to_tensor=True)
roberta_query_vector4 = roberta_vectors4[-1]
roberta_cosine_similarities4 = util.pytorch_cos_sim(roberta_vectors4[:-1], roberta_query_vector4)

# ST modeli 2 ile vektörleri hesapla
bert_vectors4 = bert_model.encode(answers_q4 + [query_q4], convert_to_tensor=True)
bert_query_vector4 = bert_vectors4[-1]
bert_cosine_similarities4 = util.pytorch_cos_sim(bert_vectors4[:-1], bert_query_vector4)

# Notları hesapla ve DataFrame'e ekle
df_q4['USE Score'] = np.round(use_cosine_similarities4 * 20).astype(int)  # USE skoru, tamsayı olarak
df_q4['Roberta Score'] = np.round(roberta_cosine_similarities4.numpy().flatten() * 20).astype(int)  # ST skoru, tamsayı olarak
df_q4['Bert Score'] = np.round(bert_cosine_similarities4.numpy().flatten() * 20).astype(int)  # ST2 skoru, tamsayı olarak


# İlk birkaç satırı göster
df_q4.head()


Unnamed: 0,Email,Column1,"Explain the relationship between biochemistry, molecular biology and genetics",Unnamed: 3,Unnamed: 4,Unnamed: 5,USE Score,Roberta Score,Bert Score
0,20160807006,:::,theoriticl thinking less science vitalist orga...,,20160807006,0,9,9,11
1,20160807009,:::,important relationship interested biology . Us...,,20160807009,0,12,10,15
2,20170808017,:::,"terms ( chemistry , biochemistry , molecular b...",,20170808017,5,10,11,13
3,20170808045,:::,biochemistry studies dna part . genetics studi...,,20170808045,10,12,12,16
4,20190808008,:::,change something without breaking genetic code...,,20190808008,10,11,9,13


In [15]:

# Process for Question 5
df_q5 = pd.read_excel('removed_stopwords.xlsx', sheet_name='Q5')

# Assuming answers are in the third column
answers_q5 = df_q5.iloc[:, 2].fillna('').tolist()

# Define your query for this question here
query_q5 = 'According to Adam Smith, wealth increases through the division of labor, specialization, and free trade.'

# USE ile vektörleri hesapla
use_vectors5 = use_model(answers_q5 + [query_q5])
use_query_vector5 = use_vectors5[-1]
use_cosine_similarities5 = np.inner(use_vectors5[:-1], use_query_vector5)

# ST modeli ile vektörleri hesapla
roberta_vectors5 = roberta_model.encode(answers_q5 + [query_q5], convert_to_tensor=True)
roberta_query_vector5 = roberta_vectors5[-1]
roberta_cosine_similarities5 = util.pytorch_cos_sim(roberta_vectors5[:-1], roberta_query_vector5)

# ST modeli 2 ile vektörleri hesapla
bert_vectors5 = bert_model.encode(answers_q5 + [query_q5], convert_to_tensor=True)
bert_query_vector5 = bert_vectors5[-1]
bert_cosine_similarities5 = util.pytorch_cos_sim(bert_vectors5[:-1], bert_query_vector5)

# Notları hesapla ve DataFrame'e ekle
df_q5['USE Score'] = np.round(use_cosine_similarities5 * 20).astype(int)  # USE skoru, tamsayı olarak
df_q5['Roberta Score'] = np.round(roberta_cosine_similarities5.numpy().flatten() * 20).astype(int)  # ST skoru, tamsayı olarak
df_q5['Bert Score'] = np.round(bert_cosine_similarities5.numpy().flatten() * 20).astype(int)  # ST2 skoru, tamsayı olarak


# İlk birkaç satırı göster
df_q5.head()


Unnamed: 0,Email,Column1,How do wealth increases according to Smith ?,Unnamed: 3,Unnamed: 4,Unnamed: 5,USE Score,Roberta Score,Bert Score
0,20160807006,:::,"wealth increases division labory , work , job ...",,20160807006,20,8,13,17
1,20160807009,:::,Adam Smith believe wealth enhanced division la...,,20160807009,20,15,17,18
2,20170808017,:::,According Smith division labors key increasing...,,20170808017,20,13,16,16
3,20170808045,:::,according smith wealth icreases effecting temp...,,20170808045,0,7,11,12
4,20190808008,:::,posion gas,,20190808008,0,-1,2,6


In [16]:

# Save the results to a new Excel file
with pd.ExcelWriter('graded_answers.xlsx') as writer:
    for i in range(1, 6):
        df = globals()[f'df_q{i}']
        df.to_excel(writer, sheet_name=f'Q{i}', index=False)
