# Book Corpus vs BERT

In [1]:
import pandas as pd

# Load the per-entity probability statistics produced for BERT (hard-coded path).
df = pd.read_csv('/content/filtered_entity_probabilities_statistics_BERT.csv')

# Make sure "average_probability" is a float column before any statistics are computed on it.
df = df.astype({'average_probability': float})

# Lookup table: 'Entity' -> 'average_probability'.
entity_api_dict = df.set_index('Entity')['average_probability'].to_dict()

# Dump the whole mapping for inspection.
print(entity_api_dict)


{'-LSB- -RSB- China': 1.3617827290595928e-25, '-LSB- -RSB- This kind of discarded electronic trash': 6.197162184880156e-36, '1983 Contest': 1.2629439204418916e-05, 'A European diplomatic source': 2.77895603210003e-09, 'A Google departure': 3.809429427276641e-09, 'A Nomad of the Time Streams': 8.738412465456384e-16, 'A Philippine politician whose family was allegedly murdered to stop him running for governor': 4.6132410304406105e-33, 'A Russian diver': 4.015039032962122e-05, 'A big attraction on the Thames': 1.4799178726897695e-11, 'A car': 0.0188094021513399, 'A conversion rate': 6.751120030956136e-05, 'A couple months': 0.0002606698240004, 'A court in Morocco on Monday': 1.656337863098758e-13, 'A culture': 0.006398284317665, 'A department spokesman': 2.0017663407611016e-05, 'A faction from within the disbanded NLD': 5.706459806150257e-17, 'A fifth strikeout': 6.320645292030662e-06, 'A fifth synagogue': 1.9724579165226106e-07, 'A friend': 0.0737491580147807, 'A gigantic robot': 0.00016

In [2]:
import pickle

from scipy.stats import spearmanr, pearsonr, kendalltau

# Load the book-corpus phrase counts.
# NOTE(security): pickle.load can execute arbitrary code; only open pickle files from a trusted source.
with open('/content/final_combined.pkl', 'rb') as f:
    phrase_counter = pickle.load(f)

# Plain-dict copy of the loaded counts.
book_corpus_entities = dict(phrase_counter)

# Attach the corpus count of every entity to the DataFrame.
# Assumes 'phrase_counter' is a dict-like whose keys are phrases spelled exactly like the
# 'Entity' column and whose values are occurrence counts -- TODO confirm.
# Entities with no match get a count of 0. The filled Series is assigned back to the column
# instead of calling fillna(..., inplace=True) on df[...]: that chained in-place call acts on
# an intermediate object, raises a FutureWarning, and no longer updates df in pandas 3.0.
df['Book Corpus Count'] = df['Entity'].map(phrase_counter).fillna(0)

# The two variables being correlated: the model's average probability per entity
# and that entity's occurrence count in the book corpus.
api_response_values = df['average_probability']
phrase_count_values = df['Book Corpus Count']

# Rank-based (Spearman, Kendall) and linear (Pearson) correlation; the p-values are discarded.
spearman_corr, _ = spearmanr(api_response_values, phrase_count_values)
pearson_corr, _ = pearsonr(api_response_values, phrase_count_values)
kendall_corr, _ = kendalltau(api_response_values, phrase_count_values)

# Report the three coefficients.
print(f"Spearman correlation: {spearman_corr}")
print(f"Pearson correlation: {pearson_corr}")
print(f"Kendall correlation: {kendall_corr}")

Spearman correlation: 0.5706135671969552
Pearson correlation: 0.23324780128452624
Kendall correlation: 0.46312242751482335


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Book Corpus Count'].fillna(0, inplace=True)


# Book Corpus vs BART

In [3]:
import pandas as pd

# Load the per-entity probability statistics produced for BART (hard-coded path).
df = pd.read_csv('/content/filtered_entity_probabilities_statistics_BART.csv')

# Make sure "average_probability" is a float column before any statistics are computed on it.
df = df.astype({'average_probability': float})

# Lookup table: 'Entity' -> 'average_probability'.
entity_api_dict = df.set_index('Entity')['average_probability'].to_dict()

# Dump the whole mapping for inspection.
print(entity_api_dict)

{'-LSB- -RSB- China': 9.168375650185551e-31, '-LSB- -RSB- This kind of discarded electronic trash': 1.0336012288226478e-49, '1983 Contest': 7.048868120173073e-11, 'A European diplomatic source': 3.678502438874193e-06, 'A Google departure': 2.2006337725224413e-07, 'A Nomad of the Time Streams': 2.3139995411410022e-24, 'A Philippine politician whose family was allegedly murdered to stop him running for governor': 6.89962678240069e-28, 'A Russian diver': 8.666463392720231e-10, 'A big attraction on the Thames': 4.767327455146247e-13, 'A car': 0.0012926603441997, 'A conversion rate': 0.0020895493909016, 'A couple months': 4.06641462722582e-05, 'A court in Morocco on Monday': 2.98197696412306e-07, 'A culture': 0.0076351587218798, 'A department spokesman': 2.9455541751335244e-05, 'A faction from within the disbanded NLD': 4.884685673668085e-18, 'A fifth strikeout': 3.1740234344334996e-10, 'A fifth synagogue': 1.6938633643628996e-08, 'A friend': 0.0321100866691423, 'A gigantic robot': 6.714915

In [4]:
import pickle

from scipy.stats import spearmanr, pearsonr, kendalltau

# Load the book-corpus phrase counts.
# NOTE(security): pickle.load can execute arbitrary code; only open pickle files from a trusted source.
with open('/content/final_combined.pkl', 'rb') as f:
    phrase_counter = pickle.load(f)

# Plain-dict copy of the loaded counts.
book_corpus_entities = dict(phrase_counter)

# Attach the corpus count of every entity to the DataFrame.
# Assumes 'phrase_counter' is a dict-like whose keys are phrases spelled exactly like the
# 'Entity' column and whose values are occurrence counts -- TODO confirm.
# Entities with no match get a count of 0. The filled Series is assigned back to the column
# instead of calling fillna(..., inplace=True) on df[...]: that chained in-place call acts on
# an intermediate object, raises a FutureWarning, and no longer updates df in pandas 3.0.
df['Book Corpus Count'] = df['Entity'].map(phrase_counter).fillna(0)

# The two variables being correlated: the model's average probability per entity
# and that entity's occurrence count in the book corpus.
api_response_values = df['average_probability']
phrase_count_values = df['Book Corpus Count']

# Rank-based (Spearman, Kendall) and linear (Pearson) correlation; the p-values are discarded.
spearman_corr, _ = spearmanr(api_response_values, phrase_count_values)
pearson_corr, _ = pearsonr(api_response_values, phrase_count_values)
kendall_corr, _ = kendalltau(api_response_values, phrase_count_values)

# Report the three coefficients.
print(f"Spearman correlation: {spearman_corr}")
print(f"Pearson correlation: {pearson_corr}")
print(f"Kendall correlation: {kendall_corr}")

Spearman correlation: 0.3591229499066282
Pearson correlation: 0.15778866555663157
Kendall correlation: 0.27960259890709305


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Book Corpus Count'].fillna(0, inplace=True)


# Book Corpus vs LLAMA

In [5]:
import pandas as pd

# Load the per-entity probability statistics produced for LLAMA (hard-coded path).
df = pd.read_csv('/content/filtered_entity_probabilities_statistics_LLAMA.csv')

# Make sure "average_probability" is a float column before any statistics are computed on it.
df = df.astype({'average_probability': float})

# Lookup table: 'Entity' -> 'average_probability'.
entity_api_dict = df.set_index('Entity')['average_probability'].to_dict()

# Dump the whole mapping for inspection.
print(entity_api_dict)

{'-LSB- -RSB- China': 2.3027226861839965e-19, '-LSB- -RSB- This kind of discarded electronic trash': 2.7802449539537922e-34, '1983 Contest': 1.4917642265940342e-09, 'A European diplomatic source': 7.62350994249725e-10, 'A Google departure': 2.8552958287303927e-11, 'A Nomad of the Time Streams': 1.224410406875199e-13, 'A Philippine politician whose family was allegedly murdered to stop him running for governor': 2.248071433729363e-26, 'A Russian diver': 1.644132070851383e-07, 'A big attraction on the Thames': 1.0736811591334095e-13, 'A car': 0.0006694229843207, 'A conversion rate': 5.4962481415693625e-05, 'A couple months': 2.0253051875971206e-07, 'A court in Morocco on Monday': 3.7769207014930856e-14, 'A culture': 3.338098515018747e-05, 'A department spokesman': 2.9256700353851955e-11, 'A faction from within the disbanded NLD': 1.5496098180739036e-15, 'A fifth strikeout': 2.183497591672332e-09, 'A fifth synagogue': 3.810262252620383e-12, 'A friend': 0.0063276128391188, 'A gigantic robo

In [6]:
import pickle

from scipy.stats import spearmanr, pearsonr, kendalltau

# Load the book-corpus phrase counts.
# NOTE(security): pickle.load can execute arbitrary code; only open pickle files from a trusted source.
with open('/content/final_combined.pkl', 'rb') as f:
    phrase_counter = pickle.load(f)

# Plain-dict copy of the loaded counts.
book_corpus_entities = dict(phrase_counter)

# Attach the corpus count of every entity to the DataFrame.
# Assumes 'phrase_counter' is a dict-like whose keys are phrases spelled exactly like the
# 'Entity' column and whose values are occurrence counts -- TODO confirm.
# Entities with no match get a count of 0. The filled Series is assigned back to the column
# instead of calling fillna(..., inplace=True) on df[...]: that chained in-place call acts on
# an intermediate object, raises a FutureWarning, and no longer updates df in pandas 3.0.
df['Book Corpus Count'] = df['Entity'].map(phrase_counter).fillna(0)

# The two variables being correlated: the model's average probability per entity
# and that entity's occurrence count in the book corpus.
api_response_values = df['average_probability']
phrase_count_values = df['Book Corpus Count']

# Rank-based (Spearman, Kendall) and linear (Pearson) correlation; the p-values are discarded.
spearman_corr, _ = spearmanr(api_response_values, phrase_count_values)
pearson_corr, _ = pearsonr(api_response_values, phrase_count_values)
kendall_corr, _ = kendalltau(api_response_values, phrase_count_values)

# Report the three coefficients.
print(f"Spearman correlation: {spearman_corr}")
print(f"Pearson correlation: {pearson_corr}")
print(f"Kendall correlation: {kendall_corr}")

Spearman correlation: 0.44949778835387105
Pearson correlation: -0.002036866589801021
Kendall correlation: 0.36206865099670005


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Book Corpus Count'].fillna(0, inplace=True)


# Book Corpus vs API

In [8]:
import pandas as pd

# Load the per-entity API responses (hard-coded path).
df = pd.read_csv('/content/entity_api_response_STRICT_UPTO_2024.csv')

# Cast the value column to float. NOTE(review): the column is named 'average_probability'
# here as well, although this file holds API response values -- confirm against
# the script that produced the CSV.
df = df.astype({'average_probability': float})

# Lookup table: 'Entity' -> 'average_probability'.
entity_api_dict = df.set_index('Entity')['average_probability'].to_dict()

# Dump the whole mapping for inspection.
print(entity_api_dict)


{'corruption': 337000000.0, 'a man capable of being dangerous': 10.0, 'the beginning of this episode': 4490000.0, 'Pennsylvania, Michigan, Colorado, Ohio and New York': 5.0, 'defects': 346000000.0, 'Hypertensive crisis': 459000.0, 'the above picture': 1940000.0, 'suit': 969000000.0, "Ty Votaw, executive director of the IGF's Olympic Golf Committee, which has coordinated golf's Olympic bid": 0.0, 'the Khmer Rouge': 849000.0, 'four schools': 2690000.0, 'years': 9950000000.0, 'the Cleveland suburb of Kirtland': 54.0, 'This meeting': 24400000.0, 'classical physics': 1420000.0, 'The village': 193000000.0, 'public anger': 771000.0, 'the rise in rates': 279000.0, 'future issues': 5440000.0, 'the spinning rate of silk': 3.0, 'the measure': 105000000.0, 'classic, "This Time I Know It\'s For Real ", under the name Young Divas': 3.0, 'some corporate masseurs': 0.0, 'Caloundra in southern Queensland': 241.0, 'that work': 303000000.0, 'other rich countries': 66600.0, 'An older brother and sister': 

In [9]:
import pickle

from scipy.stats import spearmanr, pearsonr, kendalltau

# Load the book-corpus phrase counts.
# NOTE(security): pickle.load can execute arbitrary code; only open pickle files from a trusted source.
with open('/content/final_combined.pkl', 'rb') as f:
    phrase_counter = pickle.load(f)

# Plain-dict copy of the loaded counts.
book_corpus_entities = dict(phrase_counter)

# Attach the corpus count of every entity to the DataFrame.
# Assumes 'phrase_counter' is a dict-like whose keys are phrases spelled exactly like the
# 'Entity' column and whose values are occurrence counts -- TODO confirm.
# Entities with no match get a count of 0. The filled Series is assigned back to the column
# instead of calling fillna(..., inplace=True) on df[...]: that chained in-place call acts on
# an intermediate object, raises a FutureWarning, and no longer updates df in pandas 3.0.
df['Book Corpus Count'] = df['Entity'].map(phrase_counter).fillna(0)

# The two variables being correlated: the API response value per entity (stored in the
# 'average_probability' column) and that entity's occurrence count in the book corpus.
api_response_values = df['average_probability']
phrase_count_values = df['Book Corpus Count']

# Rank-based (Spearman, Kendall) and linear (Pearson) correlation; the p-values are discarded.
spearman_corr, _ = spearmanr(api_response_values, phrase_count_values)
pearson_corr, _ = pearsonr(api_response_values, phrase_count_values)
kendall_corr, _ = kendalltau(api_response_values, phrase_count_values)

# Report the three coefficients.
print(f"Spearman correlation: {spearman_corr}")
print(f"Pearson correlation: {pearson_corr}")
print(f"Kendall correlation: {kendall_corr}")

Spearman correlation: 0.5833686072719012
Pearson correlation: 0.39864254880730376
Kendall correlation: 0.48840915802659834


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Book Corpus Count'].fillna(0, inplace=True)
