In [1]:
import os
import re
import time
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import Counter, defaultdict
from itertools import chain, product
from joblib import Parallel, delayed
from multiprocessing import cpu_count

import tqdm
import spacy
import gensim

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from langdetect import detect

from sklearn.manifold import TSNE

import embeddings_functions
from itertools import combinations


from nltk.corpus import wordnet as wn

import spacy
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
#!python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /home/jupyter-
[nltk_data]     berl03@vse.cz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jupyter-
[nltk_data]     berl03@vse.cz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jupyter-
[nltk_data]     berl03@vse.cz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jupyter-
[nltk_data]     berl03@vse.cz/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Yelp Dataset preparation
- https://www.kaggle.com/datasets/mexwell/yelp-review-dataset
- Remove interpunctions
- Convert to lower case
- Select just english texts
- Remove stopwords
- Remove unusual words
- Tokenize

In [None]:
train =pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [None]:
df = pd.concat([train,test])
df_proc = df[["text"]].reset_index()
print("dataset was read")

# Function to preprocess text
def preprocess_text(text):
    text = re.sub(r'[\W\d_]', ' ', text)  # Remove non-word characters, digits, and underscores
    text = text.lower()  # Convert text to lowercase
    return text

# Apply preprocessing
df_proc['text'] = df_proc['text'].apply(preprocess_text)

df_proc = df_proc.dropna()
max_length = 30
df_proc = df_proc[df_proc['text'].apply(len) >= max_length] # jinak nešel aplikovat language detection


languages = []
for text in df_proc['text']:
    try:
        language = detect(text)
    except:
        language = 'unknown'
    languages.append(language)
df_proc['language'] = languages

df_proc = df_proc[df_proc['language'] == 'en']

# Tokenize text
df_proc['text_tok'] = df_proc['text'].apply(word_tokenize)

# Remove stop words
stop_words = set(stopwords.words('english'))
df_proc['text_tok'] = df_proc['text_tok'].apply(lambda tokens: [word for word in tokens if word not in stop_words])

# Flatten list of tokens and create a frequency distribution
all_tokens = [word for tokens in df_proc['text_tok'] for word in tokens]
freq_dist = Counter(all_tokens)

# Determine thresholds for what is considered "unusual"
min_freq = 5
max_freq = 0.1 * len(df_proc)

# Filter out unusual words
df_proc['text_tok'] = df_proc['text_tok'].apply(lambda tokens: [word for word in tokens if min_freq <= freq_dist[word] <= max_freq])

df_proc = df_proc.drop_duplicates(["text"])

print(len(df_proc))

In [None]:
# Uložení dataframe do Parquet
df_proc.to_parquet('df_proc.parquet', engine='pyarrow')

In [3]:
df_proc = pd.read_parquet("df_proc.parquet") 

In [4]:
df_proc.head(2)

Unnamed: 0,index,text,language,text_tok
0,0,dr goldberg offers everything i look for in a...,en,"[dr, goldberg, offers, everything, look, gener..."
1,1,unfortunately the frustration of being dr go...,en,"[unfortunately, frustration, dr, goldberg, pat..."


## All unique words

In [5]:
flattened_list = list(chain(*df_proc['text_tok'].tolist()))
print(len(flattened_list))
unique_list = list(set(flattened_list))
print(len(unique_list))

34701335
66288


## MEN dataset

In [6]:
men = pd.read_csv("men.txt", sep="\t")
men["score"] = men["score"]/100
men["word1"] = men["word1"].astype(str)

words_men = list(set(list(men["word1"].values)+list(men["word2"].values)))
print(len(words_men))

words_men = [word for word in words_men if word in unique_list]
print(len(words_men))

751
731


## Choose random words + words from MEN

In [7]:
random.seed(42)
random_words_list = random.sample(unique_list, 3000)

df_random = pd.DataFrame(random_words_list,columns = ["word"])
df_random["is_men"] = 0
df_men = pd.DataFrame(words_men,columns = ["word"])
df_men["is_men"] = 1
df_tot = pd.concat([df_random,df_men])

df_tot.to_csv("random_words_list.csv")

In [8]:
total_words = words_men+random_words_list
word_pairs = list(combinations(total_words, 2))
df_word_pairs = pd.DataFrame(word_pairs, columns=['First', 'Last'])

In [9]:
df_word_pairs

Unnamed: 0,First,Last
0,flood,pod
1,flood,neon
2,flood,chapel
3,flood,vintage
4,flood,red
...,...,...
6958310,anybody,cabane
6958311,anybody,bofa
6958312,unreasonable,cabane
6958313,unreasonable,bofa


## Derive Dataset Characteristics Features

In [10]:
sentences = df_proc["text_tok"].to_list()
total_word_count = df_proc['text'].apply(len).sum()

In [16]:
def build_vocab_and_co_occurrence(window_size, sentences, df_word_pairs, total_word_count):
    
    def calc_co_occurrence_matrix(sentences, window_size):
        frekvence_slov = Counter()
        d = defaultdict(int)
        vocab = set()  
        for text in tqdm(sentences):
            frekvence_slov.update(text)
            for i, token in enumerate(text):
                vocab.add(token)
                next_tokens = text[i+1 : i+1+window_size]
                for next_token in next_tokens:
                    key = tuple(sorted([next_token, token]))
                    d[key] += 1             
        vocab = sorted(vocab)
        vocab_index = {word: i for i, word in enumerate(vocab)}
        co_occurrence_matrix = np.zeros((len(vocab), len(vocab)), dtype=int)

        for (token1, token2), value in tqdm(d.items()):
            index1 = vocab_index[token1]
            index2 = vocab_index[token2]
            co_occurrence_matrix[index1, index2] = value
            co_occurrence_matrix[index2, index1] = value
        return co_occurrence_matrix, vocab_index, frekvence_slov
        
    def calc_columns(df_word_pairs, vocab_index, co_occurrence_matrix, frekvence_slov, window_size, total_word_count):
        frequencies = []
        frequencies_first = []
        frequencies_last = []
        correlations = []
        
        valid_indices = []
        
        for i, row in tqdm(df_word_pairs.iterrows(), total=df_word_pairs.shape[0]):
            word1, word2 = row["First"], row["Last"]
            index_w1, index_w2 = vocab_index.get(word1), vocab_index.get(word2)
            
            if index_w1 is not None and index_w2 is not None:
                valid_indices.append(i)
                frequency = co_occurrence_matrix[index_w1, index_w2]
                frequencies.append(frequency)
                frequencies_first.append(frekvence_slov.get(word1, 0))
                frequencies_last.append(frekvence_slov.get(word2, 0))
                correlation = np.corrcoef(co_occurrence_matrix[index_w1], co_occurrence_matrix[index_w2])[0, 1]
                correlations.append(correlation)
            else:
                # Skip this pair if either word is not in the vocabulary
                continue
            
        # Now use valid_indices to filter df_word_pairs and assign values
        df_word_pairs = df_word_pairs.iloc[valid_indices].copy()

        if 'frequency_w1' not in df_word_pairs.columns: # only once - not depended on window
            df_word_pairs["no_words_corpus"] = total_word_count
            df_word_pairs['frequency_w1'] = frequencies_first
            df_word_pairs['frequency_w2'] = frequencies_last
            df_word_pairs["tf_rel_word1"] = df_word_pairs["frequency_w1"] / df_word_pairs["no_words_corpus"] * 100
            df_word_pairs["tf_rel_word2"] = df_word_pairs["frequency_w2"] / df_word_pairs["no_words_corpus"] * 100
            df_word_pairs['word1_length'] = df_word_pairs['First'].apply(len)
            df_word_pairs['word2_length'] = df_word_pairs['Last'].apply(len)

        if "fraq_w1_w2" not in df_word_pairs.columns:
            df_word_pairs["fraq_w1_w2"] = df_word_pairs["frequency_w1"] / df_word_pairs["frequency_w2"]
            
        df_word_pairs['frequency_of_cooc_w_'+str(window_size)] = frequencies
        df_word_pairs["corr_w_" + str(window_size)] = correlations
        df_word_pairs["freq_cooc_mult_fraq_w1_w2_w_"+str(window_size)] = df_word_pairs['frequency_of_cooc_w_'+str(window_size)] * df_word_pairs["fraq_w1_w2"]

        return df_word_pairs

    co_occurrence_matrix, vocab_index, frekvence_slov = calc_co_occurrence_matrix(sentences, window_size)
    df_word_pairs = calc_columns(df_word_pairs, vocab_index, co_occurrence_matrix, frekvence_slov, window_size, total_word_count)
    return df_word_pairs

In [17]:
for window_size in [3,5,8]:
    df_word_pairs = build_vocab_and_co_occurrence(window_size, sentences,df_word_pairs,total_word_count)

100%|██████████| 691190/691190 [01:23<00:00, 8310.13it/s] 
100%|██████████| 21502623/21502623 [00:15<00:00, 1402932.79it/s]
100%|██████████| 6958315/6958315 [27:00<00:00, 4293.21it/s]
100%|██████████| 691190/691190 [01:53<00:00, 6104.51it/s]
100%|██████████| 30080963/30080963 [00:20<00:00, 1480443.11it/s]
100%|██████████| 6958315/6958315 [27:20<00:00, 4242.75it/s]
100%|██████████| 691190/691190 [02:43<00:00, 4218.68it/s]
100%|██████████| 39881412/39881412 [00:24<00:00, 1596818.10it/s]
100%|██████████| 6958315/6958315 [27:35<00:00, 4202.48it/s]


In [18]:
df_word_pairs

Unnamed: 0,First,Last,no_words_corpus,frequency_w1,frequency_w2,tf_rel_word1,tf_rel_word2,word1_length,word2_length,fraq_w1_w2,frequency_of_cooc_w_3,corr_w_3,freq_cooc_mult_fraq_w1_w2_w_3,frequency_of_cooc_w_5,corr_w_5,freq_cooc_mult_fraq_w1_w2_w_5,frequency_of_cooc_w_8,corr_w_8,freq_cooc_mult_fraq_w1_w2_w_8
0,flood,pod,509377579,228,305,0.000045,0.000060,5,3,0.747541,0,0.279081,0.0,0,0.332584,0.0,0,0.393259,0.000000
1,flood,neon,509377579,228,1034,0.000045,0.000203,5,4,0.220503,0,0.216630,0.0,0,0.323404,0.0,0,0.417006,0.000000
2,flood,chapel,509377579,228,624,0.000045,0.000123,5,6,0.365385,0,0.142466,0.0,0,0.195536,0.0,0,0.262472,0.000000
3,flood,vintage,509377579,228,1784,0.000045,0.000350,5,7,0.127803,0,0.261986,0.0,0,0.347716,0.0,0,0.405814,0.000000
4,flood,red,509377579,228,26879,0.000045,0.005277,5,3,0.008482,0,0.181231,0.0,0,0.283348,0.0,1,0.380737,0.008482
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6958310,anybody,cabane,509377579,1435,12,0.000282,0.000002,7,6,119.583333,0,0.058877,0.0,0,0.084616,0.0,0,0.144840,0.000000
6958311,anybody,bofa,509377579,1435,70,0.000282,0.000014,7,4,20.500000,0,0.174527,0.0,0,0.220021,0.0,0,0.307855,0.000000
6958312,unreasonable,cabane,509377579,525,12,0.000103,0.000002,12,6,43.750000,0,0.052785,0.0,0,0.088481,0.0,0,0.135014,0.000000
6958313,unreasonable,bofa,509377579,525,70,0.000103,0.000014,12,4,7.500000,0,0.208968,0.0,0,0.252200,0.0,0,0.307578,0.000000


In [19]:
df_word_pairs.to_csv("final_df_datasets_features.csv")

In [7]:
df_word_pairs=pd.read_csv("final_df_datasets_features.csv")

## Lexical properties

In [8]:
final_res = df_word_pairs.copy()

In [9]:
final_res["first_second"] = final_res["First"] + "_" + final_res["Last"]

In [10]:
men["first_second"] = men["word1"] + "_" + men["word2"]
f1 = men.merge(final_res, on = ["first_second"], how = "inner")
men["first_second"] = men["word2"] + "_" + men["word1"]
f2 = men.merge(final_res, on = ["first_second"], how = "inner")
fin = pd.concat([f1,f2])

In [11]:
fin

Unnamed: 0.1,word1,word2,score,first_second,Unnamed: 0,First,Last,no_words_corpus,frequency_w1,frequency_w2,...,fraq_w1_w2,frequency_of_cooc_w_3,corr_w_3,freq_cooc_mult_fraq_w1_w2_w_3,frequency_of_cooc_w_5,corr_w_5,freq_cooc_mult_fraq_w1_w2_w_5,frequency_of_cooc_w_8,corr_w_8,freq_cooc_mult_fraq_w1_w2_w_8
0,river,water,0.49,river_water,1639782,river,water,509377579,1885,42608,...,0.044241,83,0.189442,3.671963,136,0.295877,6.016710,207,0.399837,9.157787
1,rain,storm,0.49,rain_storm,1122299,rain,storm,509377579,1675,666,...,2.515015,35,0.494082,88.025526,41,0.622359,103.115616,47,0.734905,118.205706
2,cat,kittens,0.49,cat_kittens,1348876,cat,kittens,509377579,2711,94,...,28.840426,6,0.423349,173.042553,11,0.508188,317.244681,15,0.568514,432.606383
3,cat,feline,0.48,cat_feline,1348828,cat,feline,509377579,2711,46,...,58.934783,2,0.306128,117.869565,7,0.440324,412.543478,10,0.524349,589.347826
4,beach,sand,0.48,beach_sand,1475391,beach,sand,509377579,2996,1247,...,2.402566,56,0.403957,134.543705,79,0.551711,189.802727,98,0.662852,235.451484
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1486,cheetah,phone,0.03,phone_cheetah,1649761,phone,cheetah,509377579,24611,37,...,665.162162,0,0.135797,0.000000,0,0.181042,0.000000,0,0.221936,0.000000
1487,jellyfish,rally,0.02,rally_jellyfish,2284209,rally,jellyfish,509377579,123,132,...,0.931818,0,0.137634,0.000000,0,0.176777,0.000000,0,0.244290,0.000000
1488,military,tomato,0.02,tomato_military,762213,tomato,military,509377579,11952,1046,...,11.426386,0,0.060476,0.000000,0,0.119615,0.000000,0,0.195487,0.000000
1489,festival,whiskers,0.01,whiskers_festival,1139213,whiskers,festival,509377579,18,1734,...,0.010381,0,0.113956,0.000000,0,0.134689,0.000000,0,0.170930,0.000000


In [19]:
def get_lexical_properties(word):
    synonyms = set()
    antonyms = set()
    hyponyms = set()
    hypernyms = set()
    derivations = set()
    synsets = wn.synsets(word)
    
    for syn in synsets:
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
            if lemma.antonyms():
                antonyms.add(lemma.antonyms()[0].name())
            derivations.update(deriv.name() for deriv in lemma.derivationally_related_forms())
        
        hyponyms.update(lemma.name() for hyponym in syn.hyponyms() for lemma in hyponym.lemmas())
        hypernyms.update(lemma.name() for hypernym in syn.hypernyms() for lemma in hypernym.lemmas())
    
    return synonyms, antonyms, hyponyms, hypernyms, derivations, len(synsets)

def get_pos_tags(word):
    doc = nlp(word)
    return [token.pos_ for token in doc]

def get_dependencies(word):
    doc = nlp(word)
    return [(token.text, token.dep_, token.head.text) for token in doc]

def get_dependency_types(word):
    doc = nlp(word)
    return [token.dep_ for token in doc]

def get_head_words(word):
    doc = nlp(word)
    return [token.head.text for token in doc]

def get_dependency_count(word):
    doc = nlp(word)
    return len([token.dep_ for token in doc])

def get_lemma(word):
    doc = nlp(word)
    return [token.lemma_ for token in doc][0]  

def get_prefixes(word, n=3):
    return [word[:i] for i in range(1, n+1)]

def get_suffixes(word, n=3):
    return [word[-i:] for i in range(1, n+1)]

def process_single_pair(row):
    first_word = row['word1']
    second_word = row['word2']
    
    syn1, ant1, hypo1, hyper1, deriv1, poly1 = get_lexical_properties(first_word)
    syn2, ant2, hypo2, hyper2, deriv2, poly2 = get_lexical_properties(second_word)


    syn_match = len(syn1.intersection(syn2))
    ant_match = len(ant1.intersection(ant2))

    hypo_match = len(hypo1.intersection(hypo2))
    hyper_match = len(hyper1.intersection(hyper2))


    deriv_match = len(deriv1.intersection(deriv2))

    # POS tags
    pos1 = get_pos_tags(first_word)
    pos2 = get_pos_tags(second_word)
    common_pos_tags = len(set(pos1).intersection(set(pos2)))
    same_pos_tags = pos1 == pos2

    # Dependencies
    dep1 = get_dependencies(first_word)
    dep2 = get_dependencies(second_word)
    dep_types1 = get_dependency_types(first_word)
    dep_types2 = get_dependency_types(second_word)
    head_words1 = get_head_words(first_word)
    head_words2 = get_head_words(second_word)
    dep_count1 = get_dependency_count(first_word)
    dep_count2 = get_dependency_count(second_word)
    common_dep_types = len(set(dep_types1).intersection(set(dep_types2)))
    common_head_words = len(set(head_words1).intersection(set(head_words2)))
    dep_count_diff = abs(dep_count1 - dep_count2)

    # Lemmas
    lemma1 = get_lemma(first_word)
    lemma2 = get_lemma(second_word)
    common_lemma = lemma1 == lemma2

    # Prefixes and Suffixes
    prefixes1 = set(get_prefixes(first_word))
    prefixes2 = set(get_prefixes(second_word))
    suffixes1 = set(get_suffixes(first_word))
    suffixes2 = set(get_suffixes(second_word))
    common_prefixes = len(prefixes1.intersection(prefixes2))
    common_suffixes = len(suffixes1.intersection(suffixes2))


    result = {
        'syn_match': syn_match,
        'syn_match_perc': syn_match / max(len(syn1), len(syn2)) if max(len(syn1), len(syn2)) != 0 else 0,
        'syn1_count': len(syn1),
        'syn2_count': len(syn2),
        'are_synonyms': second_word in syn1,
        'fraq_syn1_syn2': len(syn1) / len(syn2) if len(syn2) != 0 else 0,
        'syn_match_mult_fraq_syn': syn_match * (len(syn1) / len(syn2) if len(syn2) != 0 else 0),

        'ant_match': ant_match,
        'ant_match_perc': ant_match / max(len(ant1), len(ant2)) if max(len(ant1), len(ant2)) != 0 else 0,
        'ant1_count': len(ant1),
        'ant2_count': len(ant2),
        'are_antonyms': second_word in ant1,
        'fraq_ant1_ant2': len(ant1) / len(ant2) if len(ant2) != 0 else 0,
        'ant_match_mult_fraq_ant': ant_match * (len(ant1) / len(ant2) if len(ant2) != 0 else 0),

        'hypo_match': hypo_match,
        'hypo_match_perc': hypo_match / max(len(hypo1), len(hypo2)) if max(len(hypo1), len(hypo2)) != 0 else 0,
        'hypo1_count': len(hypo1),
        'hypo2_count': len(hypo2),
        'is_hyponym': bool(hypo1.intersection({second_word})),
        'fraq_hypo1_hypo2': len(hypo1) / len(hypo2) if len(hypo2) != 0 else 0,
        'hypo_match_mult_fraq_hypo': hypo_match * (len(hypo1) / len(hypo2) if len(hypo2) != 0 else 0),

        'hyper_match': hyper_match,
        'hyper_match_perc': hyper_match / max(len(hyper1), len(hyper2)) if max(len(hyper1), len(hyper2)) != 0 else 0,
        'hyper1_count': len(hyper1),
        'hyper2_count': len(hyper2),
        'is_hypernym': bool(hyper1.intersection({second_word})),
        'fraq_hyper1_hyper2': len(hyper1) / len(hyper2) if len(hyper2) != 0 else 0,
        'hyper_match_mult_fraq_hyper': hyper_match * (len(hyper1) / len(hyper2) if len(hyper2) != 0 else 0),

        'polysemy1': poly1, 
        'polysemy2': poly2, 

        'are_homonyms': any(s1.name().split('.')[0] == s2.name().split('.')[0] for s1 in wn.synsets(first_word) for s2 in wn.synsets(second_word)),
        'common_homonyms': len(set(s1.name().split('.')[0] for s1 in wn.synsets(first_word)).intersection(set(s2.name().split('.')[0] for s2 in wordnet.synsets(second_word)))),
        'common_homonyms_perc': len(set(s1.name().split('.')[0] for s1 in wn.synsets(first_word)).intersection(set(s2.name().split('.')[0] for s2 in wn.synsets(second_word)))) / max(len(wordnet.synsets(first_word)), len(wordnet.synsets(second_word))) if max(len(wordnet.synsets(first_word)), len(wordnet.synsets(second_word))) != 0 else 0,

        'first_word_lemma': lemma1,
        'second_word_lemma': lemma2,
        'common_lemma': common_lemma,
        'common_prefixes': common_prefixes,
        'common_suffixes': common_suffixes,
        'first_word_derivations': deriv1,
        'second_word_derivations': deriv2,
        'are_derivationally_related': second_word in deriv1 or first_word in deriv2,

        'word1_pos_tags': pos1,
        'word2_pos_tags': pos2,
        'common_pos_tags': common_pos_tags,
        'same_pos_tags': same_pos_tags,

        'first_word_dependencies': dep1,
        'second_word_dependencies': dep2,
        'first_word_dep_types': dep_types1,
        'second_word_dep_types': dep_types2,
        'first_word_head_words': head_words1,
        'second_word_head_words': head_words2,
        'first_word_dep_count': dep_count1,
        'second_word_dep_count': dep_count2,
        'common_dep_types': common_dep_types,
        'common_head_words': common_head_words,
        'dep_count_diff': dep_count_diff
    }
    
    return result

def process_lexical_relationships(df):
    results = []
    
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_single_pair, row) for _, row in df.iterrows()]
        
        for future in tqdm(as_completed(futures), total=len(futures)):
            results.append(future.result())

    result_df = pd.DataFrame(results)
    return pd.concat([df.reset_index(drop=True), result_df], axis=1)

In [20]:
fin

Unnamed: 0.1,word1,word2,score,first_second,Unnamed: 0,First,Last,no_words_corpus,frequency_w1,frequency_w2,...,fraq_w1_w2,frequency_of_cooc_w_3,corr_w_3,freq_cooc_mult_fraq_w1_w2_w_3,frequency_of_cooc_w_5,corr_w_5,freq_cooc_mult_fraq_w1_w2_w_5,frequency_of_cooc_w_8,corr_w_8,freq_cooc_mult_fraq_w1_w2_w_8
0,river,water,0.49,river_water,1639782,river,water,509377579,1885,42608,...,0.044241,83,0.189442,3.671963,136,0.295877,6.016710,207,0.399837,9.157787
1,rain,storm,0.49,rain_storm,1122299,rain,storm,509377579,1675,666,...,2.515015,35,0.494082,88.025526,41,0.622359,103.115616,47,0.734905,118.205706
2,cat,kittens,0.49,cat_kittens,1348876,cat,kittens,509377579,2711,94,...,28.840426,6,0.423349,173.042553,11,0.508188,317.244681,15,0.568514,432.606383
3,cat,feline,0.48,cat_feline,1348828,cat,feline,509377579,2711,46,...,58.934783,2,0.306128,117.869565,7,0.440324,412.543478,10,0.524349,589.347826
4,beach,sand,0.48,beach_sand,1475391,beach,sand,509377579,2996,1247,...,2.402566,56,0.403957,134.543705,79,0.551711,189.802727,98,0.662852,235.451484
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1486,cheetah,phone,0.03,phone_cheetah,1649761,phone,cheetah,509377579,24611,37,...,665.162162,0,0.135797,0.000000,0,0.181042,0.000000,0,0.221936,0.000000
1487,jellyfish,rally,0.02,rally_jellyfish,2284209,rally,jellyfish,509377579,123,132,...,0.931818,0,0.137634,0.000000,0,0.176777,0.000000,0,0.244290,0.000000
1488,military,tomato,0.02,tomato_military,762213,tomato,military,509377579,11952,1046,...,11.426386,0,0.060476,0.000000,0,0.119615,0.000000,0,0.195487,0.000000
1489,festival,whiskers,0.01,whiskers_festival,1139213,whiskers,festival,509377579,18,1734,...,0.010381,0,0.113956,0.000000,0,0.134689,0.000000,0,0.170930,0.000000


In [21]:
def process_chunk(df_chunk, chunk_number):
    res = process_lexical_relationships(df_chunk)
    res.to_csv(f"features_lexical_new_chunk_{chunk_number}.csv", index=False)
    return res

chunk_size = int(len(fin) * 1)

for i in range(10):
    start_idx = i * chunk_size
    end_idx = (i + 1) * chunk_size if (i + 1) * chunk_size <= len(fin) else len(fin)
    df_chunk = fin.iloc[start_idx:end_idx]
    process_chunk(df_chunk, i)

100%|██████████| 2976/2976 [02:51<00:00, 17.32it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
