# FastText Embedding Creation

In [12]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import re
import string
from gensim.models import FastText

# Scarica risorse necessarie di NLTK
nltk.download('punkt')

# Carica il dataset
df = pd.read_csv('../../../data/New dataset/LSTM/no_preprocessing/train_tweets_LSTM_no_new.csv')

# Funzione di preprocessing
def preprocess(text):
    tokens = word_tokenize(text)  # Tokenizza il testo
    return tokens

# Applica il preprocessing
df['tokenized_tweet'] = df['tweet_text'].apply(preprocess)

# Crea una lista di liste di token
tokenized_sentences = df['tokenized_tweet'].tolist()

# Inizializza il modello FastText
fast_model = FastText(vector_size=100, window=10, min_count=5, sg=1, negative=10)

# Costruisci il vocabolario
fast_model.build_vocab(corpus_iterable=tokenized_sentences)

# Addestra il modello
fast_model.train(corpus_iterable=tokenized_sentences, total_examples=len(tokenized_sentences), epochs=20)

# Salva il modello addestrato (opzionale)
fast_model.save('fasttext_model')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/chiarapiccolo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[('fathers', 0.8293688893318176), ('grandfather', 0.8073168396949768), ('gather', 0.6720890402793884), ('mother', 0.604293704032898), ('grandmother', 0.5891567468643188), ('cure', 0.5657365918159485), ('fate', 0.5648766160011292), ('shah', 0.54682457447052), ('bfs', 0.5322533845901489), ('mothers', 0.5314666628837585)]


In [14]:
# Esempio di utilizzo del modello per trovare parole simili
similar_words = fast_model.wv.most_similar('pussy')
print(similar_words)

[('pussies', 0.6965669393539429), ('trash', 0.674778401851654), ('wayne', 0.6701973080635071), ('411pain', 0.6674219965934753), ('hoe', 0.6223486065864563), ('ass', 0.6170661449432373), ('vagina', 0.6070759296417236), ('fuck', 0.5998162627220154), ('yoself', 0.599560558795929), ('shutup', 0.599079430103302)]


In [9]:
fast_model.wv.most_similar(['mum'])

[('t', -0.08697278797626495)]

In [15]:
import requests
url = "https://raw.githubusercontent.com/nicholas-leonard/word2vec/master/questions-words.txt"
test_file = 'questions-words.txt'
questions = requests.get(url).content.decode()
with open(test_file,mode='w',encoding='utf-8') as outputfile:
    outputfile.write(questions)
print(questions[:1000])

: capital-common-countries
Athens Greece Baghdad Iraq
Athens Greece Bangkok Thailand
Athens Greece Beijing China
Athens Greece Berlin Germany
Athens Greece Bern Switzerland
Athens Greece Cairo Egypt
Athens Greece Canberra Australia
Athens Greece Hanoi Vietnam
Athens Greece Havana Cuba
Athens Greece Helsinki Finland
Athens Greece Islamabad Pakistan
Athens Greece Kabul Afghanistan
Athens Greece London England
Athens Greece Madrid Spain
Athens Greece Moscow Russia
Athens Greece Oslo Norway
Athens Greece Ottawa Canada
Athens Greece Paris France
Athens Greece Rome Italy
Athens Greece Stockholm Sweden
Athens Greece Tehran Iran
Athens Greece Tokyo Japan
Baghdad Iraq Bangkok Thailand
Baghdad Iraq Beijing China
Baghdad Iraq Berlin Germany
Baghdad Iraq Bern Switzerland
Baghdad Iraq Cairo Egypt
Baghdad Iraq Canberra Australia
Baghdad Iraq Hanoi Vietnam
Baghdad Iraq Havana Cuba
Baghdad Iraq Helsinki Finland
Baghdad Iraq Islamabad Pakistan
Baghdad Iraq Kabul Afghanistan
Baghdad Iraq London England


In [22]:
fasttext_wAp_analogy = fast_model.wv.evaluate_word_analogies(test_file)

In [25]:
# Extracting the accuracy and category results
total_accuracy = fasttext_wAp_analogy[0]
category_results = fasttext_wAp_analogy[1]

# Processing the data
sections = []
accuracies = []
total_correct = 0
total_incorrect = 0

for entry in category_results:
    section = entry['section']
    correct = len(entry['correct'])
    incorrect = len(entry['incorrect'])
    total = correct + incorrect
    accuracy = correct / total if total > 0 else 0
    sections.append(section)
    accuracies.append(accuracy)
    total_correct += correct
    total_incorrect += incorrect

# Total accuracy
total_accuracy = total_correct / (total_correct + total_incorrect)

# Creating DataFrame
df = pd.DataFrame({
    'Section': sections,
    'Accuracy': accuracies
})

print("Accuracy per Section:")
print(df)

Accuracy per Section:
                        Section  Accuracy
0      capital-common-countries  0.000000
1                 capital-world  0.000000
2                      currency  0.000000
3                 city-in-state  0.000000
4                        family  0.128205
5     gram1-adjective-to-adverb  0.754167
6                gram2-opposite  1.000000
7             gram3-comparative  0.400735
8             gram4-superlative  0.311111
9      gram5-present-participle  0.266082
10  gram6-nationality-adjective  0.560000
11             gram7-past-tense  0.104615
12                 gram8-plural  0.688312
13           gram9-plural-verbs  0.447619
14               Total accuracy  0.391387


In [19]:

# Stampa i risultati
total_accuracy = fasttext_wAp_analogy[0]
category_results = fasttext_wAp_analogy[1]

print(f"Total Accuracy: {total_accuracy:.4f}")

for category, results in category_results.items():
    correct = results['correct']
    incorrect = results['incorrect']
    accuracy = correct / (correct + incorrect) if (correct + incorrect) > 0 else 0
    print(f"Category: {category}, Correct: {correct}, Incorrect: {incorrect}, Accuracy: {accuracy:.4f}")


Total Accuracy: 0.3914


AttributeError: 'list' object has no attribute 'items'

In [None]:
df = pd.read_csv('../../../data/New dataset/LSTM/no_preprocessing/train_tweets_LSTM_no_new.csv')
df['tweet_text'].to_csv('text_data.txt', index=False, header=False)

test_file = 'questions-words.txt'


# Addestra il modello FastText
model = fasttext.train_unsupervised('text_data.txt', model='skipgram', dim=300, minCount=5, epoch=10)

def evaluate_word_analogies(model, analogy_file):
    correct = 0
    total = 0
    
    with open(analogy_file, 'r') as f:
        for line in f:
            words = line.strip().split()
            if len(words) != 4:
                continue  # Skip lines that don't have exactly 4 words
            word1, word2, word3, word4 = words
            
            # Ensure all words are in the vocabulary
            if all(word in model.words for word in [word1, word2, word3, word4]):
                predicted_word = model.get_analogies(word1, word2, word3, 1)[0][1]
                if predicted_word == word4:
                    correct += 1
                total += 1
    
    accuracy = correct / total if total > 0 else 0
    return accuracy

# Esempio di analogia: king - man + woman = ?
accuracy = evaluate_word_analogies(model, 'questions-words.txt')
print(f"Accuracy: {accuracy:.4f}")
