In [1]:
import pandas as pd
df_train=pd.read_csv("C:/Users/hp/Downloads/embedding_data_after_preproocessing.csv")

In [2]:
df_train.head()

Unnamed: 0,category,sub_category,crimeaditionalinfo
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,continue received random call abusive message ...
1,Online Financial Fraud,Fraud CallVishing,fraudster continuously messaging asking pay mo...
2,Online Gambling Betting,Online Gambling Betting,acting like police demanding money adding sect...
3,Online and Social Media Related Crime,Online Job Fraud,job applied job interview telecalling resource...
4,Online Financial Fraud,Fraud CallVishing,received call lady stating send new phone vivo...


In [3]:
import numpy as np
from collections import defaultdict
import re
from typing import List, Dict, Set, Tuple
import random
import pandas as pd
from tqdm import tqdm
import time
import logging
import os
from sklearn.model_selection import train_test_split


In [4]:


class FastTextModel:
    def __init__(self, vector_size=100, window=5, min_count=5, neg_samples=5, 
                 learning_rate=0.05, min_n=3, max_n=6):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.neg_samples = neg_samples
        self.learning_rate = learning_rate
        self.min_n = min_n
        self.max_n = max_n
        
        self.word_vectors = {}
        self.context_vectors = {}
        self.ngram_vectors = {}
        self.word_frequencies = {}
        self.vocab = set()
        self.total_words = 0
        
    def generate_ngrams(self, word):
        """Generate character n-grams for a word"""
        ngrams = []
        word = f"<{word}>"  # Add boundary markers
        for n in range(self.min_n, min(self.max_n + 1, len(word) + 1)):
            for i in range(len(word) - n + 1):
                ngrams.append(word[i:i+n])
        return ngrams
    
    def initialize_vector(self):
        """Initialize a new vector with small random values"""
        return np.random.uniform(-0.5/self.vector_size, 0.5/self.vector_size, 
                               (self.vector_size,))
    
    def build_vocab(self, sentences):
        """Build vocabulary and initialize vectors"""
        # Count word frequencies
        print("Building vocabulary...")
        for sentence in tqdm(sentences):
            for word in sentence:
                self.word_frequencies[word] = self.word_frequencies.get(word, 0) + 1
                self.total_words += 1
        
        # Filter by minimum count and create vocabulary
        self.vocab = {word for word, freq in self.word_frequencies.items() 
                     if freq >= self.min_count}
        
        # Initialize word and context vectors
        print("Initializing vectors...")
        for word in tqdm(self.vocab):
            self.word_vectors[word] = self.initialize_vector()
            self.context_vectors[word] = self.initialize_vector()
            
            # Initialize n-gram vectors
            for ngram in self.generate_ngrams(word):
                if ngram not in self.ngram_vectors:
                    self.ngram_vectors[ngram] = self.initialize_vector()
    
    def get_word_vector(self, word):
        """Get vector representation of a word (including subword information)"""
        if word in self.word_vectors:
            vector = self.word_vectors[word].copy()
        else:
            vector = np.zeros(self.vector_size)
            
        # Add subword information
        ngrams = self.generate_ngrams(word)
        ngram_count = 0
        for ngram in ngrams:
            if ngram in self.ngram_vectors:
                vector += self.ngram_vectors[ngram]
                ngram_count += 1
        
        if ngram_count > 0:
            vector /= (ngram_count + 1)  # +1 for word vector if exists
            
        return vector
    
    def negative_sampling(self, n):
        """Sample negative examples based on word frequency"""
        neg_samples = []
        word_list = list(self.vocab)
        frequencies = np.array([self.word_frequencies[word] for word in word_list])
        probs = frequencies ** 0.75
        probs = probs / probs.sum()
        
        while len(neg_samples) < n:
            sample = np.random.choice(word_list, p=probs)
            if sample not in neg_samples:
                neg_samples.append(sample)
                
        return neg_samples
    
    def train_pair(self, target, context, negative=True):
        """Train on a single target-context pair"""
        # Get target vector
        target_vector = self.get_word_vector(target)
        
        if negative:
            # Negative sampling
            contexts = [context] + self.negative_sampling(self.neg_samples)
            labels = [1] + [0] * self.neg_samples
        else:
            contexts = [context]
            labels = [1]
            
        loss = 0
        # Update for each context
        for ctx, label in zip(contexts, labels):
            context_vector = self.context_vectors[ctx]
            
            # Forward pass
            score = np.dot(target_vector, context_vector)
            prob = 1 / (1 + np.exp(-score))  # sigmoid
            
            # Compute error
            error = label - prob
            loss += -label * np.log(prob) - (1 - label) * np.log(1 - prob)
            
            # Compute gradients
            grad = error * self.learning_rate
            
            # Update vectors
            if target in self.word_vectors:
                self.word_vectors[target] += grad * context_vector
            
            # Update n-gram vectors
            for ngram in self.generate_ngrams(target):
                if ngram in self.ngram_vectors:
                    self.ngram_vectors[ngram] += grad * context_vector
            
            self.context_vectors[ctx] += grad * target_vector
            
        return loss
    
    def train_epoch(self, sentences):
        """Train for one epoch"""
        total_loss = 0
        total_pairs = 0
        
        for sentence in tqdm(sentences):
            for i, target in enumerate(sentence):
                # Define context window
                start = max(0, i - self.window)
                end = min(len(sentence), i + self.window + 1)
                
                # Train on context words
                for j in range(start, end):
                    if i != j and sentence[j] in self.vocab:
                        loss = self.train_pair(target, sentence[j])
                        total_loss += loss
                        total_pairs += 1
                        
        return total_loss / total_pairs if total_pairs > 0 else float('inf')
    
    def train(self, sentences, epochs=8):
        """Train the model"""
        print("Starting training...")
        losses = []
        
        for epoch in range(epochs):
            start_time = time.time()
            epoch_loss = self.train_epoch(sentences)
            losses.append(epoch_loss)
            
            print(f"\nEpoch {epoch+1}/{epochs}")
            print(f"Loss: {epoch_loss:.4f}")
            print(f"Time: {time.time() - start_time:.2f} seconds")
        
        return losses
    
    def save_vectors(self, filename):
        """Save word vectors to file"""
        print(f"Saving vectors to {filename}")
        with open(filename, 'w', encoding='utf-8') as f:
            # Write header
            f.write(f"{len(self.vocab)} {self.vector_size}\n")
            
            # Write word vectors
            for word in self.vocab:
                vector = self.get_word_vector(word)
                vector_str = ' '.join(str(x) for x in vector)
                f.write(f"{word} {vector_str}\n")



In [5]:
def preprocess_text(text):
    """Basic text preprocessing"""
    return text.lower().split()

In [None]:
def main():
    # Load your dataset
    df = df_train.copy()
    
    # Preprocess text
    sentences = [preprocess_text(text) for text in df['crimeaditionalinfo'].values if isinstance(text, str)]
    
    # Initialize model
    model = FastTextModel(
        vector_size=100,
        window=5,
        min_count=5,
        neg_samples=5,
        learning_rate=0.05,
        min_n=3,
        max_n=6
    )
    
    # Build vocabulary
    model.build_vocab(sentences)
    
    # Train model
    losses = model.train(sentences, epochs=3)
    
    # Save vectors
    model.save_vectors('fasttext_vectors.txt')
    
    # Plot training loss
    import matplotlib.pyplot as plt
    plt.figure(figsize=(10, 5))
    plt.plot(losses)
    plt.title('Training Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.savefig('training_loss.png')
    plt.close()

if __name__ == "__main__":
    main()

Building vocabulary...


100%|██████████| 77539/77539 [00:00<00:00, 83161.49it/s]


Initializing vectors...


100%|██████████| 16556/16556 [00:00<00:00, 20827.29it/s]


Starting training...


  1%|          | 967/77539 [7:43:35<615:18:25, 28.93s/it]   