In [21]:
!pip install gensim


Defaulting to user installation because normal site-packages is not writeable


In [1]:
#TF-IDF
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Simple class to store term and its importance score
class TermScore:
    def __init__(self, term, score):
        self.term = term
        self.score = score

    def __str__(self):
        # Format like: word(0.123)
        return f"{self.term}({self.score:.3f})"

def clean_text(text):
    """
    Clean up text by removing special characters and converting to lowercase
    """
    # Remove anything that's not a letter or space
    cleaned = re.sub(r'[^a-zA-Z ]+', '', text)
    return cleaned.lower().strip()

def clean_texts_in_dataframe(df, text_column='text'):
    """
    Clean all texts in a DataFrame column
    """
    if text_column not in df.columns:
        print(f"Error: '{text_column}' column not found!")
        return None

    # Make a copy so we don't change the original
    df_copy = df.copy()
    df_copy[text_column] = df_copy[text_column].apply(clean_text)
    return df_copy
def clean_texts_in_dataframe2(df, text_column):
    df[text_column] = df[text_column].astype(str)  # Ensure all texts are strings
    df[text_column] = df[text_column].apply(word_tokenize)  # Tokenize into words
    return df

class TextProcessor:
    """
    Main class to handle text processing using TF-IDF
    """
    def __init__(self, max_features=None):
        # Initialize the TF-IDF vectorizer
        self.vectorizer = TfidfVectorizer(
            max_features=max_features,
            stop_words='english',

        )

    def calculate_tfidf(self, documents):
        # Convert documents to TF-IDF matrix
        tfidf_matrix = self.vectorizer.fit_transform(documents)

        # Get the words (features) that correspond to the matrix columns
        feature_names = self.vectorizer.get_feature_names_out()

        return {
            'matrix': tfidf_matrix.todense(),
            'words': feature_names
        }

    def process_documents(self, doc_ids, documents, max_words=100, show_scores=True):
      """
      Process documents, keep only the most important words based on TF-IDF, and maintain doc_ids.
      """
      # First clean the texts
      cleaned_docs = [clean_text(doc) for doc in documents]

      # Calculate TF-IDF
      result = self.calculate_tfidf(cleaned_docs)
      matrix = result['matrix']
      words = result['words']

      processed_documents = []

      # Go through each document
      for doc_idx, doc_scores in enumerate(matrix):
          # Create a dictionary of word:score pairs for this document
          word_scores = {words[word_idx]: float(score) for word_idx, score in enumerate(doc_scores.T) if float(score) > 0}

          # Sort words by score and keep top ones
          sorted_words = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)
          top_words = dict(sorted_words[:max_words])

          # Process original document words
          processed_words = []
          for word in cleaned_docs[doc_idx].split():
              if word in top_words:
                  if show_scores:
                      # Add the word with its score
                      term = TermScore(word, top_words[word])
                      processed_words.append(str(term))
                  else:
                      processed_words.append(word)

          processed_documents.append((doc_ids[doc_idx], ' '.join(processed_words)))

      return processed_documents

    def get_word_scores(self, result, doc_index, min_score=0.0):
        """
        Get all words and their importance scores for a specific document
        """
        doc_scores = result['matrix'][doc_index]

        # Create list of word-score pairs
        word_scores = []
        for word_idx, score in enumerate(doc_scores.T):
            score_value = float(score)
            if score_value >= min_score:
                word_scores.append(
                    TermScore(result['words'][word_idx], score_value)
                )

        # Sort by score (highest first)
        return sorted(word_scores, key=lambda x: x.score, reverse=True)

def process_file(input_file, output_file, text_column='text', max_words=100):
    """
    Process text data from a file and save results
    """
    try:
        # Read the file (works with CSV or TSV)
        if input_file.endswith('.tsv'):
            df = pd.read_csv(input_file, sep='\t')
        else:
            df = pd.read_csv(input_file)

        # Process the texts
        processor = TextProcessor()
        df = clean_texts_in_dataframe(df, text_column)
        texts = df[text_column].tolist()

        # Get TF-IDF scores and process
        processed_texts = processor.process_documents(texts, max_words)

        # Update and save
        df[text_column] = processed_texts
        df.to_csv(output_file, sep='\t', index=False)

        return df

    except FileNotFoundError:
        print(f"Error: Couldn't find file: {input_file}")
        return None
import numpy as np
from sklearn.preprocessing import normalize

def normalize_tfidf(tfidf_matrix):
    normalized_matrix = normalize(tfidf_matrix, norm='l1', axis=1)
    return normalized_matrix

def normalize_tfidf(tfidf_matrix):
    # Now normalize
    normalized_matrix = normalize(tfidf_matrix, norm='l1', axis=1)
    return normalized_matrix



#function from github
import gensim
from gensim.models import Phrases

import gensim
from gensim.models import Phrases

def addBiandTri(docs):
    # Add bigrams and trigrams to docs (only ones that appear 5 times or more).
    bigram = Phrases(docs, min_count=5)
    trigram = Phrases(bigram[docs])

    # Initialize an empty list to store the modified documents
    modified_docs = []

    for idx in range(len(docs)):
        # Tokenize the current document
        doc_tokens = docs[idx].split()

        # Add bigrams and trigrams to the token list
        for token in bigram[doc_tokens]:
            if '_' in token:
                doc_tokens.append(token)
        for token in trigram[doc_tokens]:
            if '_' in token:
                doc_tokens.append(token)

        # Join the tokens back into a string and add to modified_docs
        modified_docs.append(' '.join(doc_tokens))

    return modified_docs

def replace_keys_with_lowercase(word_embeddings):
  lowercased_embeddings = {}
  for key, value in word_embeddings.items():
    lowercased_key = key.lower()
    lowercased_embeddings[lowercased_key] = value

  return lowercased_embeddings




In [None]:
"""
TF-IDF


"""
import pandas as pd

#file_pathx = '/content/complaintDocs_BertForClass.txt'
file_path = 'data\complaintDocs_BertForClass.txt'

complaint_df = pd.read_csv(file_path, sep='\t', names=['docid', 'text', 'MotionResultCode'])
real_df = pd.DataFrame(complaint_df, columns=['docid','text'])

real_df = pd.DataFrame(complaint_df, columns=['docid', 'text'])  # Keep docid
new_df = clean_texts_in_dataframe(real_df)


# Existing code to clean texts and process documents
processor = TextProcessor()
new_df = clean_texts_in_dataframe(real_df)
new_texts = new_df["text"].tolist()

# Get TF-IDF scores
result = processor.calculate_tfidf(new_texts)
matrix = result['matrix']
words = result['words']

# Initialize a list to hold the word-weight dictionaries for each document
document_word_weights = []

# Iterate over each document in the TF-IDF matrix
for doc_index in range(matrix.shape[0]):
    doc_vector = matrix[doc_index]
    scores = np.squeeze(np.asarray(doc_vector))
    non_zero_indices = np.nonzero(scores)[0]
    word_scores = {words[idx]: scores[idx] for idx in non_zero_indices}
    document_word_weights.append({'docid': new_df['docid'].iloc[doc_index], 'word_weights': word_scores})

processed_docs_with_ids = processor.process_documents(new_df['docid'].tolist(), new_df['text'].tolist(), max_words=5, show_scores=True)

# Convert processed documents to a DataFrame
processed_df = pd.DataFrame(processed_docs_with_ids, columns=['docid', 'processed_text'])


  file_path = 'data\complaintDocs_BertForClass.txt'
  word_scores = {words[word_idx]: float(score) for word_idx, score in enumerate(doc_scores.T) if float(score) > 0}


        docid                                     processed_text
0       docid                                        text(1.000)
1          77  lucille(0.205) amente(0.499) bank(0.387) bank(...
2         453  baumax(0.230) ashdon(0.240) baumax(0.230) baum...
3         579  duchess(0.445) hamburger(0.190) duchess(0.445)...
4         669  bryce(0.260) ridley(0.265) suffield(0.391) pug...
...       ...                                                ...
8212  9989009  asher(0.558) kuriansky(0.212) asher(0.558) ash...
8213  9992499  kenya(0.241) hairston(0.252) kenya(0.241) hair...
8214  9996383  haynes(0.525) norwalk(0.280) paus(0.485) hayne...
8215  9999634  lindsay(0.333) hornyak(0.376) lindsay(0.333) h...
8216  9999747  gregory(0.337) laflamme(0.774) marie(0.198) la...

[8217 rows x 2 columns]


In [16]:
def normalize_tfidf(tfidf_matrix, document_word_weights):
    # Now normalize
    normalized_matrix = normalize(tfidf_matrix, norm='l1', axis=1)
    
    #create lists to store data for DataFrame
    doc_ids = []
    words_list = []
    weights_list = []
    
    #process
    for i, doc in enumerate(document_word_weights):
        doc_id = doc['docid']
        word_weights = doc['word_weights']
        
        #process each word
        for j, word in enumerate(word_weights.keys()):
            doc_ids.append(doc_id)
            words_list.append(word)
            weights_list.append(normalized_matrix[i][j])
    
    # Create DataFrame with the collected data
    normalized_df = pd.DataFrame({
        'docid': doc_ids,
        'word': words_list,
        'normalized_weight': weights_list
    })
    
    # Sort by docid and weight value for better readability
    normalized_df = normalized_df.sort_values(['docid', 'normalized_weight'], ascending=[True, False])
    
    return normalized_df

unfiltered_df =  pd.read_csv('tfidf_weights.csv')



In [None]:
import pandas as pd
from sklearn.preprocessing import normalize
import numpy as np
import re

#weights are averaged amoungst 1 in each doc id
def normalize_tfidf_from_csv(dataset):
    
    
    doc_ids = []
    words = []
    weights = []

    
    for index, row in dataset.iterrows():
        docid = row['docid']
        processed_text = str(row['processed_text'])  # Ensure processed_text is a string

        # Extract words and weights using regex
        matches = re.findall(r'(\w+)\((0\.\d+)\)', processed_text)

        for word, weight in matches:
            doc_ids.append(docid)
            words.append(word)
            weights.append(float(weight))

    
    parsed_df = pd.DataFrame({
        'docid': doc_ids,
        'word': words,
        'weight': weights
    })

    #pivot table to create a TF-IDF matrix
    tfidf_matrix = parsed_df.pivot_table(index='docid', columns='word', values='weight', fill_value=0)

    #normalize the TF-IDF matrix
    normalized_matrix = normalize(tfidf_matrix, norm='l1', axis=1)

    
    normalized_df = pd.DataFrame(normalized_matrix, index=tfidf_matrix.index, columns=tfidf_matrix.columns)


    normalized_long_df = normalized_df.reset_index().melt(id_vars='docid', var_name='word', value_name='normalized_weight')

    normalized_long_df = normalized_long_df[normalized_long_df['normalized_weight'] > 0]

    normalized_long_df = normalized_long_df.sort_values(['docid', 'normalized_weight'], ascending=[True, False])

    return normalized_long_df


uploaded_dataset = pd.read_csv('tfidf_weights.csv')

# Apply 
normalized_result = normalize_tfidf_from_csv(uploaded_dataset)

# Display
normalized_result
normalized_result.to_csv("tf-idf_final.csv", index=False)


Unnamed: 0,docid,word,normalized_weight
4270240,77,amente,0.269730
9320620,77,bank,0.209189
98790360,77,pathway,0.208108
99151688,77,paved,0.202162
78588840,77,lucille,0.110811
...,...,...,...
132213199,15966454,tree,0.416667
36198495,15966454,diprospero,0.177564
63823663,15966454,jaclyn,0.159615
132221411,15966454,trees,0.123718


In [35]:
normalized_result.to_csv("tf-idf_final.csv", index=False)

In [22]:
!pip install nltk
import nltk
from nltk.tokenize import word_tokenize
nltk.download('all')

Defaulting to user installation because normal site-packages is not writeable


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\aarya\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\aarya\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\aarya\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\aarya\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\aarya\AppData\Roaming\nltk_data...
[

True

In [None]:

import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
import nltk
import re
from gensim.models import Word2Vec

def clean_text(text):
    if isinstance(text, str):
        cleaned = re.sub(r'[^a-zA-Z\s]', '', text.lower())
        return cleaned
    return text

def clean_texts_in_dataframe2(df, text_column):
    df = df.copy()
    df[text_column] = df[text_column].astype(str)
    df[text_column] = df[text_column].apply(word_tokenize)
    return df

try:

    file_path = 'data/complaintDocs_BertForClass.txt'
    comp_docs = pd.read_csv(file_path, sep='\t', names=['docid', 'text', 'MotionResultCode'])
    docs = clean_texts_in_dataframe2(comp_docs, text_column='text')
    
    seed_value = 42
    model = Word2Vec(
        sentences=docs['text'].tolist(),
        min_count=1,
        workers=4,
        seed=seed_value
    )
    
    word_embeddings = {}
    
    for doc_id, text in zip(comp_docs['docid'], docs['text']):
        for word in text:
            if word in model.wv:
                cleaned_word = clean_text(word)
                if cleaned_word:
                    if cleaned_word not in word_embeddings:
                        word_embeddings[cleaned_word] = {
                            'embeddings': model.wv[word].copy(),
                            'doc_ids': set()
                        }
                    word_embeddings[cleaned_word]['doc_ids'].add(doc_id)
    
    
    words = []
    doc_ids = []
    embeddings = []
    
    for word, data in word_embeddings.items():
        words.append(word)
        doc_ids.append(list(data['doc_ids']))  #Convert set to list
        embeddings.append(data['embeddings'].tolist())  #Convert numpy array to list
    
   
    word_embeddings_df = pd.DataFrame({
        'word': words,
        'doc_ids': doc_ids,
        'embeddings': embeddings
    })
    
    # Display first few rows
    print("\nFirst few rows of the DataFrame:")
    print(word_embeddings_df.head())
    

    word_embeddings_df.to_csv('word_embeddings.csv', index=False)
    
    import pickle
    with open('word_embeddings.pkl', 'wb') as f:
        pickle.dump(word_embeddings_df, f)

except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
except Exception as e:
    print(f"An error occurred: {str(e)}")


First few rows of the DataFrame:
       word                                            doc_ids  \
0      text  [1671258, 4740931, 12574479, 12779586, 30395, ...   
1    return  [12737652, 1131488, 3268722, 10489856, 300793,...   
2      date  [12737652, 1131488, 3268722, 10489856, 300793,...   
3     april  [637652, 12737652, 10265791, 6920937, 2008994,...   
4  superior  [12702224, 1131488, 12737652, 3268722, 1048985...   

                                          embeddings  
0  [0.4270448386669159, 0.3264201879501343, -0.20...  
1  [1.222374439239502, 1.10366690158844, 0.982666...  
2  [-3.2353107929229736, -0.6487149596214294, -1....  
3  [3.2186858654022217, -0.9272747039794922, -2.5...  
4  [-0.42789825797080994, -2.5005075931549072, 0....  


In [39]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pandas as pd
from nltk.tokenize import word_tokenize
import numpy as np




# Load your data
file_path = 'data\complaintDocs_BertForClass.txt'
df = pd.read_csv(file_path, sep='\t', names=['docid', 'text', 'MotionResultCode'])

# Tokenize 
df['text'] = df['text'].astype(str)
df['text'] = df['text'].apply(word_tokenize)

# Prepare data 
documents = [TaggedDocument(words=text, tags=[str(docid)]) for docid, text in zip(df['docid'], df['text'])]

# Initialize
model = Doc2Vec(vector_size=100, min_count=1, workers=4, window=5, epochs=10)  # Customize as needed

# Build vocabulary
model.build_vocab(documents)

# Train
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

# Save 
model.save("doc2vec.model")

# Create a dictionary 
doc_embeddings = {}
for doc in documents:
    doc_id = doc.tags[0]  # Assuming docid is the tag
    doc_embedding = model.infer_vector(doc.words)
    doc_embeddings[doc_id] = doc_embedding

#document embeddings to a DataFrame
doc2vec_embeddings_df = pd.DataFrame(list(doc_embeddings.items()), columns=['docid', 'embeddings'])

#Save embeddings 

doc2vec_embeddings_df = doc2vec_embeddings_df.drop(index=0)
doc2vec_embeddings_df.head()
doc2vec_embeddings_df.to_csv("doc2vec_embeddings.csv", index=False)

#print(doc2vec_embeddings_df)

  file_path = 'data\complaintDocs_BertForClass.txt'


In [33]:
def calculate_weighted_embeddings(normalized_weights_df, word_embeddings_df):

    # Create a dictionary
    word_embedding_dict = dict(zip(word_embeddings_df.word, word_embeddings_df.embeddings))
    
    # Initialize dict
    document_embeddings = {}
    
    #group the normalized weights by docid
    for doc_id, doc_group in normalized_weights_df.groupby('docid'):
        print(f"Processing Document {doc_id}")
        
        #get the embedding dimension from the first available word embedding
        embedding_dim = len(next(iter(word_embedding_dict.values())))
        
        #Initialize document embedding vector
        doc_embedding = np.zeros(embedding_dim)
        total_weight = 0
        
        #Process each word in the document
        for _, row in doc_group.iterrows():
            word = row['word']
            weight = row['normalized_weight']
            
            #Check 
            if word in word_embedding_dict:
                #Get word embedding and multiply weight
                word_embedding = np.array(word_embedding_dict[word])
                weighted_embedding = weight * word_embedding
                
                #add to doc embed
                doc_embedding += weighted_embedding
                total_weight += weight
                print(f"  Word: {word}, Weight: {weight}, Contribution added to document embedding.")
            else:
                print(f"  Word '{word}' not found in word embeddings, skipping.")
        
        #match equation made
        if total_weight > 0:
            doc_embedding /= total_weight
            document_embeddings[doc_id] = doc_embedding
            print(f"Final normalized embedding for Document {doc_id}")
        else:
            print(f"  Skipping normalization for Document {doc_id} due to zero total weight.")
            
    return document_embeddings


#Calculate weighted embeddings
document_embeddings = calculate_weighted_embeddings(normalized_result, word_embeddings_df)

# Convert to DataFrame 
doc_embeddings_df = pd.DataFrame([
    {'docid': doc_id, 'embedding': embedding.tolist()} 
    for doc_id, embedding in document_embeddings.items()])

Processing Document 77
  Word: amente, Weight: 0.2697297297297297, Contribution added to document embedding.
  Word: bank, Weight: 0.2091891891891892, Contribution added to document embedding.
  Word: pathway, Weight: 0.2081081081081081, Contribution added to document embedding.
  Word: paved, Weight: 0.20216216216216215, Contribution added to document embedding.
  Word: lucille, Weight: 0.1108108108108108, Contribution added to document embedding.
Final normalized embedding for Document 77
Processing Document 453
  Word: premises, Weight: 0.2599549211119459, Contribution added to document embedding.
  Word: servants, Weight: 0.202854996243426, Contribution added to document embedding.
  Word: agents, Weight: 0.18407212622088656, Contribution added to document embedding.
  Word: ashdon, Weight: 0.18031555221637866, Contribution added to document embedding.
  Word: baumax, Weight: 0.1728024042073629, Contribution added to document embedding.
Final normalized embedding for Document 453
P

In [None]:
doc_embeddings_df.to_csv("combined_embeddings.csv")