In [1]:
print("-----------------Step 1--------------------")

import pandas as pd

# Load the dataset from Excel file in the same directory
file_path = "word2vec.xlsx"
data = pd.read_excel(file_path)

# Show the first few rows of the dataset to verify
print(data.head())

# Extract the 'Project Description' column
descriptions = data['Project Description'].dropna()  # Drop any missing values
print(descriptions.head())  # Preview the first few project descriptions

print("-------------------------------")

import nltk
nltk.download('punkt_tab')

print("-----------------Step 2--------------------")

import re
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')  # This will download the necessary files for tokenization

# Function to preprocess the text
def preprocess_text(text):
    # Remove non-alphabetical characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize the text into words
    return tokens

# Apply the preprocessing function to each project description
preprocessed_descriptions = descriptions.apply(preprocess_text)

# Show the first few tokenized project descriptions
print(preprocessed_descriptions.head())

print("-----------------Step 4--------------------")

from gensim.models import Word2Vec

# Train the Word2Vec model on the project descriptions
model = Word2Vec(sentences=preprocessed_descriptions, vector_size=100, window=5, min_count=1, sg=0)

# Save the model for later use
model.save("project_description_word2vec.model")

# Example: Get the vector for a specific word (e.g., 'roof')
vector = model.wv['roof']
print(vector)

print("-----------------Step 5.1--------------------")

vector = model.wv['roof']  # Example word
print(vector)

print("-----------------Step 5.2--------------------")

similar_words = model.wv.most_similar('roof', topn=5)  # Get the 5 most similar words to "roof"
print(similar_words)

print("-----------------Step 5.3--------------------")

def get_description_vector(description):
    # Get the word vectors for each word in the description and average them
    vectors = [model.wv[word] for word in description if word in model.wv]
    return sum(vectors) / len(vectors) if vectors else [0] * model.vector_size

# Apply the function to get vectors for all descriptions
description_vectors = preprocessed_descriptions.apply(get_description_vector)

# Show the vector for the first project description
print(description_vectors.head())

print("-----------------Step 6--------------------")
# Convert the description vectors to a DataFrame
vector_df = pd.DataFrame(description_vectors.tolist())

# Add the project identifiers (if needed) to the dataframe
vector_df['Project ID'] = data['Project Building Identifier']

# Save the vectors to a new Excel file
vector_df.to_excel("project_description_vectors.xlsx", index=False)


-----------------Step 1--------------------
   Project Geographic District  Project Building Identifier  \
0                             2                        M833   
1                            24                        Q093   
2                            19                        K420   
3                            17                        KBOR   
4                            19                        KBDA   

                                 Project School Name Project Type   \
0                     HS FOR ECONOMICS & FINANCE - M       SCA CIP   
1                                   I.S. 93 - QUEENS       SCA CIP   
2  THE URBAN ASSEMBLY SCHOOL FOR COLLABORATIVE HE...       SCA CIP   
3                       SAINT JOHN'S DAY CARE CENTER       SCA CIP   
4            URBAN STRATEGIES DAY CARE CENTER SITE 1       SCA CIP   

                                 Project Description Project Phase Name  \
0                               ABSORBER REPLACEMENT             Design   
1  ACC

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


-----------------Step 2--------------------
0                              [absorber, replacement]
1    [accessibility, windows, roofs, exterior, maso...
2    [accessible, ramp, electrificationheat, pump, ...
3              [acs, early, learn, lease, improvement]
4              [acs, early, learn, lease, improvement]
Name: Project Description, dtype: object
-----------------Step 4--------------------
[ 0.00510611  0.16850239 -0.06138306 -0.10382131  0.10342672 -0.27180538
  0.12863693  0.2290357  -0.08175565 -0.12137519 -0.09737651 -0.30195385
 -0.01738104  0.08206824  0.15996227 -0.1588904   0.06103717 -0.08887465
  0.02260787 -0.29004353  0.12284648  0.22849824  0.03159002 -0.05515201
  0.09408773  0.01532273 -0.18718761 -0.13174324 -0.18475121  0.05475518
  0.1918283  -0.02048223  0.1675077  -0.29044643 -0.18235889  0.2285707
 -0.02408082  0.00393576 -0.03898205 -0.2093294  -0.05027055 -0.16426294
 -0.03126666  0.02647832  0.05236022 -0.06331066 -0.1016949  -0.12398817
  0.12971635 