In [None]:
# Task-1: Cleaning the data file

In [104]:
# Clean the data file by removing non-text (e.g. emojis, smart quotes) and regularizing text 
#(e.g. tokenization, lower casing, stemming, lemmatizing, POS tagging, stop word removal, removing punctuation, spelling correction)
import demoji
import json
import spacy
from spacy.lang.en import English
from spellchecker import SpellChecker
from nltk.stem.porter import PorterStemmer

path= "faqsFromPdf.json"
with open(path, 'r') as json_file:
    text = json.load(json_file)

categories = []
all_text = ""
for key, faq_list in text.items():
    group_text = ""

    for faq_item in faq_list:
        all_text += faq_item["question"] + " " + faq_item["answer"] + " "
        group_text += faq_item["question"] + faq_item["answer"] 
    categories.append(group_text)

# Removing emojis
clean_text = demoji.replace(all_text,"")
#remove smart quotes
clean_text = clean_text.replace("“", "\"").replace("”","\"")
# convert text to lower-case
clean_text = clean_text.lower()

spell = SpellChecker()
stemmer = PorterStemmer()

# Find and correct spelling errors
corrected_text = []
clean_text=clean_text.split()
for word in clean_text:
    # Check if the word is misspelled
    if spell.unknown([word]):
        # Get the corrected version of the word
        corrected_word = spell.correction(word)
        # Check if the corrected word is not None
        if corrected_word is not None:
            corrected_text.append(corrected_word)
        else:
            # If the correction is None, keep the original word
            corrected_text.append(word)
    else:
        corrected_text.append(word)
# Join the corrected words back into a string
corrected_text = " ".join(corrected_text)   

#Tokenzing using Spacy with removing white spaces, stop words, and punctuations
nlp = spacy.load('en_core_web_sm')
doc = nlp(corrected_text)

# Lemmatize and stem the words
lemmatized_and_stemmed_words = []
for token in doc:
    lemma = token.lemma_
    stem = stemmer.stem(token.text)  # Use Porter Stemmer
    lemmatized_and_stemmed_words.append((token.text, lemma, stem))

clean_words = [token.text for token in doc if not (token.is_space or token.is_stop or token.is_punct)]
posArray = [(token.text, token.pos_) for token in doc if not (token.is_space or token.is_stop or token.is_punct)]

# print(clean_words)
with open("cleaned_data.txt", "w", encoding='utf-8') as txt_file:
     txt_file.write(str(clean_words))


In [None]:
# Task-2: Counting BoW on pre-processed data. 

In [105]:
import random
import pandas as pd
import nltk

# Extract the most common words
all_words = nltk.FreqDist(clean_words)
max_words = 1000
word_features = [word for word, _ in all_words.most_common(max_words)]

def document_features(document):
    features = {}
    for word in word_features:
        # print(document)
        features[word] = document.count(word)
    return features


# Create a list of documents as pairs of (text, category)
documents = [(text, category) for text, category in zip(clean_words, categories)]
random.shuffle(documents)

# Collect features for each document
featuresets = [(document_features(category), category) for word, category in documents]

# Convert featuresets to a DataFrame and save to a CSV file for Count BoW
df_featuresets = pd.DataFrame([features for features, _ in featuresets], columns=word_features)
df_featuresets['category'] = [category for _, category in featuresets]
df_featuresets.to_csv('count_bow.csv', index=False)

# Display the Count BoW representation
print(df_featuresets.head(10))

   program  students  housing  campus  pacific  yellow  ribbon  units  \
0       63        18        0       2        4       0       0      4   
1        0         0        0       0        0       0       0      0   
2        4        20       23       9        2       0       0     10   
3        1         1        0       0        0       0       0      2   
4        0        18       33      59        4       0       0     11   
5        8         4        0       0        0       0       0      2   
6        1         1        0       0        0       0       0     13   

   available  student  ...  law  behalf  certification  following  trademark  \
0         18       24  ...    0       0              0          0          0   
1          0        0  ...    0       0              0          0          0   
2         14       34  ...    0       0              0          0          0   
3          0        3  ...    0       0              0          0          0   
4          7   

In [None]:
# Task-3: Computing TF-IDF vectors on pre-processed data

In [106]:
# Compute TF-IDF vectors on pre-processed data.
from sklearn.feature_extraction.text import TfidfVectorizer


# Initialize the TF-IDF vectorizer with parameters
tfidf_vectorizer = TfidfVectorizer(max_features=100, stop_words='english')

# Fit and transform the corpus to obtain TF-IDF features
tfidf_matrix = tfidf_vectorizer.fit_transform(categories)

# Get the TF-IDF feature names (words)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# Convert the TF-IDF matrix to a dense array for easier manipulation
tfidf_matrix_dense = tfidf_matrix.toarray()

# Create a DataFrame to display the TF-IDF values
import pandas as pd
tfidf_df = pd.DataFrame(tfidf_matrix_dense, columns=tfidf_feature_names)

print(tfidf_df)

# Save the TF-IDF data to a CSV file
tfidf_df.to_csv('tfidf_data.csv', index=False)

         11      able  academic  admission  admissions  agreement       aid  \
0  0.000000  0.025422  0.000000   0.000000    0.000000   0.000000  0.000000   
1  0.007967  0.013834  0.055337   0.214378    0.027668   0.000000  0.095605   
2  0.019874  0.069018  0.051763   0.000000    0.000000   0.129179  0.000000   
3  0.000000  0.166770  0.000000   0.000000    0.000000   0.000000  0.000000   
4  0.000000  0.000000  0.040029   0.215755    0.120086   0.138315  0.000000   
5  0.000000  0.000000  0.000000   0.000000    0.338568   0.000000  0.129987   
6  0.096232  0.000000  0.033420   0.000000    0.033420   0.019246  0.019246   

   apartments  applicants  application  ...  transferring   tuition     units  \
0    0.051384    0.000000     0.000000  ...      0.000000  0.000000  0.107816   
1    0.000000    0.157202     0.127474  ...      0.000000  0.031868  0.021335   
2    0.151128    0.000000     0.019874  ...      0.000000  0.000000  0.079830   
3    0.000000    0.000000     0.000000  ...

In [None]:
# Task-4: Perform integer encoding and one-hot encoding on one of the pre-processed data files

In [113]:
# Perform integer encoding
import numpy 
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder


# for this example, we'll just look at the first document, and 
# the first 50 words
data = documents[0][1]
# print(data)
# Split the text into words
values = data.split()
# print(values)
short_values = (values[:50])

# first encode words as integers
# every word in the vocabulary gets a unique number
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(short_values)

# look at the first 50 encodings
print(integer_encoded)

with open("Integer_Encoding.txt", 'w') as file:
    for integer in integer_encoded:
        file.write(str(integer) + "\n")

[ 3 20  9 27 23 20  9 27 22  2  4  5  1  0 21 17 15 25 28 30  7 26 10  6
  2  4  5  1  0 21 17 15 25 28 30  7 26 10  6 12 18 11  8 16 29 24 13 14
 19  8]


In [116]:
# convert the integer encoding to onehot encoding
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)

print(onehot_encoded)

# invert the first vector so that we can see the original word it encodes
inverted = label_encoder.inverse_transform([argmax(onehot_encoded[0, :])])

print(inverted)

with open("One-hot_Encoding.txt", 'w') as file:
    for encoding in onehot_encoded:
        line = " ".join(map(str, encoding))
        file.write(line + "\n")

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
['How']




In [None]:
# Task-5: find the words that are the most similar to the word 'campus' it in one of the pre-processed data files

In [120]:
import gensim
import nltk
from nltk.corpus import movie_reviews
from gensim.models import Word2Vec

# Extract the tokens from the doc object and store them in a list
tokens = clean_words

# Create and train the Word2Vec model
model = gensim.models.Word2Vec(sentences=[tokens], min_count=1, vector_size=100, sg=0)

# Find similar words to 'graduate'
similar_words = model.wv.most_similar(positive=['campus'], topn=25)

# Display the similar words
for word, score in similar_words:
    print(f"{word}: {score}")


yellow: 0.7226392030715942
students: 0.7035462260246277
ribbon: 0.6817548871040344
station: 0.6632461547851562
university: 0.6621088981628418
program: 0.6616090536117554
housing: 0.6605948209762573
available: 0.6577374935150146
units: 0.6506310105323792
service: 0.6304279565811157
pacific: 0.6235244274139404
office: 0.62343829870224
residential: 0.6178445219993591
space: 0.6149711012840271
apartments: 0.6142014861106873
admissions: 0.6053345799446106
sacramento: 0.5987273454666138
award: 0.5973877310752869
include: 0.5966724157333374
field: 0.5951021909713745
residence: 0.5939453840255737
va: 0.5935418605804443
roommate: 0.5931534767150879
trimester: 0.5897179245948792
number: 0.5890599489212036
