In [None]:
# 1. Data cleaning

In [49]:
# Clean the data file by removing non-text (e.g. emojis, smart quotes) and regularizing text 
#(e.g. tokenization, lower casing, stemming, lemmatizing, POS tagging, stop word removal, removing punctuation, spelling correction)
import demoji
import json
import spacy
from spacy.lang.en import English
from spellchecker import SpellChecker
from nltk.stem.porter import PorterStemmer

# Load the FAQ data. The file is read explicitly as UTF-8 because it is expected to
# contain emojis and smart quotes, which a platform-default codec (e.g. cp1252) can
# mis-decode or reject.
path = "faqsFromPdf.json"
with open(path, 'r', encoding='utf-8') as json_file:
    text = json.load(json_file)

# `all_text` is the whole corpus as one string; `categories` holds one string per
# top-level key of the JSON (each key maps to a list of {"question", "answer"} items).
categories = []
all_text = ""
for faq_list in text.values():
    group_parts = []
    for faq_item in faq_list:
        all_text += faq_item["question"] + " " + faq_item["answer"] + " "
        group_parts.extend((faq_item["question"], faq_item["answer"]))
    # Join with spaces so the last word of one field is not fused with the first
    # word of the next (e.g. "housing?Yes").
    group_text = " ".join(group_parts)
    categories.append(group_text)

# Removing emojis
clean_text = demoji.replace(all_text, "")
# Replace smart quotes (double and single) with their plain ASCII equivalents
smart_quote_table = str.maketrans({"“": "\"", "”": "\"", "‘": "'", "’": "'"})
clean_text = clean_text.translate(smart_quote_table)
# convert text to lower-case
clean_text = clean_text.lower()

spell = SpellChecker()
stemmer = PorterStemmer()

# Find and correct spelling errors.
# Corrections are cached per distinct word, so a word that occurs many times is
# looked up once; the resulting text is the same as correcting every occurrence.
clean_text = clean_text.split()
correction_cache = {}
corrected_words = []
for word in clean_text:
    if word not in correction_cache:
        # Only words the checker flags as unknown are sent for correction
        suggestion = spell.correction(word) if spell.unknown([word]) else None
        # If there is no suggestion (None), keep the original word
        correction_cache[word] = word if suggestion is None else suggestion
    corrected_words.append(correction_cache[word])
# Join the corrected words back into a string
corrected_text = " ".join(corrected_words)

# Tokenizing using spaCy
nlp = spacy.load('en_core_web_sm')
doc = nlp(corrected_text)

# Lemmatize (spaCy) and stem (Porter stemmer) every token: (text, lemma, stem)
lemmatized_and_stemmed_words = [
    (token.text, token.lemma_, stemmer.stem(token.text)) for token in doc
]

# Drop white space, stop words, and punctuation once, then derive both views
content_tokens = [
    token for token in doc
    if not (token.is_space or token.is_stop or token.is_punct)
]
clean_words = [token.text for token in content_tokens]
posArray = [(token.text, token.pos_) for token in content_tokens]

# Persist the cleaned tokens; the file contains the Python list repr of `clean_words`
with open("cleaned_data.txt", "w", encoding='utf-8') as txt_file:
    txt_file.write(str(clean_words))

In [None]:
# 2. Setting up a binary classification task: is a question related to Stockton housing or not?

In [1]:
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the FAQ JSON data. Read explicitly as UTF-8 so decoding does not depend on
# the platform's default codec.
path = "faqsFromPdfCh9.json"
with open(path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Extract questions and labels.
# `data` maps a category name to a list of FAQ items; only each item's
# 'question' text is used as the model input.
questions = []
labels = []

for category, faqs in data.items():
    # Label is 1 if the category is Stockton housing, else 0 (same for every FAQ in it)
    label = 1 if category == 'Stockton-Housing' else 0
    for faq in faqs:
        questions.append(faq['question'])
        labels.append(label)

# Split the data into training (80%) and testing (20%) sets; the fixed
# random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(questions, labels, test_size=0.2, random_state=42)

In [None]:
# 3. Computing count (bag-of-words) vectors on the text data
# Creating the vectorizer and then using it to convert the text to term-count format (TF-IDF weighting is applied later, inside the SVC pipeline)

In [2]:
# Learn the vocabulary from the training questions only, so nothing from the
# test set influences the feature space.
vectorizer = CountVectorizer().fit(X_train)

# Encode both splits as term-count matrices using that same vocabulary
X_train_vectorized = vectorizer.transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [None]:
# 4. Binary classification problem with the Naïve Bayes classifier.
# Classifying the documents into Stockton-housing and non-Stockton-housing with multinomial Naïve Bayes from scikit-learn

In [44]:
# Fit a multinomial Naive Bayes model on the vectorized training questions
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB().fit(X_train_vectorized, y_train)
# Leave the fitted estimator as the cell's final expression so it is displayed
classifier


In [None]:
# Finding accuracy based on test set

In [45]:
# Predict labels for the held-out questions and report the fraction predicted correctly
y_pred = classifier.predict(X_test_vectorized)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9666666666666667


In [46]:
# Confusion matrix of raw counts for the current `y_pred`
# (rows are true labels, columns are predicted labels)
from sklearn.metrics import confusion_matrix

conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[27  0]
 [ 1  2]]


In [None]:
# 5. Binary classification problem with the SVC classifier.

In [47]:
# Train an SVC classifier
# TfidfVectorizer is imported here because no other cell imports it; without this
# the cell raises NameError on a fresh kernel (Restart & Run All).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Create a pipeline with a TF-IDF vectorizer and a linear SVC classifier.
# The pipeline takes the raw question strings: vectorization (English stop words
# removed, vocabulary capped at 100 features) happens inside it.
svc_tfidf = Pipeline([
    ("tfidf_vectorizer", TfidfVectorizer(stop_words="english", max_features=100)),
    ("linear_svc", SVC(kernel="linear"))
])

# Train the pipeline
svc_tfidf.fit(X_train, y_train)

# Predict labels on the test set (this overwrites the Naive Bayes `y_pred`)
y_pred = svc_tfidf.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
# Per-class precision/recall/F1 as text; kept in `report` for inspection, not printed
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')

Accuracy: 0.9333333333333333


In [48]:
# Confusion matrix of raw counts for the current `y_pred`
# (rows are true labels, columns are predicted labels)
from sklearn.metrics import confusion_matrix

conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[27  0]
 [ 2  1]]
