<a href="https://colab.research.google.com/github/angho8/Architecture-Analysis/blob/main/English_491_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries if they're not already available in your Colab environment
!pip install requests beautifulsoup4 nltk spacy sklearn

import csv
import requests
from bs4 import BeautifulSoup
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import files
import gensim
from gensim import corpora
import string


Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


ModuleNotFoundError: No module named 'gensim'

In [None]:
# Download the NLTK resources used by the analysis functions below.
# Recent NLTK releases make word_tokenize look up 'punkt_tab' instead of the
# pickled 'punkt' models; both are fetched so tokenization works on either.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('vader_lexicon')

# spaCy pipeline used by extract_entities for named entity recognition.
nlp = spacy.load("en_core_web_sm")


In [None]:
# Function to read content from a .txt file
def read_text_from_file(file_path):
    """Return the UTF-8 decoded contents of ``file_path``.

    This is a best-effort read: any exception raised while opening or reading
    the file is reported with ``print`` and an empty string is returned
    instead of propagating the error.
    """
    contents = ""
    try:
        with open(file_path, mode='r', encoding='utf-8') as source:
            contents = source.read()
    except Exception as err:
        print(f"Error reading file {file_path}: {err}")
    return contents

In [None]:
# Function to process and analyze the text
def analyze_text(text):
    """Compute word frequencies and a VADER sentiment score for ``text``.

    Returns a ``(freq_dist, sentiment_score)`` tuple:
      * ``freq_dist`` - an NLTK ``FreqDist`` over the lowercased, purely
        alphabetic tokens of ``text`` that are not English stopwords.
      * ``sentiment_score`` - the result of ``polarity_scores`` on the
        unmodified ``text``.
    """
    raw_tokens = word_tokenize(text)
    english_stopwords = set(stopwords.words('english'))

    # Single pass: drop non-alphabetic tokens, lowercase, drop stopwords.
    content_words = []
    for token in raw_tokens:
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered not in english_stopwords:
            content_words.append(lowered)

    word_frequencies = FreqDist(content_words)

    # Sentiment is scored on the original text, not the filtered tokens.
    polarity = SentimentIntensityAnalyzer().polarity_scores(text)

    return word_frequencies, polarity

In [None]:
# Named Entity Recognition (NER) to extract key architectural entities
def extract_entities(text):
    """Group spaCy named entities in ``text`` into architecture-related buckets.

    Runs the module-level ``nlp`` pipeline over ``text`` and sorts the
    recognized entities by label:

      * ``PERSON``        -> ``"architects"``
      * ``FAC``           -> ``"buildings"`` (facilities, usually building names)
      * ``DATE``, ``TIME`` -> ``"periods"``

    In spaCy's English label scheme ``TIME`` only covers times shorter than a
    day, while dates and periods such as "the 1920s" are tagged ``DATE``; both
    are collected so that period mentions are not silently dropped.

    Returns:
        dict with keys ``"architects"``, ``"buildings"`` and ``"periods"``,
        each a list of entity strings in document order (duplicates kept).
    """
    label_to_bucket = {
        "PERSON": "architects",
        "FAC": "buildings",
        "DATE": "periods",
        "TIME": "periods",
    }
    entities = {"architects": [], "buildings": [], "periods": []}

    for ent in nlp(text).ents:
        bucket = label_to_bucket.get(ent.label_)
        if bucket is not None:  # entities with any other label are ignored
            entities[bucket].append(ent.text)

    return entities

In [None]:
# Cosine similarity to measure the similarity between two articles
def calculate_similarity(texts):
    """Return the cosine similarity of the first text against every text.

    ``texts`` is vectorized with TF-IDF (English stopwords removed); the
    result of ``cosine_similarity`` between the first row and the full matrix
    is returned, so entry ``[0][i]`` compares text 0 with text ``i``.
    """
    tfidf = TfidfVectorizer(stop_words='english').fit_transform(texts)
    first_document = tfidf[0:1]
    return cosine_similarity(first_document, tfidf)


In [None]:
# Topic modeling using LDA (Latent Dirichlet Allocation)
def perform_topic_modeling(texts, num_topics=5, passes=10, random_state=42):
    """Discover topics across ``texts`` with gensim's LDA.

    Each text is lowercased, tokenized with NLTK and reduced to alphabetic,
    non-stopword tokens before a gensim dictionary and bag-of-words corpus
    are built from the result.

    Args:
        texts: Sequence of document strings.
        num_topics: Number of topics to fit.
        passes: Number of training passes over the corpus.
        random_state: Seed forwarded to the LDA model so repeated runs start
            from the same state. Pass ``None`` for an unseeded run.
            NOTE(review): with the multicore trainer, results may still vary
            slightly between runs - confirm if exact reproducibility matters.

    Returns:
        The list returned by ``print_topics`` (five words per topic), or an
        empty list when there are no texts or no usable vocabulary, since LDA
        cannot be trained on an empty collection.
    """
    # Nothing to model; bail out before touching any NLP resources.
    if not texts:
        return []

    stop_words = set(stopwords.words('english'))
    processed_texts = [
        [
            word
            for word in word_tokenize(text.lower())
            if word.isalpha() and word not in stop_words
        ]
        for text in texts
    ]

    # Create a dictionary and corpus
    dictionary = corpora.Dictionary(processed_texts)
    if len(dictionary) == 0:
        # Every token was filtered out (e.g. only stopwords or punctuation).
        return []
    corpus = [dictionary.doc2bow(tokens) for tokens in processed_texts]

    # Apply LDA
    lda_model = gensim.models.LdaMulticore(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )

    # Request all fitted topics explicitly; the default would cap the listing at 20.
    return lda_model.print_topics(num_topics=num_topics, num_words=5)

In [None]:
# Function to write results to a CSV file
def write_results_to_csv(results, filename='output_results.csv'):
    """Write analysis rows to ``filename`` as UTF-8 CSV.

    A fixed header row is written first, followed by every row in
    ``results`` in order. A confirmation message is printed once the file
    has been written.
    """
    header_row = [
        'Filename',
        'Most Common Words',
        'Sentiment',
        'Architects',
        'Buildings',
        'Periods',
        'Topics',
    ]

    with open(filename, mode='w', newline='', encoding='utf-8') as csv_file:
        out = csv.writer(csv_file)
        out.writerow(header_row)
        out.writerows(results)

    print(f"Results saved to {filename}")

In [None]:
# Main function to process multiple text files
def main():
    """Upload .txt files in Colab and run the full analysis on them.

    For every uploaded file that can be read, prints its ten most common
    words, its sentiment scores and its extracted entities. Afterwards the
    collected articles are compared for similarity and topic-modeled. If no
    file yields any text, the cross-article steps are skipped, because both
    TF-IDF and LDA fail on an empty collection.
    """
    # Upload .txt files for offline analysis
    uploaded = files.upload()

    # List to store articles from .txt files
    articles = []

    # Process uploaded .txt files
    for filename in uploaded.keys():
        print(f"Processing file: {filename}...")

        # Each uploaded name is read back from disk as a path.
        # NOTE(review): relies on Colab saving uploads into the working directory - confirm.
        text = read_text_from_file(filename)

        # Empty or unreadable files are skipped entirely.
        if text:
            articles.append(text)
            print(f"Analyzing text from {filename}...")
            freq_dist, sentiment_score = analyze_text(text)
            print(f"Most common words in {filename}: {freq_dist.most_common(10)}")
            print(f"Sentiment analysis for {filename}: {sentiment_score}")
            entities = extract_entities(text)
            print(f"Extracted entities from {filename}: {entities}")
            print("-" * 50)

    # Without at least one article there is nothing to vectorize or model.
    if not articles:
        print("No readable articles were uploaded; skipping comparison and topic modeling.")
        return

    # Compare similarities between the articles
    compare_articles(articles)

    # Perform topic modeling to discover themes in the articles
    topics = perform_topic_modeling(articles)
    print("Discovered Topics from the Articles:")
    for idx, topic in enumerate(topics):
        print(f"Topic {idx + 1}: {topic}")

In [None]:
# Function to compare multiple articles
def compare_articles(articles):
    """Print how similar the first article is to every article.

    Uses ``calculate_similarity`` to obtain the similarity row of the first
    article against all of them (itself included) and prints one line per
    article. With an empty ``articles`` list a short notice is printed and
    nothing is computed, since TF-IDF vectorization fails on an empty
    collection.
    """
    print("Comparing articles for architectural similarities...")
    if not articles:
        print("No articles to compare.")
        return

    similarities = calculate_similarity(articles)

    # Row 0 holds the first article's similarity to each article in order.
    for position, score in enumerate(similarities[0], start=1):
        print(f"Similarity with article {position}: {score}")

# Entry point: run the whole pipeline when this code executes as the main
# module (as opposed to being imported from elsewhere).
if __name__ == "__main__":
    main()