In [5]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import spacy
import spacy.cli
# Load spaCy's small English pipeline. Download it only when it is not already
# installed: an unconditional download re-installs the package on every run of
# this cell, requires network access, and ends with a "restart to reload
# dependencies" warning.
SPACY_MODEL = "en_core_web_sm"
try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    # spacy.load raises OSError when the model package cannot be found
    spacy.cli.download(SPACY_MODEL)
    nlp = spacy.load(SPACY_MODEL)

# Folder path
folder_path = 'C:/Users/Administrator/Desktop/MAN7916/local_data/About'

# Read .txt files into a DataFrame (one row per file)
file_data = []
# os.listdir returns entries in arbitrary order; sort so the row order is reproducible
for file in sorted(os.listdir(folder_path)):
    if file.endswith('.txt'):
        filepath = os.path.join(folder_path, file)
        with open(filepath, 'r', encoding='utf-8') as f:
            text = f.read()
        file_data.append({
            'name': os.path.splitext(file)[0],
            'filepath': filepath,
            'text': text
        })

# Name the columns explicitly so the frame still has a 'text' column when the
# folder contains no .txt files (otherwise the later df['text'] lookup fails).
df = pd.DataFrame(file_data, columns=['name', 'filepath', 'text'])

# Preprocessing functions
def preprocess_text(doc, remove_stopwords):
    """Return the lowercased text of the alphabetic tokens in ``doc``.

    Args:
        doc: Iterable of tokens exposing ``is_alpha``, ``is_stop`` and ``text``.
        remove_stopwords: When truthy, tokens whose ``is_stop`` is set are dropped too.

    Returns:
        list[str]: Lowercased token texts, in their original order.
    """
    def _keep(tok):
        # Non-alphabetic tokens are always discarded
        if not tok.is_alpha:
            return False
        # Stopwords are discarded only when stopword removal was requested
        return not (remove_stopwords and tok.is_stop)

    return [tok.text.lower() for tok in doc if _keep(tok)]



[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [6]:
# Parse each document once with spaCy and derive both token lists
# (stopwords kept / stopwords removed) from the same parsed doc.
preprocessed_ws, preprocessed_wos = [], []

for parsed in nlp.pipe(df['text']):
    for bucket, drop_stops in ((preprocessed_ws, False), (preprocessed_wos, True)):
        bucket.append(preprocess_text(parsed, remove_stopwords=drop_stops))

# Store one token list per document alongside the raw text
df['preprocessed_ws'] = preprocessed_ws
df['preprocessed_wos'] = preprocessed_wos

# Count word frequencies
def get_word_frequencies(token_lists):
    """Tally how often each token occurs across all of ``token_lists``.

    Args:
        token_lists: Iterable whose items are each passed to ``Counter.update``
            (here: one list of tokens per document).

    Returns:
        collections.Counter: Combined counts over every item.
    """
    totals = Counter()
    for doc_tokens in token_lists:
        # update() adds to the running counts rather than replacing them
        totals.update(doc_tokens)
    return totals

# Corpus-wide word counts for each preprocessing variant
counter_ws, counter_wos = (
    get_word_frequencies(df[column])
    for column in ('preprocessed_ws', 'preprocessed_wos')
)

# Plot top 100 words
def plot_top_words(counter, title, output_path, top_n=100):
    """Save a bar chart of the most frequent words in ``counter``.

    Args:
        counter: Object with a ``most_common(n)`` method (e.g. ``collections.Counter``)
            mapping words to their counts.
        title: Chart title.
        output_path: Destination path handed to ``savefig``.
        top_n: Number of most frequent words to plot. Defaults to 100, the value
            that was previously hard-coded.

    Raises:
        ValueError: If there are no words to plot (empty counter or ``top_n`` < 1).
    """
    most_common = counter.most_common(top_n)
    if not most_common:
        # zip(*[]) below would otherwise fail with an opaque
        # "not enough values to unpack" ValueError.
        raise ValueError("No words to plot: the counter is empty or top_n is less than 1.")
    words, counts = zip(*most_common)

    fig, ax = plt.subplots(figsize=(20, 8))
    try:
        ax.bar(words, counts)
        ax.set_title(title, fontsize=16)
        ax.tick_params(axis="x", labelrotation=90)
        ax.set_xlabel("Words", fontsize=12)
        ax.set_ylabel("Frequency", fontsize=12)
        fig.tight_layout()
        fig.savefig(output_path)
    finally:
        # Release the figure even if drawing or saving raised
        plt.close(fig)

# Output folder for assignment (created on first run)
output_folder = 'C:/Users/Administrator/Desktop/MAN7916/assignments/submissions/assignment_4'
os.makedirs(output_folder, exist_ok=True)

# Save one figure per preprocessing variant
for _counter, _title, _filename in (
    (counter_ws, "Top 100 Words (With Stopwords)", 'common100_ws.jpg'),
    (counter_wos, "Top 100 Words (Without Stopwords)", 'common100_wos.jpg'),
):
    plot_top_words(_counter, _title, os.path.join(output_folder, _filename))

print("Processing complete. Figures saved in the 'assignment_4' folder.")


Processing complete. Figures saved in the 'assignment_4' folder.


## Innovativeness analysis

In [7]:
# Step 4: Filter by Innovativeness Dictionary
# List of words and phrases from the 'Innovativeness' dictionary.
# NOTE(review): the code below compares these entries verbatim against the keys of
# counter_ws and the items of df['preprocessed_ws'], which preprocess_text builds
# from lowercased, alphabetic-only tokens. Entries containing a space, a hyphen,
# '&', or an uppercase letter (e.g. "new product", "re-invent", "R&D") therefore
# cannot match as written — confirm whether phrase matching was intended.
innovativeness_dict = [
    "ad lib", "adroit", "adroitness", "bright idea", "clever", "cleverness", "conceive", "concoct", "concoction", 
    "concoctive", "conjure up", "creative", "creativity", "develop", "developed", "dream", "dream up", "expert", 
    "formulation", "freethinker", "genesis", "genius", "gifted", "hit upon", "imagination", "imaginative", "improvise", 
    "ingenious", "ingenuity", "innovate", "innovated", "innovates", "innovating", "innovation", "innovations", 
    "innovative", "innovativeness", "introduced", "introducing", "introduction", "introductions", "invent", "invented", 
    "invention", "inventive", "inventiveness", "inventor", "launch", "launched", "launching", "master stroke", "mastermind", 
    "metamorphose", "metamorphosis", "neoteric", "neoterism", "neoterize", "new capabilities", "new capability", 
    "new compounds", "new content", "new core areas", "new course", "new directions", "new family", "new features", 
    "new generation", "new generations", "new idea", "new ideas", "new line of business", "new medicine", "new medicines", 
    "new molecular entities", "new pharmaceuticals", "new platform", "new process", "new processes", "new product", 
    "new products", "new solutions", "new systems", "new technique", "new techniques", "new technologies", "new technology", 
    "new therapies", "new thinking", "new tools", "new treatments", "new ways", "new wrinkle", "new-generation", 
    "new-product", "next generation", "next-generation", "novation", "novel", "novelty", "patent", "patented", "patents", 
    "process development", "product development", "product launch", "product launches", "proprietary", "prototype", 
    "prototyping", "push the envelope", "R&D", "radical", "re-engineering", "reformulated", "refreshed", "reinvent", 
    "re-invent", "reinvented", "reinventing", "reinvention", "reinvents", "released", "renewal", "renewing", "research", 
    "reshape", "reshaped", "reshapes", "reshaping", "resourceful", "resourcefulness", "restyle", "restyling", 
    "revolutionary", "revolutionize", "revolutionized", "roll out", "rolled out", "see things", "technologically advanced", 
    "think up", "trademark", "transform", "transformation", "transformed", "transforming", "visualize"]


# Filter Counter for only the words in the 'Innovativeness' dictionary.
# Membership is tested against a set so each lookup is O(1) instead of scanning
# the whole dictionary list once per vocabulary word; the result is unchanged.
innovativeness_terms = set(innovativeness_dict)
filtered_innovativeness = {word: count for word, count in counter_ws.items() if word in innovativeness_terms}

# Sort by frequency (descending) and keep at most the top 20; ties keep their
# original relative order because sorted() is stable.
sorted_innovativeness = dict(sorted(filtered_innovativeness.items(), key=lambda x: x[1], reverse=True)[:20])

# Plot top Innovativeness words
def plot_innovativeness_words(sorted_innov, output_path):
    """Save a line chart of word frequencies for the innovativeness words.

    Args:
        sorted_innov: Mapping of word -> count; words are plotted in the mapping's
            iteration order.
        output_path: Destination path handed to ``plt.savefig``.
    """
    labels = list(sorted_innov.keys())
    frequencies = [sorted_innov[label] for label in labels]

    plt.figure(figsize=(10, 6))
    plt.plot(labels, frequencies, marker='o')
    # Axis decoration
    plt.xticks(rotation=45)
    plt.xlabel("Words")
    plt.ylabel("Frequency")
    plt.title("Top Innovativeness Words")
    # Write the image, then drop the current figure
    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()

# Write the innovativeness chart into the assignment output folder
innov_chart_path = os.path.join(output_folder, 'innov_freqs.jpg')
plot_innovativeness_words(
    sorted_innov=sorted_innovativeness,
    output_path=innov_chart_path,
)

# Add 'innov_ws' and 'innov_perwd_ws' columns
def count_innov_words(words, innov_dict):
    """Count how many items of ``words`` equal an entry of ``innov_dict``.

    Matching is exact element equality, so a multi-word entry such as
    "new product" only counts if ``words`` contains that exact string as a
    single element.

    Args:
        words: Iterable of hashable tokens (e.g. a list of strings).
        innov_dict: Iterable of dictionary entries. An entry listed more than
            once contributes its count once per listing.

    Returns:
        int: Total number of matches.
    """
    # Tally the tokens once instead of rescanning ``words`` for every entry
    occurrences = Counter(words)
    # Counter returns 0 for absent keys, so no membership check is needed
    return sum(occurrences[term] for term in innov_dict)

# Dictionary hits per document, and hits divided by the document's token count
df['innov_ws'] = df['preprocessed_ws'].map(
    lambda tokens: count_innov_words(tokens, innovativeness_dict)
)
df['innov_perwd_ws'] = df['innov_ws'].div(df['preprocessed_ws'].map(len))

# Save the dataset as CSV (without the index column)
output_csv = os.path.join(output_folder, 'innov_aussie_data.csv')
df.to_csv(output_csv, index=False)

print("Innovativeness analysis complete. Files saved in the 'assignment_4' folder.")


Innovativeness analysis complete. Files saved in the 'assignment_4' folder.
