In [1]:

# Today a friend asked me to help him read through a bunch of essays for scholarships for our fraternity (Sigma Chi). 
# We have to read 800+ word documents. I thought it would be useful to analyze the text in each of the documents and quantify
# the character, or quality, of each essay so I can rank them, and thus, filter out the best ones from the whole bunch.
# Below you can see my attampts to achieve this objective.


In [2]:

import os
import nltk
from textblob import TextBlob
import spacy
from docx import Document
from collections import Counter
import readability
from langdetect import detect

# Download NLTK resources
nltk.download('punkt')

# Load English language model for spaCy
nlp = spacy.load("en_core_web_sm")

# Emotion Analysis using TextBlob
def analyze_emotion(text):
    blob = TextBlob(text)
    emotions = {'positive': 0, 'neutral': 0, 'negative': 0}
    for sentence in blob.sentences:
        sentiment = sentence.sentiment.polarity
        if sentiment > 0:
            emotions['positive'] += 1
        elif sentiment < 0:
            emotions['negative'] += 1
        else:
            emotions['neutral'] += 1
    return emotions


# Language Proficiency Analysis using spaCy
def analyze_language_proficiency(text):
    doc = nlp(text)
    # Calculate vocabulary richness (unique word count / total word count)
    unique_words = set(token.text.lower() for token in doc if token.is_alpha)
    vocabulary_richness = len(unique_words) / len(doc)
    # Calculate average sentence length (total word count / sentence count)
    sentence_count = len(list(doc.sents))
    total_word_count = len(doc)
    avg_sentence_length = total_word_count / sentence_count
    # Calculate average word length (total character count / total word count)
    total_char_count = sum(len(token.text) for token in doc if token.is_alpha)
    avg_word_length = total_char_count / total_word_count
    return vocabulary_richness, avg_sentence_length, avg_word_length

# Coherence Analysis
def analyze_coherence(text):
    coherence_score = len(text.split()) / 1000  # Example: coherence based on word count
    return coherence_score

# Grammar Analysis
def analyze_grammar(text):
    doc = nlp(text)
    grammar_errors = 0
    for token in doc:
        if token.pos_ == "VERB" and token.tag_ != "VBG" and token.tag_ != "VBN":
            grammar_errors += 1
    grammar_score = max(0, 1 - (grammar_errors / len(doc)))
    return grammar_score

# Readability Analysis
def analyze_readability(text):
    # Example: Calculate Flesch Reading Ease Score
    return readability.getmeasures(text, lang='en')['readability grades']['FleschReadingEase']

# Plagiarism Detection
# You can integrate a plagiarism detection library or service here

# Path to directory containing .docx files
essays_directory = 'C:\\Users\\ryan_\\Desktop\\essays'

# Loop through all files in the directory
for filename in os.listdir(essays_directory):
    if filename.endswith(".docx"):  # Assuming essays are in docx files
        file_path = os.path.join(essays_directory, filename)
        # Open the .docx file
        doc = Document(file_path)
        # Read the text from paragraphs
        essay_text = '\n'.join([paragraph.text for paragraph in doc.paragraphs])
        # Analyze the essay
        emotion_result = analyze_emotion(essay_text)
        #style_result = analyze_style(essay_text)
        vocabulary_richness, avg_sentence_length, avg_word_length = analyze_language_proficiency(essay_text)
        coherence_score = analyze_coherence(essay_text)
        grammar_score = analyze_grammar(essay_text)
        readability_score = analyze_readability(essay_text)
        # Detect language
        language = detect(essay_text)
        # Print the filename along with the analyzed results
        print(f"Essay: {filename}")
        print("Emotion Analysis:")
        for emotion, count in emotion_result.items():
            print(f"{emotion.capitalize()}: {count}")
        print("\nLanguage Proficiency Analysis:")
        print(f"Vocabulary Richness: {vocabulary_richness:.2f}")
        print(f"Average Sentence Length: {avg_sentence_length:.2f}")
        print(f"Average Word Length: {avg_word_length:.2f}")
        print("\nCoherence Score:", coherence_score)
        print("Grammar Score:", grammar_score)
        print("\nReadability Score:", readability_score)
        print("\nLanguage:", language)
        print("\n")
        

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ryan_\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Essay: background, interest, and talent.docx
Emotion Analysis:
Positive: 19
Neutral: 3
Negative: 4

Language Proficiency Analysis:
Vocabulary Richness: 0.39
Average Sentence Length: 30.40
Average Word Length: 3.64

Coherence Score: 0.833
Grammar Score: 0.9089912280701754

Readability Score: -1.8814790925266465

Language: en


Essay: change the world.docx
Emotion Analysis:
Positive: 10
Neutral: 1
Negative: 1

Language Proficiency Analysis:
Vocabulary Richness: 0.48
Average Sentence Length: 35.33
Average Word Length: 4.12

Coherence Score: 0.374
Grammar Score: 0.9080188679245284

Readability Score: -50.640661444734505

Language: en


Essay: child of immigrant parents.docx
Emotion Analysis:
Positive: 11
Neutral: 8
Negative: 7

Language Proficiency Analysis:
Vocabulary Richness: 0.41
Average Sentence Length: 25.11
Average Word Length: 4.21

Coherence Score: 0.611
Grammar Score: 0.8997050147492626

Readability Score: -75.87461709886546

Language: en


Essay: never give up.docx
Emotion Analy

In [8]:

import os
import nltk
import spacy
from docx import Document
from collections import Counter
import readability

# Load English language model for spaCy
nlp = spacy.load("en_core_web_sm")

# Language Proficiency Analysis using spaCy
def analyze_language_proficiency(text):
    doc = nlp(text)
    # Calculate vocabulary richness (unique word count / total word count)
    unique_words = set(token.text.lower() for token in doc if token.is_alpha)
    vocabulary_richness = len(unique_words) / len(doc)
    # Calculate coherence score (example: coherence based on sentence length)
    sentence_lengths = [len(sent) for sent in doc.sents]
    coherence_score = sum(sentence_lengths) / len(sentence_lengths)
    # Calculate grammar score (example: ratio of verbs to total words)
    verb_count = len([token for token in doc if token.pos_ == "VERB"])
    grammar_score = verb_count / len(doc)
    return vocabulary_richness, coherence_score, grammar_score

# Readability Analysis
def analyze_readability(text):
    # Example: Calculate Flesch Reading Ease Score
    return readability.getmeasures(text, lang='en')['readability grades']['FleschReadingEase']

# Assign rankings based on scores
def assign_rank(score, scores_list):
    percentile = sum(score > s for s in scores_list) / len(scores_list)
    rank = round(percentile * 10) + 1  # Adjusted to start from rank 1
    return rank

# Path to directory containing .docx files
essays_directory = 'C:\\Users\\ryan_\\Desktop\\essays'

# Store scores for each feature for all documents
vocabulary_richness_scores = []
coherence_scores = []
grammar_scores = []
readability_scores = []

# Loop through all files in the directory
for filename in os.listdir(essays_directory):
    if filename.endswith(".docx"):  # Assuming essays are in docx files
        file_path = os.path.join(essays_directory, filename)
        # Open the .docx file
        doc = Document(file_path)
        # Read the text from paragraphs
        essay_text = '\n'.join([paragraph.text for paragraph in doc.paragraphs])
        # Analyze the essay
        vocabulary_richness, coherence_score, grammar_score = analyze_language_proficiency(essay_text)
        readability_score = analyze_readability(essay_text)
        # Store scores for each feature
        vocabulary_richness_scores.append(vocabulary_richness)
        coherence_scores.append(coherence_score)
        grammar_scores.append(grammar_score)
        readability_scores.append(readability_score)

        # Assign ranks based on scores
        vocabulary_richness_rank = assign_rank(vocabulary_richness, vocabulary_richness_scores)
        coherence_rank = assign_rank(coherence_score, coherence_scores)
        grammar_rank = assign_rank(grammar_score, grammar_scores)
        readability_rank = assign_rank(readability_score, readability_scores)

        # Print ranks for each feature
        print(filename)
        print(f"Vocabulary Richness Rank: {vocabulary_richness_rank}")
        print(f"Coherence Rank: {coherence_rank}")
        print(f"Grammar Rank: {grammar_rank}")
        print(f"Readability Rank: {readability_rank}")
        print('----------')


background, interest, and talent.docx
Vocabulary Richness Rank: 1
Coherence Rank: 1
Grammar Rank: 1
Readability Rank: 1
----------
change the world.docx
Vocabulary Richness Rank: 6
Coherence Rank: 6
Grammar Rank: 6
Readability Rank: 1
----------
child of immigrant parents.docx
Vocabulary Richness Rank: 4
Coherence Rank: 1
Grammar Rank: 1
Readability Rank: 1
----------
never give up.docx
Vocabulary Richness Rank: 6
Coherence Rank: 3
Grammar Rank: 1
Readability Rank: 6
----------
open topic.docx
Vocabulary Richness Rank: 5
Coherence Rank: 1
Grammar Rank: 1
Readability Rank: 9
----------
personal hardship.docx
Vocabulary Richness Rank: 3
Coherence Rank: 3
Grammar Rank: 8
Readability Rank: 1
----------
preserving your culture.docx
Vocabulary Richness Rank: 7
Coherence Rank: 5
Grammar Rank: 1
Readability Rank: 4
----------
something had a big impact on your life.docx
Vocabulary Richness Rank: 7
Coherence Rank: 2
Grammar Rank: 10
Readability Rank: 6
----------
study abroad.docx
Vocabulary Ri

In [14]:

import os
import nltk
import spacy
from docx import Document
from collections import Counter
import readability

# Load English language model for spaCy
nlp = spacy.load("en_core_web_sm")

# Language Proficiency Analysis using spaCy
def analyze_language_proficiency(text):
    doc = nlp(text)
    # Calculate vocabulary richness (unique word count / total word count)
    unique_words = set(token.text.lower() for token in doc if token.is_alpha)
    vocabulary_richness = len(unique_words) / len(doc)
    # Calculate coherence score (example: coherence based on sentence length)
    sentence_lengths = [len(sent) for sent in doc.sents]
    coherence_score = sum(sentence_lengths) / len(sentence_lengths)
    # Calculate grammar score (example: ratio of verbs to total words)
    verb_count = len([token for token in doc if token.pos_ == "VERB"])
    grammar_score = verb_count / len(doc)
    return vocabulary_richness, coherence_score, grammar_score

# Readability Analysis
def analyze_readability(text):
    # Example: Calculate Flesch Reading Ease Score
    return readability.getmeasures(text, lang='en')['readability grades']['FleschReadingEase']

# Assign rankings based on scores
def assign_rank(score, scores_list, reverse=False):
    sorted_scores = sorted(scores_list, reverse=reverse)
    percentile = sorted_scores.index(score) / len(sorted_scores)
    rank = round(percentile * 10) + 1  # Adjusted to start from rank 1
    return rank

# Path to directory containing .docx files
essays_directory = 'C:\\Users\\ryan_\\Desktop\\essays'

# Store scores for each feature for all documents
vocabulary_richness_scores = []
coherence_scores = []
grammar_scores = []
readability_scores = []

# Loop through all files in the directory
for filename in os.listdir(essays_directory):
    if filename.endswith(".docx"):  # Assuming essays are in docx files
        file_path = os.path.join(essays_directory, filename)
        # Open the .docx file
        doc = Document(file_path)
        # Read the text from paragraphs
        essay_text = '\n'.join([paragraph.text for paragraph in doc.paragraphs])
        # Analyze the essay
        vocabulary_richness, coherence_score, grammar_score = analyze_language_proficiency(essay_text)
        readability_score = analyze_readability(essay_text)
        # Store scores for each feature
        vocabulary_richness_scores.append(vocabulary_richness)
        coherence_scores.append(coherence_score)
        grammar_scores.append(grammar_score)
        readability_scores.append(readability_score)

        # Assign ranks based on scores
        vocabulary_richness_rank = assign_rank(min(vocabulary_richness_scores), vocabulary_richness_scores, reverse=True)
        coherence_rank = assign_rank(min(coherence_scores), coherence_scores, reverse=True)
        grammar_rank = assign_rank(min(grammar_scores), grammar_scores, reverse=True)
        readability_rank = assign_rank(max(readability_scores), readability_scores)

        # Print ranks for each feature
        print(filename)
        print(f"Vocabulary Richness Rank: {vocabulary_richness_rank}")
        print(f"Coherence Rank: {coherence_rank}")
        print(f"Grammar Rank: {grammar_rank}")
        print(f"Readability Rank: {readability_rank}")
        print('----------')


background, interest, and talent.docx
Vocabulary Richness Rank: 1
Coherence Rank: 1
Grammar Rank: 1
Readability Rank: 1
----------
change the world.docx
Vocabulary Richness Rank: 6
Coherence Rank: 6
Grammar Rank: 6
Readability Rank: 6
----------
child of immigrant parents.docx
Vocabulary Richness Rank: 8
Coherence Rank: 8
Grammar Rank: 8
Readability Rank: 8
----------
never give up.docx
Vocabulary Richness Rank: 9
Coherence Rank: 9
Grammar Rank: 9
Readability Rank: 9
----------
open topic.docx
Vocabulary Richness Rank: 9
Coherence Rank: 9
Grammar Rank: 9
Readability Rank: 9
----------
personal hardship.docx
Vocabulary Richness Rank: 9
Coherence Rank: 9
Grammar Rank: 9
Readability Rank: 9
----------
preserving your culture.docx
Vocabulary Richness Rank: 10
Coherence Rank: 10
Grammar Rank: 10
Readability Rank: 10
----------
something had a big impact on your life.docx
Vocabulary Richness Rank: 10
Coherence Rank: 10
Grammar Rank: 10
Readability Rank: 10
----------
study abroad.docx
Vocabu

In [None]:

#  What's the take away here???

#  Scoring Metrics: The current scoring metrics may not fully capture the nuances of writing quality. While the 
#  implemented metrics like coherence, vocabulary richness, grammar, and readability are important, they might not 
#  cover all aspects that contribute to the overall quality of an essay. Consider additional or alternative metrics 
#  that better reflect your perception of writing quality.

#  Weighting of Features: The importance of each feature in determining the overall quality may vary. Currently, the 
#  code treats all features equally in determining ranks. However, certain features may be more critical indicators of 
#  quality than others. Consider assigning different weights to features based on their importance in your assessment.

#  Perhaps the most important...
#  Subjectivity of Evaluation: Writing quality is inherently subjective and can vary based on individual preferences, 
#  perspectives, and standards. Your perception of an essay's quality may differ from the algorithm's ranking, especially 
#  if certain aspects are prioritized differently.
        

In [None]:

# source of data for essays
# https://www.collegeessayguy.com/blog/scholarship-essay-examples


In [None]:

# I created a 'fake' essay, which basically just had the text that you see below.

# ...
# Worst one.
# Nothing, nothing, nothing.
# umb, dumber, and dumbest!
