In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd


In [9]:
def detect_essay_subjects(essays, n_topics=5, n_top_words=10, random_state=42):
    """Fit an LDA topic model on ``essays`` and report the topics found.

    The essays are converted to bag-of-words counts (English stop words
    removed), an LDA model is fitted on those counts, and the highest-weighted
    words of every topic are printed.

    Parameters
    ----------
    essays : iterable of str
        Raw essay texts, one document per element.
    n_topics : int, default=5
        Number of LDA topics to fit.
    n_top_words : int, default=10
        Number of highest-weighted words printed for each topic.
    random_state : int, default=42
        Seed passed to ``LatentDirichletAllocation`` so runs are repeatable.

    Returns
    -------
    The result of ``lda.transform`` on the count matrix: the document-topic
    distribution, one row per essay and one column per topic.
    """
    # Bag-of-words counts, ignoring common English stop words
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(essays)

    # Topic model fitted on the count matrix
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=random_state)
    lda.fit(X)

    # Print the top words for each topic
    feature_names = vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(lda.components_):
        # argsort is ascending, so reverse it and keep the leading entries
        top_word_idx = topic.argsort()[::-1][:n_top_words]
        print(f"Topic #{topic_idx}:")
        print(" ".join(feature_names[i] for i in top_word_idx))
        print()

    # Per-essay topic distribution
    return lda.transform(X)





In [12]:
def detect_essay_subjects2(essays, n_topics=5, n_top_words=10, random_state=0):
    """Label every essay with the top words of its most likely LDA topic.

    Parameters
    ----------
    essays : iterable of str
        Raw essay texts, one document per element.
    n_topics : int, default=5
        Number of LDA topics to fit.
    n_top_words : int, default=10
        Number of highest-weighted words kept to describe each topic.
    random_state : int, default=0
        Seed passed to ``LatentDirichletAllocation`` so runs are repeatable.

    Returns
    -------
    list of list of str
        One entry per essay, in input order: the top words of the topic with
        the largest weight for that essay. Essays assigned to the same topic
        share the same list object.
    """
    # Bag-of-words counts, ignoring common English stop words
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(essays)

    # Train an LDA model on the count matrix
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=random_state)
    lda.fit(X)

    # Top words per topic; argsort is ascending, so reverse before truncating
    feature_names = vectorizer.get_feature_names_out()
    topic_words = [
        [feature_names[i] for i in topic.argsort()[::-1][:n_top_words]]
        for topic in lda.components_
    ]

    # X already holds the counts for every essay, so score them all in one
    # batched transform instead of re-vectorizing and transforming one essay
    # at a time. This also avoids iterating `essays` a second time.
    dominant_topics = lda.transform(X).argmax(axis=1)

    return [topic_words[topic_idx] for topic_idx in dominant_topics]

In [3]:
# Location of the ASAP-AES training spreadsheet, relative to the notebook
training_set_path = "./asap-aes/training_set_rel3.xlsx"
df = pd.read_excel(training_set_path)

# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4.0,4.0,,8.0,,,,...,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5.0,4.0,,9.0,,,,...,,,,,,,,,,
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4.0,3.0,,7.0,,,,...,,,,,,,,,,
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5.0,5.0,,10.0,,,,...,,,,,,,,,,
4,5,1,"Dear @LOCATION1, I know having computers has a...",4.0,4.0,,8.0,,,,...,,,,,,,,,,


In [10]:
# Pull the essay text column out of the loaded training frame
essays = df.loc[:, 'essay']

# Fit the topic model: prints the top words per topic and keeps the
# per-essay topic distribution returned by the function
essay_topics = detect_essay_subjects(essays)


Topic #0:
people computers computer time friends caps1 like use just caps2

Topic #1:
building dirigibles state empire mast builders mooring dirigible obstacles faced

Topic #2:
caps1 author mood parents story family home time got like

Topic #3:
cyclist water setting features road affect hot affected story hills

Topic #4:
books people book offensive read library think like things just



In [13]:
# Label each essay with the top words of its most likely LDA topic
essay_topics2 = detect_essay_subjects2(essays)

In [None]:
# essay_topics2 holds one word list per essay, so printing it in full floods
# the notebook output; show only the first few entries as a preview.
essay_topics2[:5]