In [1]:
import nltk
import pandas as pd

In [2]:
# Start by inspecting the training data: each row pairs a text excerpt with
# the label of the author who wrote it.
texts_with_author = pd.read_csv("data/train.csv")
texts_with_author.head(5)

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [3]:
# The row ids carry no signal, so collapse the data to one row per author by
# joining all of that author's texts with single spaces.
# Stop words are kept on purpose: we are not trying to understand the author's
# intention, and each writer uses these words with a different frequency.
texts_groupby_author = (
    texts_with_author
    .groupby("author")["text"]
    .apply(" ".join)
    .reset_index()
)

texts_groupby_author

Unnamed: 0,author,text
0,EAP,"This process, however, afforded me no means of..."
1,HPL,It never once occurred to me that the fumbling...
2,MWS,How lovely is spring As we looked from Windsor...


In [4]:
# Lowercase each author's text so that capitalised and non-capitalised forms
# of the same word end up as a single token.
lowercased_text = texts_groupby_author["text"].str.lower()
texts_groupby_author["text"] = lowercased_text

texts_groupby_author

Unnamed: 0,author,text
0,EAP,"this process, however, afforded me no means of..."
1,HPL,it never once occurred to me that the fumbling...
2,MWS,how lovely is spring as we looked from windsor...


In [5]:
# Map each author label to an nltk.FreqDist built from the tokens of that
# author's combined (lowercased) text.
word_frequencies_by_author = {}

author_text_pairs = zip(texts_groupby_author["author"], texts_groupby_author["text"])
for author, text in author_text_pairs:
    word_frequencies_by_author[author] = nltk.FreqDist(
        nltk.tokenize.word_tokenize(text)
    )

In [6]:
# Quick check on one sentence: for every author, add up the relative
# frequency of each token of the sentence in that author's distribution.
sentence = "Still, as I urged our leaving Ireland with such inquietude and impatience, my father thought it best to yield.".lower()
sentence_tokens = nltk.tokenize.word_tokenize(sentence)

for author, frequency_table in word_frequencies_by_author.items():
    total = sum(frequency_table.freq(word) for word in sentence_tokens)
    print(total, author)

0.2824342312149186 EAP
0.23157307827166596 HPL
0.27968435547081877 MWS


In [7]:
# Empty result table: one "id" column followed by one score column per author.
result_columns = ["id", "EAP", "HPL", "MWS"]
dataframe_with_frequencies = pd.DataFrame(columns=result_columns)
dataframe_with_frequencies.head()

Unnamed: 0,id,EAP,HPL,MWS


In [8]:

# Score every sentence of the test file against each author's word-frequency
# table and collect the results into `dataframe_with_frequencies`.
test = pd.read_csv("data/test.csv")

# Take the author order from the result frame's own columns (everything after
# "id") so each score lands under the matching header instead of relying on
# the iteration order of the frequency dict.
author_columns = list(dataframe_with_frequencies.columns[1:])

# FreqDist.freq(word) is count / total tokens and recomputes the total on every
# call; look the totals up once per author rather than once per word.
token_totals = {
    author: word_frequencies_by_author[author].N() for author in author_columns
}

# Accumulate plain lists and build the frame once at the end; assigning to
# .loc row by row re-allocates the frame on every insertion.
score_rows = []
for _, row in test.iterrows():
    # Same preprocessing as the training texts: lowercase, then tokenize.
    sentence_tokens = nltk.tokenize.word_tokenize(row["text"].lower())

    # Per-author score = sum of the relative frequencies of the sentence's
    # tokens. This is an unnormalised score, not a probability.
    row_results = [row["id"]]
    for author in author_columns:
        counts = word_frequencies_by_author[author]
        n_tokens = token_totals[author]
        total = 0
        if n_tokens:  # an empty distribution scores 0, as FreqDist.freq does
            for word in sentence_tokens:
                total += counts[word] / n_tokens
        row_results.append(total)
    score_rows.append(row_results)

# Keep the original 1-based row labels (test row position + 1).
dataframe_with_frequencies = pd.DataFrame(
    score_rows,
    columns=dataframe_with_frequencies.columns,
    index=test.index + 1,
)

NameError: name 'best_frequency_author' is not defined

In [None]:
# Write the per-author scores to disk; the row index is left out of the file.
dataframe_with_frequencies.to_csv(path_or_buf="word_frequency.csv", index=False)