In [None]:
import pandas
import random
from sklearn.model_selection import train_test_split

# Splitting the Data into Train and Test Sets

In [None]:
# Fixed seed so the train/test split -- and every metric computed from it --
# is the same on each Restart & Run All instead of changing per run.
RANDOM_SEED = 42

df = pandas.read_csv("data/train_test.csv")

# Hold out 20% of the rows for testing; the remaining 80% is used for training.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)

# Plain Python lists of the "label" and "text" columns, aligned by position.
train_data_labels, train_data_texts = list(train_data["label"]), list(train_data["text"])
test_data_labels, test_data_texts = list(test_data["label"]), list(test_data["text"])

# Gathering Information About Poets

In [None]:
# Number of training texts per poet, and the resulting class priors.
poets_statistics = {"hafez": 0, "saadi": 0}
for label in train_data_labels:
    poets_statistics[label] += 1

total_texts = len(train_data_labels)
poets_probability = {
    poet: count / total_texts for poet, count in poets_statistics.items()
}

# words[token][poet] -> how often `token` occurs in that poet's training texts.
# num_of_words[poet] -> total number of tokens seen in that poet's training texts.
words = {}
num_of_words = {"hafez": 0, "saadi": 0}

for label, text in zip(train_data_labels, train_data_texts):
    # Tokens are the whitespace-separated pieces of each text.
    for token in text.split():
        num_of_words[label] += 1
        words.setdefault(token, {"hafez": 0, "saadi": 0})[label] += 1

# Functions for Calculating Probability, Accuracy, Precision, and Recall

In [None]:
def calcProbability(poet, sentence):
    """Score `sentence` for `poet` using raw (unsmoothed) word frequencies.

    For every whitespace-separated token of `sentence` that is a key of the
    module-level `words` table, the score is multiplied by that token's count
    for `poet` divided by `num_of_words[poet]`. The product is finally scaled
    by `poets_probability[poet]`.

    Tokens that are not in `words` are skipped. A token that is in `words`
    but has a zero count for `poet` makes the whole score 0.
    """
    score = 1
    for token in sentence.split():
        counts = words.get(token)
        if counts is None:
            # Token never seen during training: it does not affect the score.
            continue
        score *= counts[poet] / num_of_words[poet]
    return score * poets_probability[poet]

def calcAccuracy(predictions, test_data_labels):
    """Return the percentage of predictions that equal the true label.

    Args:
        predictions: sequence of predicted labels.
        test_data_labels: sequence of true labels, aligned with `predictions`
            by position.

    Returns:
        Matching positions divided by ``len(test_data_labels)``, times 100.
        Returns 0.0 when `test_data_labels` is empty instead of raising
        ZeroDivisionError.
    """
    total = len(test_data_labels)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    correct_detected = sum(
        1
        for predicted, actual in zip(predictions, test_data_labels)
        if predicted == actual
    )
    return correct_detected / total * 100

def calcPrecision(predictions, test_data_labels, positive_label="hafez"):
    """Return the precision for `positive_label` as a percentage.

    Precision is the share of items predicted as `positive_label` whose true
    label is also `positive_label`.

    Args:
        predictions: sequence of predicted labels.
        test_data_labels: sequence of true labels, aligned with `predictions`
            by position.
        positive_label: the label treated as the positive class
            (defaults to "hafez").

    Returns:
        true positives / predicted positives * 100. Returns 0.0 when nothing
        was predicted as `positive_label`, instead of raising
        ZeroDivisionError.
    """
    predicted_positive = 0
    true_positive = 0
    for predicted, actual in zip(predictions, test_data_labels):
        if predicted == positive_label:
            predicted_positive += 1
            if actual == positive_label:
                true_positive += 1
    if predicted_positive == 0:
        # The classifier never chose the positive class: precision is
        # undefined, so report 0.0 rather than crashing.
        return 0.0
    return true_positive / predicted_positive * 100

def calcRecall(predictions, test_data_labels, positive_label="hafez"):
    """Return the recall for `positive_label` as a percentage.

    Recall is the share of items whose true label is `positive_label` that
    were also predicted as `positive_label`.

    Args:
        predictions: sequence of predicted labels.
        test_data_labels: sequence of true labels, aligned with `predictions`
            by position.
        positive_label: the label treated as the positive class
            (defaults to "hafez").

    Returns:
        true positives / actual positives * 100. Returns 0.0 when no true
        label equals `positive_label`, instead of raising ZeroDivisionError.
    """
    actual_positive = 0
    true_positive = 0
    for predicted, actual in zip(predictions, test_data_labels):
        if actual == positive_label:
            actual_positive += 1
            if predicted == positive_label:
                true_positive += 1
    if actual_positive == 0:
        # No positive examples to recover: recall is undefined, so report
        # 0.0 rather than crashing.
        return 0.0
    return true_positive / actual_positive * 100


# Predicting and Calculating Accuracy, Precision, and Recall



In [None]:
# Label each test text with the poet whose unsmoothed score is higher;
# a tie (including both scores being 0) is resolved in favour of "hafez".
predictions = [
    "hafez"
    if calcProbability("hafez", text) >= calcProbability("saadi", text)
    else "saadi"
    for text in test_data_texts
]

# Report the three metrics (as percentages) against the true test labels.
print("Accuracy: ", calcAccuracy(predictions, test_data_labels))
print("Precision: ", calcPrecision(predictions, test_data_labels))
print("Recall: ", calcRecall(predictions, test_data_labels))

Accuracy:  77.38152225945429
Precision:  71.95985832349469
Recall:  72.17288336293664


# Function for Calculating Probability with Laplace Smoothing

In [None]:
def calcLaplaceProbability(poet, sentence):
    """Score `sentence` for `poet` using add-one (Laplace) smoothed frequencies.

    For every whitespace-separated token of `sentence` that is a key of the
    module-level `words` table, the score is multiplied by
    ``(count of token for poet + 1) / (num_of_words[poet] + len(words))``.
    The product is finally scaled by `poets_probability[poet]`.

    Tokens that are not in `words` are skipped.
    """
    # Same for every token: the poet's total token count plus the number of
    # distinct tokens in the table.
    denominator = num_of_words[poet] + len(words)

    likelihood = 1
    for token in sentence.split():
        if token not in words:
            continue
        likelihood *= (words[token][poet] + 1) / denominator
    return likelihood * poets_probability[poet]

# Applying Laplace Smoothing

In [None]:
# Label each test text with the poet whose Laplace-smoothed score is higher;
# ties are resolved in favour of "hafez".
laplace_predictions = [
    "hafez"
    if calcLaplaceProbability("hafez", text) >= calcLaplaceProbability("saadi", text)
    else "saadi"
    for text in test_data_texts
]

# Report the three metrics (as percentages) against the true test labels.
print("Laplace Accuracy: ", calcAccuracy(laplace_predictions, test_data_labels))
print("Laplace Precision: ", calcPrecision(laplace_predictions, test_data_labels))
print("Laplace Recall: ", calcRecall(laplace_predictions, test_data_labels))

Laplace Accuracy:  80.18190521780755
Laplace Precision:  79.58762886597938
Laplace Recall:  68.56127886323267


# Predicting on the Second Dataset (the Evaluate Dataset in the data Directory)

In [None]:
# Read the evaluation set and pull out the "id" and "text" columns as lists.
evaluate = pandas.read_csv("data/evaluate.csv")
evaluate_id_list = list(evaluate["id"])
evaluate_text_list = list(evaluate["text"])

# Label each text with the poet whose Laplace-smoothed score is higher;
# ties are resolved in favour of "hafez".
evaluate_predictions = [
    "hafez"
    if calcLaplaceProbability("hafez", text) >= calcLaplaceProbability("saadi", text)
    else "saadi"
    for text in evaluate_text_list
]

# Write one (id, label) row per evaluated text, without the DataFrame index.
output_frame = pandas.DataFrame({"id": evaluate_id_list, "label": evaluate_predictions})
output_frame.to_csv("data/output.csv", index=False)