In [7]:
import pandas as pd
import nltk
import enchant
import language_tool_python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import stanfordnlp
from nltk.tag import StanfordNERTagger
from nltk.tokenize import sent_tokenize, word_tokenize

# Location of the reviews dataset (CSV), relative to the working directory
file_path = 'Reviews.csv'

# Read the whole CSV into a pandas DataFrame
df = pd.read_csv(file_path)

# Show the leading five rows as a quick sanity check of the load
print(df.head(n=5))

   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   
3                     3                       3      2  1307923200   
4                     0                       0      5  1350777600   

                 Summary                                               Text  
0  Good Quality Dog Food  I have bought several of the Vitality canned d...  
1 

In [None]:

# Define the calculation functions for each indicator
def calculate_scores(text):
    """Compute every quality indicator for one piece of text.

    Args:
        text: The text to score. A value that is not a string (for example
            the float NaN that pandas produces for an empty CSV cell) is
            scored as the empty string.

    Returns:
        dict: Indicator label mapped to its score, one entry per indicator.
    """
    # len() and word_tokenize() raise TypeError on a float NaN, so missing
    # cells are normalised to an empty string before any scoring happens.
    if not isinstance(text, str):
        text = ""

    scores = {}

    # Overall LI (Tika): Length of the text in characters
    scores['Overall LI (Tika)'] = len(text)

    # POS (CRF): Number of tagged tokens divided by the number of tokens
    # NOTE(review): pos_tag yields one (token, tag) pair per token, so this
    # ratio is 1.0 for any non-empty text - confirm the intended metric.
    tokens = word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens) if tokens else []
    # Text without tokens would otherwise divide by zero.
    scores['POS (CRF)'] = len(pos_tags) / len(tokens) if tokens else 0.0

    # Abbreviations (1): Percentage of abbreviations
    scores['Abbreviations (1)'] = calculate_percentage_of_abbreviations(text)

    # Spelling (2): Number of spelling mistakes
    scores['Spelling (2)'] = calculate_spelling_mistakes(text)

    # Lexical Diversity (3): Lexical diversity score
    scores['Lexical Diversity (3)'] = calculate_lexical_diversity(text)

    # Uppercased (4): Percentage of uppercased words
    scores['Uppercased (4)'] = calculate_percentage_of_uppercased(text)

    # Ungrammatical (5): Percentage of ungrammatical sentences
    scores['Ungrammatical (5)'] = calculate_percentage_of_ungrammatical_sentences(text)

    # Avg. Sentence Length (6): Average sentence length
    scores['Avg. Sentence Length (6)'] = calculate_average_sentence_length(text)

    # Fit of training data (7): Measure of text similarity with training data
    scores['Fit of training data (7)'] = calculate_fit_of_training_data(text)

    # Confidence (8): Confidence score of text analysis modules
    scores['Confidence (8)'] = calculate_confidence(text)

    # Unknown words (9): Percentage of unknown words
    scores['Unknown words (9)'] = calculate_percentage_of_unknown_words(text)

    return scores

def calculate_percentage_of_abbreviations(text):
    """Return the percentage of tokens tagged ORGANIZATION by Stanford NER.

    The ORGANIZATION tag is what this function counts as an abbreviation.

    Args:
        text: The text to analyse.

    Returns:
        float: Share of tagged tokens labelled ORGANIZATION, in percent;
        0.0 when the text yields no tokens.
    """
    tokens = word_tokenize(text)
    if not tokens:
        # Nothing to tag; also avoids dividing by zero below.
        return 0.0

    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')  # need to download and load the model
    tagged_words = st.tag(tokens)
    if not tagged_words:
        return 0.0

    abbreviation_count = sum(1 for _, tag in tagged_words if tag == 'ORGANIZATION')
    return abbreviation_count / len(tagged_words) * 100

def calculate_spelling_mistakes(text):
    """Count the words in *text* that the en_US dictionary rejects.

    Words are split on whitespace and stripped of leading and trailing
    punctuation before the lookup, so "good," or "(nice)" are checked as
    "good" and "nice". Tokens made only of punctuation are skipped.

    Args:
        text: The text to check.

    Returns:
        int: Number of words the dictionary does not recognise.
    """
    import string  # local import: only this function needs it

    dictionary = enchant.Dict("en_US")  # English dictionary
    stripped_words = (raw.strip(string.punctuation) for raw in text.split())
    # Empty strings are filtered out because stripping a punctuation-only
    # token leaves nothing to look up.
    return sum(1 for word in stripped_words if word and not dictionary.check(word))

def calculate_lexical_diversity(text):
    """Return the type-token ratio of *text*: unique words / total words.

    Words are matched case-insensitively as runs of word characters, with
    an optional inner apostrophe part so that "don't" stays one word.

    Args:
        text: The text to analyse.

    Returns:
        float: A value in [0, 1]; 0.0 when the text contains no words.
    """
    import re  # local import: only this function needs it

    # Iterating the raw string would count characters, not words.
    words = re.findall(r"\w+(?:'\w+)*", text.lower())
    if not words:
        return 0.0
    return len(set(words)) / len(words)

def calculate_percentage_of_uppercased(text):
    """Return the percentage of tokens for which ``str.isupper()`` is true.

    Args:
        text: The text to analyse.

    Returns:
        float: Share of upper-case tokens among all tokens, in percent;
        0.0 when the text yields no tokens.
    """
    tokens = word_tokenize(text)
    if not tokens:
        # Avoid dividing by zero for empty or whitespace-only text.
        return 0.0

    uppercased_count = sum(1 for word in tokens if word.isupper())
    return uppercased_count / len(tokens) * 100

# Shared LanguageTool instance; created lazily by _get_grammar_tool().
_GRAMMAR_TOOL = None


def _get_grammar_tool():
    """Return the shared en-US LanguageTool instance, creating it on first use."""
    global _GRAMMAR_TOOL
    if _GRAMMAR_TOOL is None:
        _GRAMMAR_TOOL = language_tool_python.LanguageTool('en-US')
    return _GRAMMAR_TOOL


def calculate_percentage_of_ungrammatical_sentences(text):
    """Return the percentage of sentences LanguageTool reports a match for.

    Sentences come from ``sent_tokenize``. A sentence counts as
    ungrammatical when ``tool.check`` returns at least one match for it.

    Args:
        text: The text to analyse.

    Returns:
        float: Share of flagged sentences, in percent; 0.0 when the text
        contains no sentences.
    """
    sentences = sent_tokenize(text)
    if not sentences:
        # Avoid dividing by zero for empty text.
        return 0.0

    # One tool is reused across calls instead of building a new one per text.
    tool = _get_grammar_tool()
    ungrammatical_sentence_count = sum(1 for sentence in sentences if tool.check(sentence))
    return ungrammatical_sentence_count / len(sentences) * 100

def calculate_average_sentence_length(text):
    """Return the mean number of tokens per sentence in *text*.

    Args:
        text: The text to analyse.

    Returns:
        float: Total token count divided by the sentence count; 0.0 when
        the text contains no sentences.
    """
    sentences = nltk.sent_tokenize(text)
    if not sentences:
        # Avoid dividing by zero for empty text.
        return 0.0

    total_words = sum(len(nltk.word_tokenize(sentence)) for sentence in sentences)
    return total_words / len(sentences)

def calculate_fit_of_training_data(text):
    """Score how closely *text* matches the built-in reference sentences.

    A TF-IDF vectorizer is fitted on the reference sentences, *text* is
    projected into the same space, and the highest cosine similarity
    against any reference sentence is returned.

    Args:
        text: The text to compare with the reference sentences.

    Returns:
        The maximum cosine similarity between *text* and the references.
    """
    reference_corpus = [
        "This is the default training data.",
        "You can add more sentences to improve the training.",
        "The fit of training data measures the similarity between text and training data.",
    ]

    tfidf = TfidfVectorizer()
    reference_matrix = tfidf.fit_transform(reference_corpus)

    # The vocabulary comes from the reference sentences only.
    candidate_row = tfidf.transform([text])

    return cosine_similarity(candidate_row, reference_matrix).max()

# Shared StanfordNLP pipeline; created lazily by _get_pos_pipeline().
_POS_PIPELINE = None


def _get_pos_pipeline():
    """Return the shared English POS pipeline, loading it on first use."""
    global _POS_PIPELINE
    if _POS_PIPELINE is None:
        # The POS processor works on tokenizer output, so 'tokenize' must be
        # part of the pipeline as well.
        _POS_PIPELINE = stanfordnlp.Pipeline(processors='tokenize,pos', lang='en')
    return _POS_PIPELINE


def calculate_confidence(text):
    """Return the mean per-word POS confidence for *text*.

    Each word contributes its ``upos_prob`` attribute when it has one and
    1.0 otherwise.

    NOTE(review): if the tagger's word objects never expose ``upos_prob``,
    every non-empty text scores exactly 1.0 - confirm against the library.

    Args:
        text: The text to analyse.

    Returns:
        float: Average confidence over all words; 0.0 when the text is
        blank or produces no words.
    """
    if not text.strip():
        # Nothing to tag; skip loading or running the pipeline.
        return 0.0

    # The pipeline is reused across calls instead of being reloaded per text.
    doc = _get_pos_pipeline()(text)

    confidence_scores = [
        getattr(word, 'upos_prob', 1.0)
        for sent in doc.sentences
        for word in sent.words
    ]
    if not confidence_scores:
        # Avoid dividing by zero when no words were produced.
        return 0.0

    return sum(confidence_scores) / len(confidence_scores)

# Lower-cased NLTK word list; built lazily by _get_known_words().
_KNOWN_WORDS = None


def _get_known_words():
    """Return the lower-cased NLTK word list as a set, building it on first use."""
    global _KNOWN_WORDS
    if _KNOWN_WORDS is None:
        # Same downloads as before, but performed once instead of per call.
        nltk.download('averaged_perceptron_tagger')
        nltk.download('words')
        # Tokens are lower-cased before the lookup, so the vocabulary is
        # lower-cased too; otherwise capitalised corpus entries never match.
        _KNOWN_WORDS = frozenset(word.lower() for word in nltk.corpus.words.words())
    return _KNOWN_WORDS


def calculate_percentage_of_unknown_words(text):
    """Return the percentage of tokens missing from the NLTK word list.

    Tokens are compared case-insensitively.

    NOTE(review): every token from ``word_tokenize`` is counted, including
    punctuation tokens - confirm whether those should be excluded.

    Args:
        text: The text to analyse.

    Returns:
        float: Share of tokens not found in the word list, in percent;
        0.0 when the text yields no tokens.
    """
    # Tokenize the text into individual words
    words = nltk.word_tokenize(text)
    if not words:
        # Avoid dividing by zero for empty text.
        return 0.0

    known_words = _get_known_words()
    unknown_count = sum(1 for word in words if word.lower() not in known_words)

    return (unknown_count / len(words)) * 100

# Score the review body and its summary for every row, pairing the two columns
for review_text, review_summary in zip(df['Text'], df['Summary']):
    text_scores = calculate_scores(review_text)
    summary_scores = calculate_scores(review_summary)

    # Report both score dictionaries, then a separator line between reviews
    print(f"Text Scores: {text_scores}")
    print(f"Summary Scores: {summary_scores}")
    print("---")