In [2]:
# imports
# type: ignore
import numpy as np 
import pandas as pd
import string
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

import time

from textstat import flesch_kincaid_grade, flesch_reading_ease

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\24746\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\24746\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\24746\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
start = time.time()

train_data = pd.read_csv('data/train.csv', encoding='Windows-1252')
test_data = pd.read_csv('data/test.csv', encoding='Windows-1252')

In [4]:
# set columns' name
train_data.columns = ['sentiment','id','date','query','user_id','text']
test_data.columns = ['sentiment','id','date','query','user_id','text']

# train_data
# test_data

In [5]:
# sentiment
train_data = train_data[train_data.sentiment.isin([0, 1, 2, 3, 4])]

train_data.drop(['id','date','query','user_id'],axis=1,inplace=True)

In [6]:
# Count the number of documents
num_documents1 = len(train_data)
print(f"Count the number of documents: {num_documents1}")

# Average tokens per document
train_data['token_count'] = train_data['text'].apply(lambda x: len(word_tokenize(x)))
average_tokens1 = train_data['token_count'].mean()
print(f"Average tokens per document: {average_tokens1}")

# Total vocabulary size
vocabulary = set(word for text in train_data['text'] for word in word_tokenize(text))
vocabulary_size1 = len(vocabulary)
print(f"Total vocabulary size: {vocabulary_size1}")

Count the number of documents: 1599999
Average tokens per document: 16.410284631427896
Total vocabulary size: 874870


In [7]:
def clean_text(text):

    text = text.lower()
    
    # Remove emoticons and emojis
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    text = re.sub(r'[^a-z\s]', '', text)
    
    # Tokenize and remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join(word for word in word_tokenize(text) if word not in stop_words)
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    text = ' '.join(lemmatizer.lemmatize(word) for word in text.split())

    # Stemming
    stemmer = PorterStemmer()
    text = ' '.join(stemmer.stem(word) for word in text.split())
    
    return text

train_data['cleaned_text'] = train_data['text'].apply(clean_text)

In [8]:
train_data

Unnamed: 0,sentiment,text,token_count,cleaned_text
0,0,is upset that he can't update his Facebook by ...,25,upset cant updat facebook text might cri resul...
1,0,@Kenichan I dived many times for the ball. Man...,21,kenichan dive mani time ball manag save rest g...
2,0,my whole body feels itchy and like its on fire,10,whole bodi feel itchi like fire
3,0,"@nationwideclass no, it's not behaving at all....",30,nationwideclass behav im mad cant see
4,0,@Kwesidei not the whole crew,6,kwesidei whole crew
...,...,...,...,...
1599994,4,Just woke up. Having no school is the best fee...,12,woke school best feel ever
1599995,4,TheWDB.com - Very cool to hear old Walt interv...,15,thewdbcom cool hear old walt interview httpbli...
1599996,4,Are you ready for your MoJo Makeover? Ask me f...,12,readi mojo makeov ask detail
1599997,4,Happy 38th Birthday to my boo of alll time!!! ...,15,happi th birthday boo alll time tupac amaru sh...


In [9]:
# Count the number of documents
num_documents2 = len(train_data)
print(f"Count the number of documents: {num_documents1} -> {num_documents2}")

# Average tokens per document
train_data['token_count'] = train_data['cleaned_text'].apply(lambda x: len(word_tokenize(x)))
average_tokens2 = train_data['token_count'].mean()
print(f"Average tokens per document: {average_tokens1} -> {average_tokens2}")

# Total vocabulary size
vocabulary = set(word for text in train_data['cleaned_text'] for word in word_tokenize(text))
vocabulary_size2 = len(vocabulary)
print(f"Total vocabulary size: {vocabulary_size1} -> {vocabulary_size2}")

# Number of stop words removed
stop_words = set(stopwords.words('english'))
original_stopword_count = train_data['text'].apply(lambda x: sum(1 for word in word_tokenize(x) if word.lower() in stop_words)).sum()
cleaned_stopword_count = train_data['cleaned_text'].apply(lambda x: sum(1 for word in word_tokenize(x) if word.lower() in stop_words)).sum()
stopwords_removed = original_stopword_count - cleaned_stopword_count
print(f"Number of stop words removed: {stopwords_removed}")


# Number of special characters removed
def count_special_characters(text):
    return len(re.findall(r'[^\x00-\x7F]', text)) + len(re.findall(r'[^a-z\s]', text))
original_special_char_count = train_data['text'].apply(count_special_characters).sum()
cleaned_special_char_count = train_data['cleaned_text'].apply(count_special_characters).sum()
special_chars_removed = original_special_char_count - cleaned_special_char_count
print(f"Number of special characters removed: {special_chars_removed}")

# Count empty documents after cleaning
empty_documents_after_cleaning = train_data['cleaned_text'].apply(lambda x: len(x.strip()) == 0).sum()
print(f"Number of empty documents after cleaning: {empty_documents_after_cleaning}")

end = time.time()
runtime = end - start
print(f"Total runtime: {runtime} seconds")

Count the number of documents: 1599999 -> 1599999
Average tokens per document: 16.410284631427896 -> 7.715041696901061
Total vocabulary size: 874870 -> 726859
Number of stop words removed: 8737568
Number of special characters removed: 12682030
Number of empty documents after cleaning: 391
Total runtime: 1505.4705233573914 seconds


In [10]:
# Readability Scores
def readability_scores(text_column):
    fk_grades = text_column.apply(lambda x: flesch_kincaid_grade(x) if len(x.strip()) > 0 else None)
    fk_reading_ease = text_column.apply(lambda x: flesch_reading_ease(x) if len(x.strip()) > 0 else None)
    avg_fk_grade = fk_grades.mean()
    avg_fk_reading_ease = fk_reading_ease.mean()
    return avg_fk_grade, avg_fk_reading_ease

# Lexical Diversity
def lexical_diversity(text_column):
    total_words = text_column.apply(lambda x: len(word_tokenize(x))).sum()
    unique_words = len(set(word for text in text_column for word in word_tokenize(text)))
    return unique_words / total_words if total_words > 0 else 0

# Evaluate cleaned text
avg_fk_grade, avg_fk_reading_ease = readability_scores(train_data['cleaned_text'])
lexical_div = lexical_diversity(train_data['cleaned_text'])

print(f"Average Flesch-Kincaid Grade Level: {avg_fk_grade}")
print(f"Average Flesch Reading Ease Score: {avg_fk_reading_ease}")
print(f"Lexical Diversity (unique/total words): {lexical_div}")


Average Flesch-Kincaid Grade Level: 4.526849640661964
Average Flesch Reading Ease Score: 76.32139612330023
Lexical Diversity (unique/total words): 0.05888330572626071


<h1 style="color:red;">Results</h1>
<img src="data/image.png">
<img src="data/image2.png">