### Loading DataSets

In [1]:
import time

# Start timing the whole notebook
notebook_start_time = time.time()


In [2]:

import pandas as pd 

# Uploading the training dataset
df_train = pd.read_csv('trainingandtestdata/train.csv', encoding='latin-1', header=None)
df_train.columns = ['Label', 'Text1', 'TimeStamp', 'Meta1', 'Meta2', 'MainText']

# Uploading the testing dataset
df_test = pd.read_csv('trainingandtestdata/test.csv', encoding='latin-1', header=None)
df_test.columns = ['Label', 'Text1', 'TimeStamp', 'Meta1', 'Meta2', 'MainText']

print("Datasets loaded successfully")

#Merging the two datasets

df = pd.concat([df_train, df_test], ignore_index=True)

print("Datasets merged successfully")


Datasets loaded successfully
Datasets merged successfully


### Number of Documents for the dataset

In [3]:
#output the merged dataset
number_of_documents= len(df)
print("Number of documents in the dataset: ", number_of_documents)
df.head()

Number of documents in the dataset:  1600498


Unnamed: 0,Label,Text1,TimeStamp,Meta1,Meta2,MainText
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
import nltk

# Set a specific path for nltk data
nltk.data.path = ['/usr/local/share/nltk_data']

# Re-download the 'punkt' resource
nltk.download('punkt_tab', download_dir='/usr/local/share/nltk_data')


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /usr/local/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

### Before Cleaning 

In [5]:
from nltk.tokenize import sent_tokenize, word_tokenize

print("Before clean:")
# Tokenize the text into sentences
df['before_sent_tokenized_text'] = df['MainText'].apply(lambda x: sent_tokenize(x))

# Tokenize the text into words
df['before_word_tokenized_text'] = df['before_sent_tokenized_text'].apply(
    lambda sentences: [word_tokenize(sentence) for sentence in sentences]
)

# Calculate the number of sentences
before_number_of_sentences = df['before_sent_tokenized_text'].apply(len).sum()
print("Number of sentences in the dataset:", before_number_of_sentences)

# Calculate the total word count
before_number_of_words = df['before_word_tokenized_text'].apply(
    lambda sentences: sum(len(sentence) for sentence in sentences)
).sum()
print("Number of words in the dataset:", before_number_of_words)

# Calculate the average number of sentences per document

before_avg_sentences_per_doc = before_number_of_sentences / number_of_documents
print("Average number of sentences per document:", before_avg_sentences_per_doc)

# Word count for the entire dataset
print("Word count (total):", before_number_of_words)

# Unique words for the entire dataset
before_unique_words = len(set(
    word for word_list in df['before_word_tokenized_text'].explode() for word in word_list
))
print("Unique words in the dataset:", before_unique_words)

# Max word length
before_max_word_length = max(
    len(word) for word_list in df['before_word_tokenized_text'].explode() for word in word_list
)
print("Max word length:", before_max_word_length)

# Min sentence length (in words)
before_min_sentence_length = min(
    len(sentence) for sentence_list in df['before_word_tokenized_text'] for sentence in sentence_list
)
print("Min sentence length:", before_min_sentence_length)

# Max sentence length (in words)
before_max_sentence_length = max(
    len(sentence) for sentence_list in df['before_word_tokenized_text'] for sentence in sentence_list
)
print("Max sentence length:", before_max_sentence_length)


Before clean:
Number of sentences in the dataset: 2748402
Number of words in the dataset: 26261279
Average number of sentences per document: 1.7172167662814948
Word count (total): 26261279
Unique words in the dataset: 874726
Max word length: 204
Min sentence length: 1
Max sentence length: 229


### Preprocessing Data

In [6]:
#checking df for missing values
df.isnull().sum()
df.isna().sum()

Label                         0
Text1                         0
TimeStamp                     0
Meta1                         0
Meta2                         0
MainText                      0
before_sent_tokenized_text    0
before_word_tokenized_text    0
dtype: int64

### Transforming into lowercase

In [7]:
#text preprocessing 

#Transforming all the text to lowercase
import re 
df['cleaned_text'] = df['MainText'].apply(lambda x: x.lower())
df['cleaned_text']


0          @switchfoot http://twitpic.com/2y1zl - awww, t...
1          is upset that he can't update his facebook by ...
2          @kenichan i dived many times for the ball. man...
3            my whole body feels itchy and like its on fire 
4          @nationwideclass no, it's not behaving at all....
                                 ...                        
1600493    ask programming: latex or indesign?: submitted...
1600494    on that note, i hate word. i hate pages. i hat...
1600495    ahhh... back in a *real* text editing environm...
1600496    trouble in iran, i see. hmm. iran. iran so far...
1600497    reading the tweets coming out of iran... the w...
Name: cleaned_text, Length: 1600498, dtype: object

### Detecting the web address

In [8]:


# Extract URLs from the text
df['URL_address'] = df['cleaned_text'].apply(lambda x: re.findall(r'https?://\S+', x))

# Count the total number of URLs in the dataset
total_urls = df['URL_address'].apply(len).sum()

print("Total number of URLs in the dataset:", total_urls)
# Display the first few rows of extracted URLs for verification
df[['cleaned_text', 'URL_address']]






Total number of URLs in the dataset: 71722


Unnamed: 0,cleaned_text,URL_address
0,"@switchfoot http://twitpic.com/2y1zl - awww, t...",[http://twitpic.com/2y1zl]
1,is upset that he can't update his facebook by ...,[]
2,@kenichan i dived many times for the ball. man...,[]
3,my whole body feels itchy and like its on fire,[]
4,"@nationwideclass no, it's not behaving at all....",[]
...,...,...
1600493,ask programming: latex or indesign?: submitted...,[http://tinyurl.com/myfmf7]
1600494,"on that note, i hate word. i hate pages. i hat...",[]
1600495,ahhh... back in a *real* text editing environm...,[]
1600496,"trouble in iran, i see. hmm. iran. iran so far...",[]


### Removing URL web address

In [9]:
#Removing URLs from the text
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: re.sub(r'https?://\S+', '', x))
print ("URLs removed successfully")

URLs removed successfully


### Detecting Account 

In [10]:
df['detected_account']= df['cleaned_text'].apply(lambda x: re.findall(r'@(\w+)', x))
total_accounts = df['detected_account'].apply(len).sum()
print("Total number of accounts detected in the dataset:", total_accounts)

# Display the first few rows of detected accounts for verification
df[['cleaned_text', 'detected_account']]

Total number of accounts detected in the dataset: 786705


Unnamed: 0,cleaned_text,detected_account
0,"@switchfoot - awww, that's a bummer. you sho...",[switchfoot]
1,is upset that he can't update his facebook by ...,[]
2,@kenichan i dived many times for the ball. man...,[kenichan]
3,my whole body feels itchy and like its on fire,[]
4,"@nationwideclass no, it's not behaving at all....",[nationwideclass]
...,...,...
1600493,ask programming: latex or indesign?: submitted...,[]
1600494,"on that note, i hate word. i hate pages. i hat...",[]
1600495,ahhh... back in a *real* text editing environm...,[]
1600496,"trouble in iran, i see. hmm. iran. iran so far...",[]


### Detecting PhoneNumber

In [11]:
#phone number detected 
df['detected_phone_number'] = df['cleaned_text'].apply(lambda x: re.findall(r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})', x))
total_phone_numbers = df['detected_phone_number'].apply(len).sum()
print("Total number of phone numbers detected in the dataset:", total_phone_numbers)

# Display the first few rows of detected phone numbers for verification
df[['cleaned_text', 'detected_phone_number']].tail()

Total number of phone numbers detected in the dataset: 1215


Unnamed: 0,cleaned_text,detected_phone_number
1600493,ask programming: latex or indesign?: submitted...,[]
1600494,"on that note, i hate word. i hate pages. i hat...",[]
1600495,ahhh... back in a *real* text editing environm...,[]
1600496,"trouble in iran, i see. hmm. iran. iran so far...",[]
1600497,reading the tweets coming out of iran... the w...,[]


### Removing special characters ,numbers ,spaces, emoticons and hashtags


In [12]:


# Create a new column with special characters removed
df['special_character_removed'] = df['cleaned_text'].apply(lambda x: re.sub(r'[^a-z\s]', '', x))

# Calculate the number of special characters detected (original length - cleaned length)
df['special_characters_detected'] = df['cleaned_text'].apply(len) - df['special_character_removed'].apply(len)

# Get the total number of special characters detected
total_special_characters_detected = df['special_characters_detected'].sum()

print("Total special characters detected:", total_special_characters_detected)



#removing space 
df['special_character_removed'] = df['special_character_removed'].apply(lambda x: re.sub(r'[\s+]', ' ', x))


Total special characters detected: 6876608


In [13]:
df

Unnamed: 0,Label,Text1,TimeStamp,Meta1,Meta2,MainText,before_sent_tokenized_text,before_word_tokenized_text,cleaned_text,URL_address,detected_account,detected_phone_number,special_character_removed,special_characters_detected
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[@switchfoot http://twitpic.com/2y1zl - Awww, ...","[[@, switchfoot, http, :, //twitpic.com/2y1zl,...","@switchfoot - awww, that's a bummer. you sho...",[http://twitpic.com/2y1zl],[switchfoot],[],switchfoot awww thats a bummer you shoulda ...,7
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,[is upset that he can't update his Facebook by...,"[[is, upset, that, he, ca, n't, update, his, F...",is upset that he can't update his facebook by ...,[],[],[],is upset that he cant update his facebook by t...,6
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,"[@Kenichan I dived many times for the ball., M...","[[@, Kenichan, I, dived, many, times, for, the...",@kenichan i dived many times for the ball. man...,[],[kenichan],[],kenichan i dived many times for the ball manag...,5
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,[my whole body feels itchy and like its on fire],"[[my, whole, body, feels, itchy, and, like, it...",my whole body feels itchy and like its on fire,[],[],[],my whole body feels itchy and like its on fire,0
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","[@nationwideclass no, it's not behaving at all...","[[@, nationwideclass, no, ,, it, 's, not, beha...","@nationwideclass no, it's not behaving at all....",[],[nationwideclass],[],nationwideclass no its not behaving at all im ...,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1600493,2,14072,Sun Jun 14 04:31:43 UTC 2009,latex,proggit,Ask Programming: LaTeX or InDesign?: submitted...,"[Ask Programming: LaTeX or InDesign?, : submit...","[[Ask, Programming, :, LaTeX, or, InDesign, ?]...",ask programming: latex or indesign?: submitted...,[http://tinyurl.com/myfmf7],[],[],ask programming latex or indesign submitted by...,9
1600494,0,14073,Sun Jun 14 04:32:17 UTC 2009,latex,sam33r,"On that note, I hate Word. I hate Pages. I hat...","[On that note, I hate Word., I hate Pages., I ...","[[On, that, note, ,, I, hate, Word, .], [I, ha...","on that note, i hate word. i hate pages. i hat...",[],[],[],on that note i hate word i hate pages i hate l...,9
1600495,4,14074,Sun Jun 14 04:36:34 UTC 2009,latex,iamtheonlyjosie,Ahhh... back in a *real* text editing environm...,[Ahhh... back in a *real* text editing environ...,"[[Ahhh, ..., back, in, a, *, real, *, text, ed...",ahhh... back in a *real* text editing environm...,[],[],[],ahhh back in a real text editing environment i...,10
1600496,0,14075,Sun Jun 14 21:36:07 UTC 2009,iran,plutopup7,"Trouble in Iran, I see. Hmm. Iran. Iran so far...","[Trouble in Iran, I see., Hmm., Iran., Iran so...","[[Trouble, in, Iran, ,, I, see, .], [Hmm, .], ...","trouble in iran, i see. hmm. iran. iran so far...",[],[],[],trouble in iran i see hmm iran iran so far awa...,6


### After cleaning 

### Tokenization and Text Statstics

In [14]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Tokenize into sentences
df['sent_tokenized_text'] = df['special_character_removed'].apply(lambda x: sent_tokenize(x))



#Tkenize into words from sentences
df['word_tokenized_text'] = df['sent_tokenized_text'].apply(
    lambda sentences: [word for sentence in sentences for word in word_tokenize(sentence)]
)

#Sentence count 
sentence_count = df['sent_tokenized_text'].apply(len).sum()
print("Sentence count:", sentence_count)

#word count
word_count = df['word_tokenized_text'].apply(lambda x: sum(len(sentence) for sentence in x)).sum()
print("Word count:", word_count)

#Average sentence length
total_sentences = df['sent_tokenized_text'].apply(len).sum()
total_words = df['word_tokenized_text'].apply(lambda x: sum(len(sentence) for sentence in x)).sum()
avg_sentence_length = total_words / total_sentences if total_sentences > 0 else 0
print("Average sentence length:", avg_sentence_length)


#Unique words

vocabulary = set(word for words in df['word_tokenized_text'].explode() for word in words)
vocabulary_size_entire_dataset = len(vocabulary)
print("Vocabulary size:", vocabulary_size_entire_dataset)


# Minimum and maximum word length
min_word_length = df['word_tokenized_text'].apply(lambda x: min(len(word) for word in x)).min()
max_word_length = df['word_tokenized_text'].apply(lambda x: max(len(word) for word in x)).max()

print(f"Minimum word length: {min_word_length}")
print(f"Maximum word length: {max_word_length}")

# Minimum and maximum sentence length (in terms of word count)
min_sentence_length = df['word_tokenized_text'].apply(len).min()
max_sentence_length = df['word_tokenized_text'].apply(len).max()

print(f"Minimum sentence length: {min_sentence_length}")
print(f"Maximum sentence length: {max_sentence_length}")


Sentence count: 1600498
Word count: 88585053
Average sentence length: 55.34843092587432
Vocabulary size: 26
Minimum word length: 1
Maximum word length: 125
Minimum sentence length: 1
Maximum sentence length: 41


### StopWords Removing 

In [15]:
#stopWord removal
nltk.download('stopwords', download_dir='/usr/local/share/nltk_data')
from nltk.corpus import stopwords



[nltk_data] Downloading package stopwords to
[nltk_data]     /usr/local/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
total_words_before = df['word_tokenized_text'].apply(len).sum()

print("Total words before removing stopwords:", total_words_before)
#Removing stopwords
stop_words = set(stopwords.words('english'))
df['stopwords_removed_text'] = df['word_tokenized_text'].apply(
    lambda x: [word for word in x if word not in stop_words]
)
stopwords_removed = total_words_before - df['stopwords_removed_text'].apply(len).sum()
print("Stopwords removed:", stopwords_removed)
after_stopwords = df['stopwords_removed_text'].apply(len).sum()
print("Total words after removing stopwords:", after_stopwords)
df

Total words before removing stopwords: 20648678
Stopwords removed: 8371773
Total words after removing stopwords: 12276905


Unnamed: 0,Label,Text1,TimeStamp,Meta1,Meta2,MainText,before_sent_tokenized_text,before_word_tokenized_text,cleaned_text,URL_address,detected_account,detected_phone_number,special_character_removed,special_characters_detected,sent_tokenized_text,word_tokenized_text,stopwords_removed_text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[@switchfoot http://twitpic.com/2y1zl - Awww, ...","[[@, switchfoot, http, :, //twitpic.com/2y1zl,...","@switchfoot - awww, that's a bummer. you sho...",[http://twitpic.com/2y1zl],[switchfoot],[],switchfoot awww thats a bummer you shoulda ...,7,[switchfoot awww thats a bummer you shoulda...,"[switchfoot, awww, thats, a, bummer, you, shou...","[switchfoot, awww, thats, bummer, shoulda, got..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,[is upset that he can't update his Facebook by...,"[[is, upset, that, he, ca, n't, update, his, F...",is upset that he can't update his facebook by ...,[],[],[],is upset that he cant update his facebook by t...,6,[is upset that he cant update his facebook by ...,"[is, upset, that, he, cant, update, his, faceb...","[upset, cant, update, facebook, texting, might..."
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,"[@Kenichan I dived many times for the ball., M...","[[@, Kenichan, I, dived, many, times, for, the...",@kenichan i dived many times for the ball. man...,[],[kenichan],[],kenichan i dived many times for the ball manag...,5,[kenichan i dived many times for the ball mana...,"[kenichan, i, dived, many, times, for, the, ba...","[kenichan, dived, many, times, ball, managed, ..."
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,[my whole body feels itchy and like its on fire],"[[my, whole, body, feels, itchy, and, like, it...",my whole body feels itchy and like its on fire,[],[],[],my whole body feels itchy and like its on fire,0,[my whole body feels itchy and like its on fire],"[my, whole, body, feels, itchy, and, like, its...","[whole, body, feels, itchy, like, fire]"
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","[@nationwideclass no, it's not behaving at all...","[[@, nationwideclass, no, ,, it, 's, not, beha...","@nationwideclass no, it's not behaving at all....",[],[nationwideclass],[],nationwideclass no its not behaving at all im ...,9,[nationwideclass no its not behaving at all im...,"[nationwideclass, no, its, not, behaving, at, ...","[nationwideclass, behaving, im, mad, cant, see]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1600493,2,14072,Sun Jun 14 04:31:43 UTC 2009,latex,proggit,Ask Programming: LaTeX or InDesign?: submitted...,"[Ask Programming: LaTeX or InDesign?, : submit...","[[Ask, Programming, :, LaTeX, or, InDesign, ?]...",ask programming: latex or indesign?: submitted...,[http://tinyurl.com/myfmf7],[],[],ask programming latex or indesign submitted by...,9,[ask programming latex or indesign submitted b...,"[ask, programming, latex, or, indesign, submit...","[ask, programming, latex, indesign, submitted,..."
1600494,0,14073,Sun Jun 14 04:32:17 UTC 2009,latex,sam33r,"On that note, I hate Word. I hate Pages. I hat...","[On that note, I hate Word., I hate Pages., I ...","[[On, that, note, ,, I, hate, Word, .], [I, ha...","on that note, i hate word. i hate pages. i hat...",[],[],[],on that note i hate word i hate pages i hate l...,9,[on that note i hate word i hate pages i hate ...,"[on, that, note, i, hate, word, i, hate, pages...","[note, hate, word, hate, pages, hate, latex, s..."
1600495,4,14074,Sun Jun 14 04:36:34 UTC 2009,latex,iamtheonlyjosie,Ahhh... back in a *real* text editing environm...,[Ahhh... back in a *real* text editing environ...,"[[Ahhh, ..., back, in, a, *, real, *, text, ed...",ahhh... back in a *real* text editing environm...,[],[],[],ahhh back in a real text editing environment i...,10,[ahhh back in a real text editing environment ...,"[ahhh, back, in, a, real, text, editing, envir...","[ahhh, back, real, text, editing, environment,..."
1600496,0,14075,Sun Jun 14 21:36:07 UTC 2009,iran,plutopup7,"Trouble in Iran, I see. Hmm. Iran. Iran so far...","[Trouble in Iran, I see., Hmm., Iran., Iran so...","[[Trouble, in, Iran, ,, I, see, .], [Hmm, .], ...","trouble in iran, i see. hmm. iran. iran so far...",[],[],[],trouble in iran i see hmm iran iran so far awa...,6,[trouble in iran i see hmm iran iran so far aw...,"[trouble, in, iran, i, see, hmm, iran, iran, s...","[trouble, iran, see, hmm, iran, iran, far, awa..."


### Package for stemming and lemmatization

In [17]:
# stemming and lemmatization
nltk.download('wordnet', download_dir='/usr/local/share/nltk_data')
from nltk.stem import WordNetLemmatizer #For lemmatization
from nltk.stem import PorterStemmer #For Steming 
from nltk.corpus import wordnet #For WordNet


[nltk_data] Downloading package wordnet to
[nltk_data]     /usr/local/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Lemmatization

In [18]:
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Apply lemmatization to each word in the list
df['lemmatized_text'] =df['stopwords_removed_text'].apply(
    lambda wordlist: [lemmatizer.lemmatize(word) for word in wordlist]
)

# Check the result
df['lemmatized_text']

0          [switchfoot, awww, thats, bummer, shoulda, got...
1          [upset, cant, update, facebook, texting, might...
2          [kenichan, dived, many, time, ball, managed, s...
3                     [whole, body, feel, itchy, like, fire]
4            [nationwideclass, behaving, im, mad, cant, see]
                                 ...                        
1600493    [ask, programming, latex, indesign, submitted,...
1600494    [note, hate, word, hate, page, hate, latex, sa...
1600495    [ahhh, back, real, text, editing, environment,...
1600496    [trouble, iran, see, hmm, iran, iran, far, awa...
1600497    [reading, tweet, coming, iran, whole, thing, t...
Name: lemmatized_text, Length: 1600498, dtype: object

### Stemming

In [19]:
#stemming 
#Intialize the stemmer
stemmer = PorterStemmer()

#Apply stemming to each word in the list
df['stemmed_text'] = df['lemmatized_text'].apply(
    lambda wordlist: [stemmer.stem(word) for word in wordlist]
    
)

#check the result 
df['stemmed_text']


0          [switchfoot, awww, that, bummer, shoulda, got,...
1          [upset, cant, updat, facebook, text, might, cr...
2          [kenichan, dive, mani, time, ball, manag, save...
3                     [whole, bodi, feel, itchi, like, fire]
4               [nationwideclass, behav, im, mad, cant, see]
                                 ...                        
1600493    [ask, program, latex, indesign, submit, calcio...
1600494    [note, hate, word, hate, page, hate, latex, sa...
1600495    [ahhh, back, real, text, edit, environ, lt, la...
1600496    [troubl, iran, see, hmm, iran, iran, far, away...
1600497    [read, tweet, come, iran, whole, thing, terrif...
Name: stemmed_text, Length: 1600498, dtype: object

In [20]:
# End timing the whole notebook
notebook_end_time = time.time()

# Calculate the total runtime
notebook_runtime = (notebook_end_time - notebook_start_time) / 60  # Convert seconds to minutes
print(f"Total notebook runtime: {notebook_runtime:.2f} minutes")


Total notebook runtime: 3.14 minutes


## Calculating Statistics
- Number of documents:
- Average sentence length: 
- Word count: 
- Sentence count: 
- Vocabulary size: 
- Max word length:
- Min sentence length: 
- Max sentence length: 
- Special characters removed:
- Stop words removed:
- Addresses detected:
- Phone numbers detected:
- Account numbers detected: 
- Total runtime: 

In [21]:
# Display text cleaning statistics
before_stats = {
    "doc_count": number_of_documents,
    "avg_sentence_length": before_avg_sentences_per_doc,
    "word_count": before_number_of_words,
    "sentence_count": before_number_of_sentences,
    "vocabulary_size": before_unique_words,
    "max_word_length": before_max_word_length,
    "min_sentence_length": before_min_sentence_length,
    "max_sentence_length": before_max_sentence_length,
}

after_stats = {
    "doc_count": len(df),
    "avg_sentence_length": avg_sentence_length,
    "word_count": word_count,
    "sentence_count": sentence_count,
    "vocabulary_size": vocabulary_size_entire_dataset,
    "max_word_length": max_word_length,
    "min_sentence_length": min_sentence_length,
    "max_sentence_length": max_sentence_length,
}

cleaning_metrics = {
    "special_chars_removed": total_special_characters_detected,
    "stop_words_removed": stopwords_removed,
    "addresses_detected": total_urls,
    "phones_detected": total_phone_numbers,
    "accounts_detected": total_accounts,
}



print(f"""
### Text Cleaning Statistics ###

- Number of documents: {before_stats['doc_count']} → {after_stats['doc_count']}
- Average sentence length: {before_stats['avg_sentence_length']:.1f} → {after_stats['avg_sentence_length']:.1f}
- Word count: {before_stats['word_count']} → {after_stats['word_count']}
- Sentence count: {before_stats['sentence_count']} → {after_stats['sentence_count']}
- Vocabulary size: {before_stats['vocabulary_size']} → {after_stats['vocabulary_size']}
- Max word length: {before_stats['max_word_length']} → {after_stats['max_word_length']}
- Min sentence length: {before_stats['min_sentence_length']} → {after_stats['min_sentence_length']}
- Max sentence length: {before_stats['max_sentence_length']} → {after_stats['max_sentence_length']}
- Special characters removed: {cleaning_metrics['special_chars_removed']}
- Stop words removed: {cleaning_metrics['stop_words_removed']}
- Addresses detected: {cleaning_metrics['addresses_detected']}
- Phone numbers detected: {cleaning_metrics['phones_detected']}
- Account numbers detected: {cleaning_metrics['accounts_detected']}
- Total runtime: {notebook_runtime:.2f} minutes
""")



### Text Cleaning Statistics ###

- Number of documents: 1600498 → 1600498
- Average sentence length: 1.7 → 55.3
- Word count: 26261279 → 88585053
- Sentence count: 2748402 → 1600498
- Vocabulary size: 874726 → 26
- Max word length: 204 → 125
- Min sentence length: 1 → 1
- Max sentence length: 229 → 41
- Special characters removed: 6876608
- Stop words removed: 8371773
- Addresses detected: 71722
- Phone numbers detected: 1215
- Account numbers detected: 786705
- Total runtime: 3.14 minutes

