In [None]:
def count_tf(file_name):
    """Count stemmed-token frequencies in a text file.

    The file is decoded as Latin-1, split into tokens with ``word_tokenize``
    and every token is run through the module-level stemmer ``ps`` before
    being tallied.

    Args:
        file_name: Path of the text file to analyse.

    Returns:
        A list of ``(stem, count)`` pairs ordered from the highest count to
        the lowest.
    """
    with open(file_name, 'r', encoding="latin-1") as handle:
        text = handle.read()

    # Reduce each token to its stem before counting.
    stems = []
    for word in word_tokenize(text):
        stems.append(ps.stem(word))

    ranked = list(FreqDist(stems).items())
    # Stable in-place sort on the count, largest first.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

def print_top_15(token_frequencies, n=15):
    """Print the first ``n`` (token, frequency) pairs as a numbered list.

    Args:
        token_frequencies: Sequence of ``(token, frequency)`` pairs. It is
            printed in the order given; this function does not sort, so pass
            data that is already ranked.
        n: Number of leading entries to print. Defaults to 15, which
            reproduces the original fixed-size listing and header.

    Raises:
        ValueError: If ``n`` is negative (a negative slice bound would
            silently drop entries from the end instead of limiting the list).
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")
    print(f"Top {n} most frequent words:")
    for i, (token, frequency) in enumerate(token_frequencies[:n], start=1):
        print(f"{i}. {token} : {frequency}")

# Input corpus: the same path that the cleaning cell passes to
# clean_and_save_file as its output file.
file_name = "/content/cleaned_text.txt"
# Rank the stemmed tokens by frequency, then print the leading entries.
token_frequencies = count_tf(file_name)
print_top_15(token_frequencies)




Top 15 most frequent words:
1. the : 2166
2. of : 1796
3. and : 1179
4. a : 1054
5. in : 955
6. discrimin : 596
7. to : 512
8. for : 419
9. is : 401
10. by : 390
11. on : 357
12. data : 292
13. as : 231
14. from : 215
15. econom : 213


In [None]:
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer


# Module-level helpers used by count_tf_without_stopwords:
#   ps         - Porter stemmer instance used to stem each kept token
#   stop_words - NLTK's English stop-word list as a set, for `not in` checks
# NOTE(review): stopwords.words() presumably needs the NLTK "stopwords" corpus
# to be downloaded beforehand — confirm a setup cell does this.
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

def count_tf_without_stopwords(file_name):
    """Count stemmed-token frequencies, ignoring stop words.

    The file is decoded as Latin-1 and tokenized with ``word_tokenize``. A
    token is discarded when its lowercase form is in the module-level
    ``stop_words`` set; the remaining tokens are stemmed with ``ps`` and
    tallied.

    Args:
        file_name: Path of the text file to analyse.

    Returns:
        A list of ``(stem, count)`` pairs ordered from the highest count to
        the lowest.
    """
    with open(file_name, 'r', encoding="latin-1") as handle:
        text = handle.read()

    kept_stems = []
    for word in word_tokenize(text):
        # The stop-word test is done on the lowercase form of the raw token.
        if word.lower() in stop_words:
            continue
        kept_stems.append(ps.stem(word))

    ranked = list(FreqDist(kept_stems).items())
    # Stable in-place sort on the count, largest first.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

def print_top_15(token_frequencies, n=15):
    """Print the first ``n`` (token, frequency) pairs left after stop-word removal.

    Args:
        token_frequencies: Sequence of ``(token, frequency)`` pairs. It is
            printed in the order given; this function does not sort, so pass
            data that is already ranked.
        n: Number of leading entries to print. Defaults to 15, which
            reproduces the original fixed-size listing and header.

    Raises:
        ValueError: If ``n`` is negative (a negative slice bound would
            silently drop entries from the end instead of limiting the list).
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")
    print(f"Top {n} most frequent words after removing stop words:")
    for i, (token, frequency) in enumerate(token_frequencies[:n], start=1):
        print(f"{i}. {token} : {frequency}")

# Input corpus: the same path that the cleaning cell passes to
# clean_and_save_file as its output file.
file_name = "/content/cleaned_text.txt"
# Rank the stems of the non-stop-word tokens, then print the leading entries.
token_frequencies = count_tf_without_stopwords(file_name)
print_top_15(token_frequencies)


Top 15 most frequent words after removing stop words:
1. discrimin : 596
2. data : 292
3. econom : 213
4. group : 204
5. model : 181
6. statist : 154
7. studi : 149
8. racial : 142
9. journal : 140
10. j : 138
11. analysi : 135
12. regress : 134
13. use : 127
14. market : 110
15. employ : 110


In [None]:
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Initialize the Porter stemmer used by count_tf_after_stemming.
# This rebinds the module-level name `ps`, which the stop-word cell also defines.
ps = PorterStemmer()

def count_tf_after_stemming(file_name):
    """Return stem frequencies for a text file, most frequent first.

    The file is decoded as Latin-1, tokenized with ``word_tokenize`` and each
    token is mapped through ``ps.stem`` before the counts are taken.

    Args:
        file_name: Path of the text file to analyse.

    Returns:
        A list of ``(stem, count)`` pairs in descending order of count.
    """
    with open(file_name, 'r', encoding="latin-1") as source:
        contents = source.read()

    words = word_tokenize(contents)
    stem_counts = FreqDist(list(map(ps.stem, words)))

    by_count = list(stem_counts.items())
    # Stable in-place sort on the count, largest first.
    by_count.sort(key=lambda entry: entry[1], reverse=True)
    return by_count

def print_top_15(token_frequencies, n=15):
    """Print the first ``n`` (stem, frequency) pairs as a numbered list.

    Args:
        token_frequencies: Sequence of ``(token, frequency)`` pairs. It is
            printed in the order given; this function does not sort, so pass
            data that is already ranked.
        n: Number of leading entries to print. Defaults to 15, which
            reproduces the original fixed-size listing and header.

    Raises:
        ValueError: If ``n`` is negative (a negative slice bound would
            silently drop entries from the end instead of limiting the list).
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")
    print(f"Top {n} most frequent stemmed words:")
    for i, (token, frequency) in enumerate(token_frequencies[:n], start=1):
        print(f"{i}. {token} : {frequency}")

# Input corpus: the same path that the cleaning cell passes to
# clean_and_save_file as its output file.
file_name = "/content/cleaned_text.txt"
# Rank the stemmed tokens by frequency, then print the leading entries.
token_frequencies = count_tf_after_stemming(file_name)
print_top_15(token_frequencies)


Top 15 most frequent stemmed words:
1. the : 2166
2. of : 1796
3. and : 1179
4. a : 1054
5. in : 955
6. discrimin : 596
7. to : 512
8. for : 419
9. is : 401
10. by : 390
11. on : 357
12. data : 292
13. as : 231
14. from : 215
15. econom : 213


In [None]:
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize

def count_tf_after_case_folding(file_name):
    """Return lowercase-token frequencies for a text file, most frequent first.

    The file is decoded as Latin-1, the whole text is lowercased, and the
    result is tokenized with ``word_tokenize``. Tokens are counted as they
    are; no stemming is applied here.

    Args:
        file_name: Path of the text file to analyse.

    Returns:
        A list of ``(token, count)`` pairs in descending order of count.
    """
    with open(file_name, 'r', encoding="latin-1") as source:
        contents = source.read()

    # Fold case on the full text before tokenizing.
    lowered_words = word_tokenize(contents.lower())

    by_count = list(FreqDist(lowered_words).items())
    # Stable in-place sort on the count, largest first.
    by_count.sort(key=lambda entry: entry[1], reverse=True)
    return by_count

def print_top_15(token_frequencies, n=15):
    """Print the first ``n`` case-folded (token, frequency) pairs as a numbered list.

    Args:
        token_frequencies: Sequence of ``(token, frequency)`` pairs. It is
            printed in the order given; this function does not sort, so pass
            data that is already ranked.
        n: Number of leading entries to print. Defaults to 15, which
            reproduces the original fixed-size listing and header.

    Raises:
        ValueError: If ``n`` is negative (a negative slice bound would
            silently drop entries from the end instead of limiting the list).
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")
    print(f"Top {n} most frequent words after case folding:")
    for i, (token, frequency) in enumerate(token_frequencies[:n], start=1):
        print(f"{i}. {token} : {frequency}")

# Input corpus: the same path that the cleaning cell passes to
# clean_and_save_file as its output file.
file_name = "/content/cleaned_text.txt"
# Rank the lowercased tokens by frequency, then print the leading entries.
token_frequencies = count_tf_after_case_folding(file_name)
print_top_15(token_frequencies)


Top 15 most frequent words after case folding:
1. the : 2166
2. of : 1796
3. and : 1179
4. a : 1054
5. in : 955
6. discrimination : 571
7. to : 512
8. for : 419
9. is : 401
10. by : 390
11. on : 357
12. data : 292
13. as : 231
14. from : 215
15. that : 193


In [None]:
import re

def clean_text(text):
    """Strip punctuation and normalise whitespace in ``text``.

    Two substitutions are applied in order:

    1. Delete every character that is not a word character, whitespace, an
       apostrophe or a hyphen (so "we'll" and "co-operation" stay intact).
    2. Collapse each run of whitespace into a single space.

    Leading and trailing whitespace is then trimmed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    substitutions = (
        (r'[^\w\s\'-]', ''),
        (r'\s+', ' '),
    )
    result = text
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result.strip()

# Function to read the file, clean its contents, and save into a new file
def clean_and_save_file(original_file, new_file):
    """Clean the text of one file and write the result to another.

    The source is read in full as Latin-1, passed through ``clean_text``,
    and the cleaned string is written to ``new_file`` as Latin-1. The
    destination is opened in write mode, so any existing content is replaced.

    Args:
        original_file: Path of the file to read.
        new_file: Path of the file to write.
    """
    with open(original_file, 'r', encoding="latin-1") as source:
        raw_text = source.read()

    # Clean before opening the destination, so a failure in clean_text
    # leaves any existing output file untouched.
    tidy_text = clean_text(raw_text)

    with open(new_file, 'w', encoding="latin-1") as target:
        target.write(tidy_text)

# Example usage: clean the raw survey text and write it to the path that the
# frequency-counting cells use as their input file.
original_file_name = "/content/A multidisciplinary survey on discrimination analysis.txt"
new_file_name = "/content/cleaned_text.txt"
clean_and_save_file(original_file_name, new_file_name)
# Report where the cleaned text was written.
print(f"Cleaned text saved into '{new_file_name}'")


Cleaned text saved into '/content/cleaned_text.txt'
