4.  Extract all incoming email addresses as a list from the email_exchange_big.txt file.

In [None]:
import re

def extract_email_addresses(file_path):
    with open(file_path, 'r') as file:
        content = file.read()
    
    # Use regex to find all email addresses
    email_addresses = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', content)
    
    return email_addresses

# Example usage
email_addresses = extract_email_addresses('email_exchange_big.txt')
print(email_addresses)

5. Find the most common words in the English language. Call the name of your function find_most_common_words, it will take two parameters - a string or a file and a positive integer, indicating the number of words. Your function will return an array of tuples in descending order. Check the output


In [10]:
from collections import Counter
import re

def find_most_common_words(text_or_file, num_words):
    # Check if the input is a file path or a string
    try:
        with open(text_or_file, 'r') as file:
            text = file.read()
    except FileNotFoundError:
        text = text_or_file
    
    # Use regex to find all words
    words = re.findall(r'\b\w+\b', text.lower())
    
    # Count the frequency of each word
    word_counts = Counter(words)
    
    # Get the most common words
    most_common_words = word_counts.most_common(num_words)
    
    return most_common_words

# Example usage
text = "This is a test. This test is only a test."
print(find_most_common_words(text, 3))

[('test', 3), ('this', 2), ('is', 2)]


6. Use the function, find_most_frequent_words to find: a) The ten most frequent words used in Obama's speech b) The ten most frequent words used in Michelle's speech c) The ten most frequent words used in Trump's speech d) The ten most frequent words used in Melina's speech


In [11]:
# Assuming the speeches are stored in text files
obama_speech = 'obama_speech.txt'
michelle_speech = 'michelle_speech.txt'
trump_speech = 'trump_speech.txt'
melania_speech = 'melania_speech.txt'

# Find the ten most frequent words in each speech
obama_common_words = find_most_common_words(obama_speech, 10)
michelle_common_words = find_most_common_words(michelle_speech, 10)
trump_common_words = find_most_common_words(trump_speech, 10)
melania_common_words = find_most_common_words(melania_speech, 10)

print("Obama's speech:", obama_common_words)
print("Michelle's speech:", michelle_common_words)
print("Trump's speech:", trump_common_words)
print("Melania's speech:", melania_common_words)

Obama's speech: [('the', 129), ('and', 113), ('of', 81), ('to', 70), ('our', 67), ('we', 62), ('that', 50), ('a', 48), ('is', 36), ('in', 25)]
Michelle's speech: [('and', 96), ('the', 85), ('to', 84), ('that', 50), ('of', 46), ('a', 41), ('he', 37), ('in', 36), ('my', 28), ('i', 28)]
Trump's speech: [('the', 65), ('and', 59), ('we', 44), ('will', 40), ('of', 38), ('to', 32), ('our', 30), ('is', 20), ('america', 17), ('for', 13)]
Melania's speech: [('and', 77), ('to', 55), ('the', 52), ('is', 29), ('i', 28), ('for', 27), ('of', 25), ('that', 24), ('a', 22), ('you', 21)]


7. Write a python application that checks similarity between two texts. It takes a file or a string as a parameter and it will evaluate the similarity of the two texts. For instance check the similarity between the transcripts of Michelle's and Melina's speech. You may need a couple of functions, function to clean the text(clean_text), function to remove support words(remove_support_words) and finally to check the similarity(check_text_similarity). List of stop words are in the data directory


In [12]:
import re
from collections import Counter
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def clean_text(text):
    # Remove non-alphanumeric characters and convert to lowercase
    text = re.sub(r'\W+', ' ', text).lower()
    return text

def remove_support_words(text):
    # Remove stop words
    words = text.split()
    words = [word for word in words if word not in ENGLISH_STOP_WORDS]
    return ' '.join(words)

def check_text_similarity(text1, text2):
    # Clean and remove support words from both texts
    text1 = remove_support_words(clean_text(text1))
    text2 = remove_support_words(clean_text(text2))
    
    # Vectorize the texts
    vectorizer = TfidfVectorizer().fit_transform([text1, text2])
    vectors = vectorizer.toarray()
    
    # Compute cosine similarity
    similarity = cosine_similarity(vectors)
    return similarity[0][1]

# Example usage
with open(michelle_speech, 'r') as file:
    michelle_text = file.read()

with open(melania_speech, 'r') as file:
    melania_text = file.read()

similarity_score = check_text_similarity(michelle_text, melania_text)
print(f"Similarity between Michelle's and Melania's speeches: {similarity_score}")

Similarity between Michelle's and Melania's speeches: 0.2665131829446272


8. Find the 10 most repeated words in the romeo_and_juliet.txt


In [13]:
# Find the 10 most repeated words in the romeo_and_juliet.txt
romeo_and_juliet = 'romeo_and_juliet.txt'
romeo_and_juliet_common_words = find_most_common_words(romeo_and_juliet, 10)

print("Romeo and Juliet:", romeo_and_juliet_common_words)

Romeo and Juliet: [('romeo_and_juliet', 1), ('txt', 1)]


9. Read the hacker news csv file and find out: a) Count the number of lines containing python or Python b) Count the number lines containing JavaScript, javascript or Javascript c) Count the number lines containing Java and not JavaScript



In [14]:
import pandas as pd

# Read the hacker news CSV file
hacker_news_df = pd.read_csv('hacker_news.csv')

# Count the number of lines containing 'python' or 'Python'
python_count = hacker_news_df['title'].str.contains(r'\b[Pp]ython\b', regex=True).sum()

# Count the number of lines containing 'JavaScript', 'javascript' or 'Javascript'
javascript_count = hacker_news_df['title'].str.contains(r'\b[Jj]ava[Ss]cript\b', regex=True).sum()

# Count the number of lines containing 'Java' and not 'JavaScript'
java_count = hacker_news_df['title'].str.contains(r'\b[Jj]ava\b', regex=True) & ~hacker_news_df['title'].str.contains(r'\b[Jj]ava[Ss]cript\b', regex=True)
java_count = java_count.sum()

print(f"Number of lines containing 'python' or 'Python': {python_count}")
print(f"Number of lines containing 'JavaScript', 'javascript' or 'Javascript': {javascript_count}")
print(f"Number of lines containing 'Java' and not 'JavaScript': {java_count}")

KeyError: 'title'