In [1]:
# Import necessary libraries
from selenium import webdriver  # WebDriver for browser control
from selenium.webdriver.common.by import By  # Element locators
from selenium.webdriver.common.keys import Keys  # Simulate keyboard input
from dotenv import load_dotenv  # Load environment variables from .env file
import os, time, csv  # os for OS interaction, time for delays, csv for writing files

## Single Thread (Without Thread Pool)

In [2]:
# Load environment variables from the .env file
load_dotenv()

# Retrieve the environment variables for Twitter login credentials
username = os.getenv('TWITTER_USERNAME')
password = os.getenv('TWITTER_PASSWORD')

# Set up Chrome WebDriver options (customizing browser behavior if needed)
options = webdriver.ChromeOptions()
# Initialize the WebDriver with the specified options
driver = webdriver.Chrome(options=options)

# Open the Twitter login page in the browser
driver.get('https://twitter.com/login')
# Wait for the page to load before interacting with it
time.sleep(5)

# Find the username input field and enter the username
username_field = driver.find_element(By.NAME, 'text')
username_field.send_keys(username) # Type in the username
username_field.send_keys(Keys.RETURN) # Press 'Enter' to submit the username
time.sleep(3) # Wait for the password field to load

# Find the password input field and enter the password
password_field = driver.find_element(By.NAME, 'password')
password_field.send_keys(password) # Type in the password
password_field.send_keys(Keys.RETURN) # Press 'Enter' to submit the password
time.sleep(5) # Wait for the login to complete

# Navigate to a specific user's Twitter timeline (e.g., Times of India)
driver.get('https://twitter.com/timesofindia')
# Wait for the timeline to load and display the user's tweets
time.sleep(10) # Adjust this time if needed based on the page load speed

In [3]:
# Extract tweets from the current page using XPath
page1_tweets = driver.find_elements(By.XPATH, '//article[@data-testid="tweet"]//div[@lang]')
page1_tweets_list = [tweet.text for tweet in page1_tweets]

In [4]:
# Print tweet count
print(f"Number of tweets found: {len(page1_tweets_list)}")

Number of tweets found: 3


In [5]:
# Print the extracted tweets
for tweet in page1_tweets_list:
    print(tweet)

LIVE | PM Narendra Modi responds to debate on Constitution amid BJP-Congress standoff

#NarendraModi #Parliament #LokSabha #ParliamentWinterSession
Watch | #LokSabha Speaker XI beat #RajyaSabha Chairman XI by 73 runs in the friendly cricket match of Parliamentarians

BJP MP #AnuragThakur says, "We will organise such matches in various states where MPs team will play with MLAs too in future so that every public representative
Karnataka home minister G Parameshwara has called the suicide of Bengaluru techie #SubhashAtul an eye-opener on the state of men’s rights in the country. 

More details  http://toi.in/IyJ28Z20 

#BengaluruTechie


In [6]:
# Save tweets_list to a CSV file
csv_file = 'page1_tweets_list.csv'
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['extracted tweets'])  # Header
    for tweet in page1_tweets_list:
        writer.writerow([tweet])

In [7]:
def extract_tweets(driver, num_tweets):
    # Initialize an empty list to store extracted tweets
    tweets_list = []
    
    # Get the initial scroll height of the page
    last_height = driver.execute_script("return document.body.scrollHeight")
    
    # Continue scrolling and extracting tweets until we reach the desired number
    while len(tweets_list) < num_tweets:
        # Scroll down to the bottom of the page to load more tweets
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        
        # Wait for the page to load new tweets
        time.sleep(5)  # Adjust this time based on page load speed

        # Extract tweets from the loaded page
        tweets = driver.find_elements(By.XPATH, '//article[@data-testid="tweet"]//div[@lang]')
        
        # Iterate over each tweet and extract its text
        for tweet in tweets:
            tweet_text = tweet.text
            if tweet_text and tweet_text not in tweets_list:  # Avoid adding duplicates
                tweets_list.append(tweet_text)
            
            # If we have reached the desired number of tweets, stop extracting
            if len(tweets_list) >= num_tweets:
                break
        
        # Check if we've reached the bottom of the page (i.e., no new tweets loaded)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break  # Exit the loop if no new tweets were loaded
        
        # Update the last height for the next iteration
        last_height = new_height

    # Return the list of extracted tweets
    return tweets_list

In [8]:
# Extract 100 tweets
tweets_list_100 = extract_tweets(driver, 100)

**Execution Time 1m 12.5s**

In [9]:
# Print tweet count
print(f"Number of tweets found: {len(tweets_list_100)}")

Number of tweets found: 100


In [10]:
# Print the extracted tweets
for tweet in tweets_list_100:
    print(tweet)

Watch | Maharashtra CM #DevendraFadnavis pays floral tribute to Dr Bhim Rao Ambedkar in #Nagpur as he conducts a roadshow here. The state cabinet expansion will take place today.
#HistoryMeetsAI | On December 15, 1903, street vendor Italo Marchiony received a U.S. patent for inventing a machine to make #icecream cones.

Marchiony, who moved from Italy in the late 1800s, invented his ice cream cone in New York City.
#BorderGavaskarTrophy #INDvAUS #INDvsAUS | 3rd Test

#JaspritBumrah bowls a double wicket over for his 12th five-wicket haul 

WATCH  http://toi.in/q-oPQZ/a24gk
On today's episode of Beyond The Boundary, we discuss Day  of the Gabba Test:

- Did  Rohit Sharma make a mistake in fielding first?

-  Travis Head, Steve Smith slam centuries, stitch 241 runs for the fourth wicket

- Again: Jasprit Bumrah and who else  for Indian
A 32-year-old computer operator in #Surat, India, severed four fingers of his left hand due to work stress and family pressure to remain employed. 

More 

In [11]:
# Save tweets to a CSV file
csv_file = 'tweets_list_100.csv'
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['extracted tweets'])  # Header
    for tweet in tweets_list_100:
        writer.writerow([tweet])

## Thread Pooling: Without using a thread pool, Python’s default execution model will be single-threaded, meaning one operation is done at a time. With threads (including thread pools), tasks are managed concurrently, and multiple operations can run simultaneously, which is particularly useful when dealing with I/O-bound tasks like web scraping.

In [37]:
# Load environment variables from the .env file
load_dotenv()

# Retrieve the environment variables for Twitter login credentials
username = os.getenv('TWITTER_USERNAME')
password = os.getenv('TWITTER_PASSWORD')

# Set up Chrome WebDriver options (customizing browser behavior if needed)
options = webdriver.ChromeOptions()
# Initialize the WebDriver with the specified options
driver = webdriver.Chrome(options=options)

# Open the Twitter login page in the browser
driver.get('https://twitter.com/login')
# Wait for the page to load before interacting with it
time.sleep(5)

# Find the username input field and enter the username
username_field = driver.find_element(By.NAME, 'text')
username_field.send_keys(username) # Type in the username
username_field.send_keys(Keys.RETURN) # Press 'Enter' to submit the username
time.sleep(3) # Wait for the password field to load

# Find the password input field and enter the password
password_field = driver.find_element(By.NAME, 'password')
password_field.send_keys(password) # Type in the password
password_field.send_keys(Keys.RETURN) # Press 'Enter' to submit the password
time.sleep(5) # Wait for the login to complete

# Navigate to a specific user's Twitter timeline (e.g., Times of India)
driver.get('https://twitter.com/timesofindia')
# Wait for the timeline to load and display the user's tweets
time.sleep(10) # Adjust this time if needed based on the page load speed

In [38]:
from selenium.common.exceptions import StaleElementReferenceException

def extract_tweets(driver, num_tweets):
    tweets_list = []
    last_height = driver.execute_script("return document.body.scrollHeight")

    while len(tweets_list) < num_tweets:
        # Scroll down to load more tweets
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)  # Wait for new tweets to load

        # Re-locate tweet elements after scrolling
        try:
            tweets = driver.find_elements(By.XPATH, '//article[@data-testid="tweet"]//div[@lang]')
            for tweet in tweets:
                tweet_text = tweet.text
                if tweet_text and tweet_text not in tweets_list:  # Avoid duplicates
                    tweets_list.append(tweet_text)
                if len(tweets_list) >= num_tweets:
                    break
        except StaleElementReferenceException:
            continue  # Skip and re-fetch elements in the next iteration

        # Check if the page is scrolled to the bottom
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break  # No new tweets loaded, exit the loop

        last_height = new_height

    return tweets_list

In [33]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

def extract_tweets_parallel(driver, num_tweets, num_threads):
    # Thread-local storage to store tweets for each thread
    thread_local = threading.local()
    
    def task():
        # Initialize a local list for each thread
        thread_local.tweets_list = []
        new_tweets = extract_tweets(driver, num_tweets // num_threads)
        thread_local.tweets_list.extend(new_tweets)
        return thread_local.tweets_list
    
    tweets_list = []
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(task) for _ in range(num_threads)]
        for future in as_completed(futures):
            tweets_list.extend(future.result())  # Merge each thread's result

    return tweets_list

# Extract 100 tweets using 5 threads
tweets_list_100_parallel = extract_tweets_parallel(driver, 100, 5)

**Execution Time 22.7s**

In [34]:
# Print tweet count
print(f"Number of tweets found: {len(tweets_list_100_parallel)}")

Number of tweets found: 100


In [35]:
# Print the extracted tweets
for tweet in tweets_list_100_parallel:
    print(tweet)

Ousted Bangladesh PM #SheikhHasina involved in forced disappearances: Interim government's commission 

More details  http://toi.in/_QdACY25
"Bas kar bhai, aur kitna rulayega!"

Fans call out Team India's cluelessness against Travis Head 

READ: http://toi.in/94s78Z/a24gk 

#INDvAUS #TravisHead #AUSvIND #GabbaTest
#BorderGavaskarTrophy #INDvAUS #INDvsAUS | 3rd Test

#TravisHead claims unique Test record with blitzkrieg hundred against #India 

More Here   http://toi.in/ll0wWa/a24gk
#Mumbai | Tragedy struck again for the BEST bus service as a 25-year-old motorcyclist, Vindo Dikshit, lost his life in a collision near Govandi's Shivaji Nagar junction.

More details  http://toi.in/D3gUVa
#INDvsAUS #AUSvsIND #GabbaTest

Centurions Head, Smith take Australia to 405/7 at stumps

Fifer for Bumrah
'You don’t require 100% technique to get runs'

Former RCB star on Travis Head's innings against India

READ  http://toi.in/blVnrb

#TravisHead #IndianCricket #INDvsAUS #INDvAUS #Cricket
#INDvsAUS #AU

In [36]:
# Save tweets to a CSV file
csv_file = 'tweets_list_100_parallel.csv'
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['extracted tweets'])  # Header
    for tweet in tweets_list_100_parallel:
        writer.writerow([tweet])

# ATTEMPTING TO EXTRACT 1000 TWEETS

In [39]:
# Extract 1000 tweets using 10 threads
tweets_list_1000_parallel = extract_tweets_parallel(driver, 1000, 10)

- **Execution Time 1m 27.6s**
- **The extraction process stopped as the end of the page was reached, with no more tweets available.**

In [40]:
# Print tweet count
print(f"Number of tweets found: {len(tweets_list_1000_parallel)}")

Number of tweets found: 712


In [41]:
# Print the extracted tweets
for tweet in tweets_list_1000_parallel:
    print(tweet)

'Resign immediately'

Rohit Sharma under fire as Australia dominate India at The Gabba

READ  http://toi.in/lAHa3b

#INDvsAUS #INDvAUS #Gabba #RohitSharma #IndianCricket
#Watch | BJP leader #RameshPahelwan and his wife and party leader Kusumlata Ramesh join #AAP in the presence of AAP national convener #ArvindKejriwal
#INDvsAUS #AUSvsIND #GabbaTest

BIG WICKET! 
@Jaspritbumrah93
 completes his five-wicket haul 

Dismisses Marsh and Head (152 off 160 balls) in one over

Australia 327/6 in 86.5 overs

Follow for live updates:
#Zelensky sacks general for 'failing to stop' Russian advance in Pokrovsk; #Ukraine stares at big loss

Watch
#Kolkata | A day after the recovery of a severed head of a woman from a vat in #Tollygunge's Graham Road area, the nine-member Special Investigation Team (SIT) probing the case claimed to have cracked the case with the arrest of a house painter from Laskarpara in Diamond Harbour.
#INDvsAUS #AUSvsIND #GabbaTest

Edged & Gone! 

Second wicket for Jasprit Bumra

In [42]:
# Save tweets to a CSV file
csv_file = 'tweets_list_1000_parallel.csv'
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['extracted tweets'])  # Header
    for tweet in tweets_list_1000_parallel:
        writer.writerow([tweet])