<a href="https://colab.research.google.com/github/alammobaDar/Data_Scraping/blob/main/Activity3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***INSTALL LIBRARIES***

In [1]:
!pip install requests
!pip install beautifulsoup4
!pip install pandas



# ***IMPORT LIBRARIES***

In [41]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import re

# ***DATA EXTRACTION AND CLEANING***

In [109]:
urls = [
    'https://www.cbr.com/captain-americas-best-worst-traits/',
    'https://screenrant.com/captain-america-mcu-character-traits/',
    'https://www.sideshow.com/blog/the-top-10-captain-america-quotes-in-the-mcu/',
]

def clean_text(text):
    unwanted_patterns = [
        r'Want \$\d+ off\?Sign up for the newsletter',
        r'\d+',  # Remove numbers
        r'Best: ',  # Remove 'Best: ' text
        r'Worst:',  # Remove 'Worst: ' text
        r'“',
        r'”',
    ]
    for pattern in unwanted_patterns:
        text = re.sub(pattern, '', text)
    return text.strip()

def extract_data(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    time.sleep(2)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        h2_tags = soup.find_all('h2')
        info = []
        for h2 in h2_tags:
            text = h2.get_text(strip=True)
            cleaned_text = clean_text(text)

            # Only add non-empty strings
            if cleaned_text:
                info.append(cleaned_text)

        return info
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None

data = [extract_data(url) for url in urls]
flat_data = [item for sublist in data if sublist for item in sublist]

# Create a DataFrame with each item as a row
df_info = pd.DataFrame(flat_data, columns=['Info'])
df_info.to_csv('captain_america_personality.csv', index=False)

print(df_info)

                                                 Info
0                                       Sense of Duty
1                                     Outdated Ideals
2                                            Humility
3                                     Pride In Battle
4                         Honest and Straight Forward
5                                     He's A Bad Liar
6                                      Protectiveness
7                                 Attachment To Bucky
8                 Willingness To Do Whatever It Takes
9                         Doesn't See The Big Picture
10                         He Was Incredibly Stubborn
11                            He Was Remarkably Brave
12                               He Was Compassionate
13                  He Was Rational And Down To Earth
14                                  He Was Perceptive
15                            He Was A Natural Leader
16                          He Was Awkward With Women
17                         H

# ***ANOTHER WEBSITE***

In [108]:
urls = [
    'https://everydaypower.com/captain-america-quotes/'
]

def clean_text(text):
    unwanted_patterns = [
        r'– Steve Rogers \(Captain America\)',
        r'You will also enjoy our article on[^.]*\.',
        r'\d+. ',
        r'Check out our collection ofCaptain Price quotesif you enjoy this article.',
        r'Don’t forget to also check out thesePennywise quotesfrom the scariest clown of all.',
        r'“',
        r'”',


    ]
    for pattern in unwanted_patterns:
        text = re.sub(pattern, '', text)
    return text

def extract_data(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    time.sleep(2)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        h2_tags = soup.find_all('h2')
        info = []
        for h2 in h2_tags:
            if h2.get('id') == "h-what-s-your-favorite-captain-america-movie-nbsp" or h2.get('id') == "h-are-you-an-avid-comic-book-reader":
                continue
            next_sibling = h2.find_next_sibling()
            while next_sibling and next_sibling.name != 'h2':
                if next_sibling.name == 'p':
                    text = next_sibling.get_text(strip=True)
                    cleaned_text = clean_text(text)
                    if cleaned_text:
                        info.append(cleaned_text)
                next_sibling = next_sibling.find_next_sibling()
        return info
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None

data = [extract_data(url) for url in urls]
flat_data = [item for sublist in data if sublist for item in sublist]

# Create a DataFrame with each item as a row
df_info = pd.DataFrame(flat_data, columns=['Info'])
df_info.to_csv('captain_america_quotes.csv', index=False)
print(df_info)

                                                 Info
0   I don’t want to kill anyone. I don’t like bull...
1                    The hell I can’t! I’m a Captain!
2              I knocked out Adolf Hitler over times.
3   It’s probably too late to go to the bathroom, ...
4                               I’m… Captain America.
5   I know this neighborhood. I got beat up in tha...
6   Dr. Erskine said that the serum wouldn’t just ...
7   You ready to follow Captain America into the j...
8   You know for the longest time I dreamed about ...
9   Where I’m goin’, if anybody yells at me I can ...
10  You start running, they’ll never let you stop....
11  There are men laying down their lives. I got n...
12  I think this is the longest conversation I’ve ...
13  Before we get started, does anyone want to get...
14  The price of freedom is high. It always has be...
15   It was not my first kiss since I’m I’m not dead.
16  Soldiers trust each other. That’s what makes i...
17              Even when I 

# ***MERGE TWO DATASETS***

In [128]:
import pandas as pd
import csv  # Import the csv module for quoting options

# Load data from CSV files
df_info_01 = pd.read_csv('captain_america_personality.csv')
df_info_02 = pd.read_csv('captain_america_quotes.csv')

# Concatenate the DataFrames
df_merged = pd.concat([df_info_01, df_info_02], ignore_index=True)

# Set custom index
df_merged.index = [f"{i+1}." for i in range(len(df_merged))]

# Save DataFrame to TXT file with custom separator and without quotation marks
df_merged.to_csv(
    'captain_america_dataset_personality.txt',
    sep='\t',                # Use a single space as the column separator
    index=True,            # Include the DataFrame index
    header=False,          # Exclude column names
    quoting=csv.QUOTE_NONE  # Disable quoting of strings
)

print(df_merged)


                                                  Info
1.                                       Sense of Duty
2.                                     Outdated Ideals
3.                                            Humility
4.                                     Pride In Battle
5.                         Honest and Straight Forward
6.                                     He's A Bad Liar
7.                                      Protectiveness
8.                                 Attachment To Bucky
9.                 Willingness To Do Whatever It Takes
10.                        Doesn't See The Big Picture
11.                         He Was Incredibly Stubborn
12.                            He Was Remarkably Brave
13.                               He Was Compassionate
14.                  He Was Rational And Down To Earth
15.                                  He Was Perceptive
16.                            He Was A Natural Leader
17.                          He Was Awkward With Women
18.       