# Code for connecting the notebook to Google Drive

In [None]:
# Drive helper shipped in the google.colab package.
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV output paths used later in this
# notebook are located under this mount point.
drive.mount('/content/drive')

# Code for scraping movie-related data


In [None]:
def get_movie_titles(url, letter, timeout=30):
    """Collect the movie-page links on an index page whose titles start with ``letter``.

    The page at ``url`` is downloaded and every ``<a href>`` of the form
    ``/movie/<name>.htm`` is inspected. The title is derived from the last
    path segment (suffix removed, underscores turned into spaces) and compared
    case-insensitively against ``letter``.

    Args:
        url: Address of the index page to download.
        letter: Prefix the derived title must start with (any case).
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A sorted list of unique relative hrefs (e.g. ``/movie/zakhm.htm``).
        Sorting makes the result independent of set iteration order, so the
        scrape order and page-to-page comparisons are reproducible.

    Raises:
        requests.exceptions.RequestException: If the download fails or times out.
    """
    # The HTTP status is intentionally not checked: callers detect the end of
    # pagination by receiving a page that yields no (or repeated) links.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    suffix = '.htm'
    prefix = letter.lower()
    movies = set()

    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.startswith('/movie/') and href.endswith(suffix):
            # Derive a human-readable title from the file name in the URL.
            title = href.split('/')[-1][:-len(suffix)].replace('_', ' ')
            if title.lower().startswith(prefix):
                movies.add(href)

    return sorted(movies)

In [None]:
def get_movie_details(url, timeout=30):
    """Scrape one movie page into a flat dictionary.

    Args:
        url: Address of the movie page to download.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A dict containing:
          * ``'Title'`` and ``'Year'`` parsed from the first
            ``<a itemprop="url">`` whose href contains ``/movie/``
            (``'Unknown'`` when missing),
          * one entry per two-cell row of the ``b1 allef w100p`` table, keyed
            by the first cell's text with colons removed,
          * ``'Lyricist'`` whenever a row's first cell contains ``"Lyricist:"``,
          * ``'External Links'``: comma-separated hrefs of ``itemprop="sameAs"``
            anchors (empty string when there are none),
          * ``'Watch Full Movie'``: href next to the "Watch Full Movie:" cell,
            or ``'Not Available'``.

    Raises:
        requests.exceptions.RequestException: If the download fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Title and year come from link text shaped like "Title (Year)".
    title = 'Unknown'
    year = 'Unknown'
    title_year_element = soup.find('a', itemprop="url", href=lambda href: href and "/movie/" in href)
    if title_year_element is not None:
        title_year_text = title_year_element.get_text()
        if '(' in title_year_text and ')' in title_year_text:
            title = title_year_text.split('(')[0].strip()
            year = title_year_text.split('(')[-1].split(')')[0]
        else:
            title = title_year_text.strip()

    # Key/value rows of the details table.
    details = {'Title': title, 'Year': year}
    table = soup.find('table', class_='b1 allef w100p')
    if table is not None:
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            # Rows with fewer than two <td> cells carry no key/value pair;
            # indexing them would raise IndexError.
            if len(cells) < 2:
                continue
            if len(cells) == 2:
                key = cells[0].get_text(strip=True).replace(':', '')
                value = cells[1].get_text(strip=True)
                if key and value:
                    details[key] = value
            if "Lyricist:" in cells[0].get_text():
                details['Lyricist'] = cells[1].get_text(strip=True)

    # External links: skip anchors that carry no href instead of failing.
    external_links = [
        elem.get('href')
        for elem in soup.find_all('a', itemprop="sameAs")
        if elem.get('href')
    ]
    details['External Links'] = ', '.join(external_links)

    # 'Watch Full Movie' link: every step of the lookup may come back empty.
    watch_movie_link = 'Not Available'
    watch_movie_element = soup.find(lambda tag: tag.name == "td" and "Watch Full Movie:" in tag.text)
    if watch_movie_element is not None:
        link_cell = watch_movie_element.find_next_sibling('td')
        anchor = link_cell.find('a') if link_cell is not None else None
        if anchor is not None and anchor.get('href'):
            watch_movie_link = anchor.get('href')
    details['Watch Full Movie'] = watch_movie_link

    return details

In [None]:

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Index letters to scrape. For a full run use `list(string.ascii_lowercase)`
# (this needs `import string`, which the notebook does not import yet).
# A single letter is used here for testing purposes.
alphabets = ['z']

all_movie_details = []

for letter in alphabets:
    page = 1
    previous_movies = None
    while True:
        url = f"https://www.hindilyrics4u.com/movie/{letter}.php" + ("" if page == 1 else f"?page={page}")
        movies = get_movie_titles(url, letter)

        # Stop paginating once a page is empty or repeats the previous page.
        if not movies or movies == previous_movies:
            break

        for movie_path in movies:
            movie_url = "https://www.hindilyrics4u.com" + movie_path
            print(movie_url)  # progress indicator
            movie_details = get_movie_details(movie_url)
            all_movie_details.append(movie_details)

        previous_movies = movies
        page += 1

# Convert the list of dictionaries to a DataFrame in one go.
df_movies = pd.DataFrame(all_movie_details)

movies_csv_path = '/content/drive/MyDrive/Fall 2023/Applied Data Science/datasets/m3/movie_final123.csv'

# Save to CSV and report the path that was actually written.
df_movies.to_csv(movies_csv_path, index=False)
print(f"Movies data saved to '{movies_csv_path}'.")



# Code for scraping song-related data

In [None]:

def get_artist_names(url, letter):
    """Return the singer names listed on an index page that start with ``letter``.

    Every ``<a>`` whose href contains ``/singer/`` is considered. The link
    text is cut at the first ``(`` (dropping the trailing count) and trimmed.
    A name is kept when it begins with ``letter`` (case-insensitive) and is
    longer than one character. Names are returned in page order; duplicates
    are not removed.
    """
    def _points_to_singer(href):
        return href and "/singer/" in href

    page = requests.get(url)
    parsed = BeautifulSoup(page.content, 'html.parser')
    singer_links = parsed.find_all('a', href=_points_to_singer)

    # Link text without the "(count)" suffix and surrounding whitespace.
    candidates = (anchor.get_text().split('(')[0].strip() for anchor in singer_links)

    return [
        candidate
        for candidate in candidates
        if candidate.lower().startswith(letter.lower()) and len(candidate) > 1
    ]

In [None]:
import requests
from bs4 import BeautifulSoup

def get_songs_by_artist(base_url, artist_name):
    """Walk an artist's paginated song listing and return every song found.

    The page address is ``{base_url}/{artist}.php`` where ``artist`` is
    ``artist_name`` lower-cased with spaces replaced by underscores; pages
    after the first add ``?page=N``. Each visited URL is printed. Paging ends
    when a page yields no songs or yields exactly the songs of the page
    before it.

    Returns:
        A list of dicts with the keys ``'Song Title'``, ``'Rating'``,
        ``'Votes'`` and ``'Movie'``. Fields missing from a row are reported
        as ``'Unknown'``; rows without a song title are skipped.
    """
    def _span_text(row, prop):
        # Text of the row's <span itemprop=prop>, or the placeholder.
        span = row.find('span', itemprop=prop)
        return span.get_text() if span else 'Unknown'

    slug = artist_name.replace(' ', '_').lower()
    collected = []
    last_batch = None
    page = 1

    while True:
        suffix = "" if page == 1 else f"?page={page}"
        url = f"{base_url}/{slug}.php" + suffix
        print(url)
        reply = requests.get(url)
        parsed = BeautifulSoup(reply.content, 'html.parser')

        batch = []
        for row in parsed.find_all('tr', itemprop="track"):
            title_span = row.find('span', itemprop="name")
            if not title_span:
                continue
            batch.append({
                'Song Title': title_span.get_text(),
                'Rating': _span_text(row, "ratingValue"),
                'Votes': _span_text(row, "ratingCount"),
                'Movie': _span_text(row, "inAlbum"),
            })

        # An empty or repeated page marks the end of the listing.
        if not batch or batch == last_batch:
            return collected

        collected.extend(batch)
        last_batch = batch
        page += 1


In [None]:
import pandas as pd


# Song scrape driver: list the artists for each index letter, then fetch every
# artist's songs and write the combined table to Google Drive.
base_url = "https://www.hindilyrics4u.com/singer/{}.php"
base_artist_url = "https://www.hindilyrics4u.com/singer"
# alphabets = list(string.ascii_lowercase) #would work for all alphabets
alphabets = ['z']  # for testing purposes

all_artists = []

# Step 1: gather artist names page by page for each letter.
for letter in alphabets:
    page = 1
    previous_names = None

    while True:
        if page == 1:
            url = base_url.format(letter) + ""
        else:
            url = base_url.format(letter) + f"?page={page}"
        artist_names = get_artist_names(url, letter)

        # An empty or repeated page means there is nothing further to read.
        if not artist_names or artist_names == previous_names:
            break

        all_artists += artist_names
        previous_names = artist_names
        page += 1

# Step 2: one output record per (artist, song) pair.
data = []
for artist in all_artists:
    songs = get_songs_by_artist(base_artist_url, artist.replace(' ', '_'))
    data.extend(
        {
            'Artist': artist,
            'Song Title': song['Song Title'],
            'Rating': song['Rating'],
            'Number of Votes': song['Votes'],
            'Movie Title': song['Movie'],
        }
        for song in songs
    )

# Step 3: build the DataFrame once and persist it as CSV in Google Drive.
df = pd.DataFrame(data)
df.to_csv('/content/drive/MyDrive/Fall 2023/Applied Data Science/datasets/songs_final.csv', index=False)
