In [8]:
import requests
from bs4 import BeautifulSoup
import csv
import re

categories = [
    "golden_palm",
    "grand_prix",
    "international_jury_prize",
    "best_director",
    "best_actor",
    "best_actress",
    "best_supporting_actor",
    "best_supporting_actress",
    "best_screenplay",
    "best_art_direction",
    "best_cinematography",
    "best_music",
    "best_artistic_contribution",
    "special_mention",
    "international_prize",
    "fipresci",
    "camera_d_or",
    "special_golden_palm",
    "best_color",
    "best_animation_design",
    "best_romantic_documentary",
    "best_human_document",
    "best_social_film",
    "best_psychological_love_film",
    "best_poetic_humor",
    "best_adventure_crime_film",
    "best_musical_comedy",
    "best_lyrical_film",
    "jury_prize",
    "special_jury_prize",
    "prize_ecumenical",
    "prize_ecumenical_mention",
    "artist_technician_prize",
    "technical_grand_prize",
    "special_award",
    "young_cinema_award",
    "audience_award",
    "ocic_award",
    "ocic_award_special_mention",
    "best_short_film",
    "short_film_jury_prize",
    "best_documentary_short_film",
    "international_peace_award",
    "un_certain_regard_best_film",
    "un_certain_regard_jury_prize",
    "un_certain_regard_special_distinction",
    "un_certain_regard_poetic_narrative",
    "un_certain_regard_directing_prize",
    "un_certain_regard_screenplay",
    "un_certain_regard_best_performance",
    "un_certain_regard_best_actress",
    "un_certain_regard_best_actor",
    "un_certain_regard_best_cast",
    "un_certain_regard_fipresci",
    "un_certain_regard_jurys_coup_de_coeur",
    "critics_week_grand_prize",
    "75anniversary",
    "premiere",
    "special_screenings",
    "midnight_screenings",
    "best_european_film_quinzaine"
]

def remove_text_in_parentheses(text):
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r"'[^']*'", '', text)
    text = re.sub(r'"[^"]*"', '', text)
    return text.strip()

def scrape_winners(cat_id):
    url = f"https://www.filmaffinity.com/en/awards-history.php?cat-id=cannes_{cat_id}"
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL for category {cat_id}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the wrapper with the awards
    wrapper = soup.find('div', {'class': 'wrapper'})

    # Check if the wrapper was found
    if wrapper is None:
        print(f"Error: Couldn't find a wrapper with class 'wrapper' for category {cat_id}")
        return []

    # Find all the list items in the wrapper
    list_items = wrapper.find_all('li', class_=lambda x: x != 'empty-box')

    # Extract the information from each list item
    winners = []
    for item in list_items:
        year_elem = item.find('div', {'class': 'year'})
        movie_title_elem = item.find('span', {'class': 'movie-title-link'})
        director_elem = item.find('div', {'class': 'director'})
        cast_elem = item.find('div', {'class': 'cast'})
        
        if year_elem and movie_title_elem and director_elem and cast_elem:
            year = year_elem.text.strip()
            movie_title = movie_title_elem.text.strip()
            # Remove text within parentheses and quotation marks from the movie title
            movie_title = remove_text_in_parentheses(movie_title)
            director = director_elem.text.strip()
            cast = cast_elem.text.strip()
            winners.append([year, movie_title, director, cast])

    return winners

# Loop through all the categories and scrape the winners
with open('cannes_film_affinity_winners.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Category', 'Year', 'Movie Title', 'Director', 'Cast'])

    for cat_id in categories:
        winners = scrape_winners(cat_id)
        for winner in winners:
            writer.writerow(winner)


Error fetching URL for category prize_ecumenical: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


In [4]:
import requests
from bs4 import BeautifulSoup
import csv
import re

urls = [
"https://www.filmaffinity.com/en/awards-history.php?cat-id=moscow_best_actress",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=moscow_special_mention",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=moscow_best_documentary",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=moscow_special_prize",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=moscow_audience_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=moscow_fipresci_prize",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=moscow_russian_critics_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=moscow_best_short_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=moscow_best_film_perspectives",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=moscow_special_mention_perspectives",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=roma_best_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=roma_grand_jury_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=roma_special_jury_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=roma_best_director",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=roma_best_actress",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=roma_best_actor",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=roma_best_screenplay",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=roma_best_original_score",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=roma_audience_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=roma_best_newcomer",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=roma_best_technical_contribution",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=roma_best_debut",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=roma_best_documentary",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=gijon_mejor_pelicula",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=gijon_mejor_pelicula_retueyos",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=gijon_mejor_pelicula_albar",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=gijon_mejor_pelicula_trance",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=gijon_mejor_director",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=gijon_mejor_actriz",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=gijon_mejor_actor",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=gijon_premio_distribucion",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=gijon_premio_especial_jurado",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=gijon_special_jury_mention",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=gijon_mejor_guion",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=gijon_mejor_fotografia",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=gijon_mejor_banda_sonora",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=gijon_mejor_direccion_artistica",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=gijon_premio_fipresci",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=gijon_young_jury_prize_best_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=gijon_best_animation_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=gijon_audience_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=gijon_audience_award_tierres_en_trance",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=gijon_premio_europa_film_festivals",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=gijon_premio_enfants_terribles",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=gijon_best_short",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=gijon_best_short_special_mention",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_best_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_special_jury_prize",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_best_director",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_best_Director_second_prize",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_best_actor",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_best_actor_second_prize",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_best_actress",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_interpretation",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_most_promising_actor",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_special_mention",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_best_screenplay",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_best_cinematography",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_best_bw_cinematography",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_best_color_cinematography",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_cineasti_del_presente_best_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_cineasti_del_presente_special_jury_prize",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_cineasti_del_presente_emerging_director",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_cineasti_del_presente_best_actress",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_cineasti_del_presente_best_actor",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_cineasti_del_presente_special_mention",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_arrangement_prize",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_audience_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_variety_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_pardo_verde",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_pardo_verde_special_mention",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_pardo_verde_special_mention_short",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_funniest_film_prize",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_role_of_composition",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_best_first_feature",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_best_first_feature_mention",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_swiss_federation",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_fipresci",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_europa_cinemas_label_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_ecumenical_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_ecumenical_mention",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_best_short_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=locarno_best_animation_short_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_grand_jury_drama",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_grand_jury_documentary",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_grand_jury_world_dramatic",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_grand_jury_world_doc",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_best_ensemble",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_audience_dramatic",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_audience_documentary",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_world_audience_dramatic",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_world_audience_doc",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_directing_award_dramatic",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_directing_award_documentary",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_directing_world_dramatic",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_directing_world_documentary",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_screenwriting_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_screenwriting_world_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_cinematography_dramatic",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_cinematography_documentary",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_cinematography_world_dramatic",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_cinematography_world_doc",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_documentary_editing",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_documentary_world_editing",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_special_jury_world_dramatic",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_special_jury_world_doc",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_special_jury_dramatic",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_special_jury_documentary",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_special_jury_world_acting",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_special_jury_world_breakout",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_special_jury_us_breakthrough",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_special_jury_breakthrough_director",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_special_jury_world_music",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_special_jury_us_doc_sound",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_special_jury_world_doc_craft",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_special_jury_cinematic_innovation",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_short_film_grand_jury",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_festival_favorite_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_alfred_p_sloan_prize",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sundance_special_jury_neorealism",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_best_us_narrative_feature",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_best_actress_us_narrative",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_best_actor_us_narrative",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_best_screenplay_us",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_best_cinematography_us",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_best_narrative_feature",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_special_jury_mention",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_Best_actress",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_best_actor",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_best_performance",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_best_screenplay",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_best_cinematography",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_best_editing",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_best_new_director",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_best_international_narrative_feature",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_best_screenplay_international",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_best_cinematography_international",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_best_actress_international_narrative",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_best_actor_international_narrative",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_best_performance_international",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_best_documentary_feature",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_special_jury_mention_doc",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_best_new_documentary_director",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_best_cinematography_doc",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_best_editing_doc",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_best_documentary_short",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_student_visionary_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_best_narrative_short",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_best_animated_short",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_best_international_narrative_short",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_best_international_animated_short",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=tribeca_best_music_video",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_best_picture",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_silver_spike",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_jury_prize",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_special_mention",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_best_director",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_best_actor",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_best_actress",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_best_new_director",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_best_script",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_best_music",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_best_cinematographer",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_best_editing",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_fipresci",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_best_short_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_silver_spike_short_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_best_european_short_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_audience_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_best_iberoamerican_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_young_seminci",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_sociograph_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_best_picture",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_silver_spike",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_jury_prize",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_special_mention",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_best_director",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_best_actor",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_best_actress",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_best_new_director",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_best_script",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_best_music",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_best_cinematographer",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_best_editing",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_fipresci",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_best_short_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_silver_spike_short_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_best_european_short_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_audience_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_best_iberoamerican_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_young_seminci",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=seminci_sociograph_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=malaga_best_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=malaga_special_jury_prize",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=malaga_best_latin_american_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=malaga_best_direction",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=malaga_best_actor",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=malaga_best_actress",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=malaga_best_supporting_actor",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=malaga_best_supporting_actress",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=malaga_best_screenplay",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=malaga_best_new_screenplay",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=malaga_best_soundtrack",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=malaga_best_cinematography",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=malaga_best_editing",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=malaga_best_costume_design",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=malaga_best_make_up",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=malaga_special_mention_jury",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=malaga_critics_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=malaga_audience_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=malaga_best_shortfilm",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=malaga_best_shortfilm_mention",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=malaga_best_shortfilm_direction",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=malaga_best_shortfilm_actress",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=malaga_best_shortfilm_actor",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=malaga_best_shortfilm_audience",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sevilla_best_gold_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sevilla_best_silver_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sevilla_special_jury_prize",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sevilla_best_direction",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sevilla_best_actor",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sevilla_best_actress",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sevilla_best_screenplay",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sevilla_best_cinematography",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sevilla_best_editing",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sevilla_best_sound",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sevilla_special_mention",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sevilla_best_first_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sevilla_audience_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sevilla_best_documentary",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sevilla_special_mention_doc",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sevilla_critics_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sevilla_campus_jury_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sevilla_eurimages_prize",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=sevilla_rtva_prize",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=chicago_gold_hugo",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=chicago_gold_hugo_foreign_language_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=chicago_silver_hugo_jury_prize",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=chicago_silver_hugo_best_director",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=chicago_silver_hugo_best_screenplay",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=chicago_silver_hugo_best_performance",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=chicago_silver_hugo_best_actor",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=chicago_silver_hugo_best_actress",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=chicago_silver_hugo_best_ensemble_performance",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=chicago_silver_hugo_best_cinematography",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=chicago_honorable_mention",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=chicago_silver_hugo_best_sound",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=chicago_silver_hugo_best_art_direction",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=chicago_best_short_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=chicago_best_documentary",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=hamptons_best_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=hamptons_honorable_mention_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=hamptons_best_documentary",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=hamptons_honorable_mention_documentary",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=hamptons_narrative_jury_winner",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=hamptons_documentary_jury_winner",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=hamptons_special_recognition_breakthrough",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=hamptons_award_films_conflict_resolution",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=hamptons_best_short_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=hamptons_best_short_documentary",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=hamptons_audience_award_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=hamptons_audience_award_documentary",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=hamptons_audience_award_short_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=rotterdam_best_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=rotterdam_special_jury_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=rotterdam_fipresci",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=rotterdam_netpac_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=rotterdam_young_jury_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=rotterdam_best_short_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=rotterdam_vpro_big_screen_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=habana_best_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=habana_special_jury_prize",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=habana_best_director",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=habana_second_prize",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=habana_third_prize",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=habana_special_mention",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=habana_best_actress",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=habana_best_actor",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=habana_best_screenplay",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=habana_best_cinematography",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=habana_best_art_direction",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=habana_best_editing",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=habana_best_music",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=habana_best_film_music",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=habana_best_sound_mixing",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=habana_best_short_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=habana_best_debut",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=habana_best_documentary_feature",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=habana_best_documentary_short",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=habana_best_documentary_special_jury",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=habana_best_animated_feature",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=habana_best_animated_short",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=habana_best_animated_special_jury",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=habana_fipresci",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=habana_audience_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=habana_signis_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=habana_best_debut_special_prize",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=habana_best_debut_artistic_contribution",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=montreal_best_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=montreal_special_grand_prix",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=montreal_best_director",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=montreal_best_actress",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=montreal_best_actor",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=montreal_best_screenplay",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=montreal_best_artistic_contribution",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=montreal_innovation_award",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=montreal_special_prix",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=montreal_jury_prize",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=montreal_best_canadian_film",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=montreal_grand_prix_special",
"https://www.filmaffinity.com/en/awards-history.php?cat-id=montreal_most_popular_film",
]


def remove_text_in_parentheses(text):
    # Remove text within parentheses
    text = re.sub(r'\([^)]*\)', '', text)
    # Remove text within single quotes
    text = re.sub(r"'[^']*'", '', text)
    # Remove text within double quotes
    text = re.sub(r'"[^"]*"', '', text)
    return text.strip()

def get_category_and_city_from_url(url):
    match = re.search(r'cat-id=([a-zA-Z]+)_([a-zA-Z_]+)', url)
    if match:
        city = match.group(1).capitalize()
        category = match.group(2).replace("_", " ").title()
        return city, category
    else:
        return None, None

def scrape_winners(urls):
    all_winners = []
    
    for url in urls:
        city, category = get_category_and_city_from_url(url)
        
        try:
            response = requests.get(url)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {url}, {e}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        wrapper = soup.find('div', {'class': 'wrapper'})

        if wrapper is None:
            print(f"Error: Couldn't find a wrapper with class 'wrapper' for URL: {url}")
            continue

        list_items = wrapper.find_all('li', class_=lambda x: x != 'empty-box')

        for item in list_items:
            year_elem = item.find('div', {'class': 'year'})
            movie_title_elem = item.find('span', {'class': 'movie-title-link'})
            director_elem = item.find('div', {'class': 'director'})
            cast_elem = item.find('div', {'class': 'cast'})
            
            if year_elem and movie_title_elem and director_elem and cast_elem:
                year = year_elem.text.strip()
                movie_title = movie_title_elem.text.strip()
                movie_title = remove_text_in_parentheses(movie_title)
                director = director_elem.text.strip()
                cast = cast_elem.text.strip()
                all_winners.append([city, category, year, movie_title, director, cast])

    return all_winners


# Scrape winners from the URLs
winners = scrape_winners(urls)

# Write the winners to a CSV file
with open('all_film_affinity_winners.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['City', 'Category', 'Year', 'Movie Title', 'Director', 'Cast'])

    for winner in winners:
        writer.writerow(winner)

Error fetching URL: https://www.filmaffinity.com/en/awards-history.php?cat-id=moscow_best_actress, 429 Client Error: Too Many Requests for url: https://www.filmaffinity.com/en/awards-history.php?cat-id=moscow_best_actress
Error fetching URL: https://www.filmaffinity.com/en/awards-history.php?cat-id=moscow_special_mention, 429 Client Error: Too Many Requests for url: https://www.filmaffinity.com/en/awards-history.php?cat-id=moscow_special_mention
Error fetching URL: https://www.filmaffinity.com/en/awards-history.php?cat-id=moscow_best_documentary, 429 Client Error: Too Many Requests for url: https://www.filmaffinity.com/en/awards-history.php?cat-id=moscow_best_documentary
Error fetching URL: https://www.filmaffinity.com/en/awards-history.php?cat-id=moscow_special_prize, 429 Client Error: Too Many Requests for url: https://www.filmaffinity.com/en/awards-history.php?cat-id=moscow_special_prize
Error fetching URL: https://www.filmaffinity.com/en/awards-history.php?cat-id=moscow_audience_aw

KeyboardInterrupt: 

In [35]:
import requests
from bs4 import BeautifulSoup
import csv
import re

# Define the categories you want to scrape
categories = [
    "choice_award",
    "choice_award_doc",
    "best_canadian_film",
    "best_canadian_film_citation",
    "best_canadian_first_film",
    "best_female_director",
    "choice_award_mad",
    "platform_winner",
    "platform_mention",
    "fipresci",
    "discovery_award",
    "netpac_award",
    "changemaker_award"
]

def remove_text_in_parentheses(text):
    # Remove text within parentheses
    text = re.sub(r'\([^)]*\)', '', text)
    # Remove text within single quotes
    text = re.sub(r"'[^']*'", '', text)
    # Remove text within double quotes
    text = re.sub(r'"[^"]*"', '', text)
    return text.strip()

# Function to scrape the winners of a specific category
def scrape_winners(cat_id):
    url = f"https://www.filmaffinity.com/en/awards-history.php?cat-id=toronto_{cat_id}"
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL for category {cat_id}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the wrapper with the awards
    wrapper = soup.find('div', {'class': 'wrapper'})

    # Check if the wrapper was found
    if wrapper is None:
        print(f"Error: Couldn't find a wrapper with class 'wrapper' for category {cat_id}")
        return []

    # Find all the list items in the wrapper
    list_items = wrapper.find_all('li', class_=lambda x: x != 'empty-box')

    # Extract the information from each list item
    winners = []
    for item in list_items:
        year_elem = item.find('div', {'class': 'year'})
        movie_title_elem = item.find('span', {'class': 'movie-title-link'})
        director_elem = item.find('div', {'class': 'director'})
        cast_elem = item.find('div', {'class': 'cast'})
        
        if year_elem and movie_title_elem and director_elem and cast_elem:
            year = year_elem.text.strip()
            movie_title = movie_title_elem.text.strip()
            # Remove text within parentheses and quotation marks from the movie title
            movie_title = remove_text_in_parentheses(movie_title)
            director = director_elem.text.strip()
            cast = cast_elem.text.strip()
            winners.append([year, movie_title, director, cast])

    return winners

# Loop through all the categories and scrape the winners
with open('toronto_film_affinity_winners.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Category', 'Year', 'Movie Title', 'Director', 'Cast'])

    for cat_id in categories:
        winners = scrape_winners(cat_id)
        for winner in winners:
            writer.writerow([cat_id.capitalize().replace("_", " "), *winner])


In [42]:
import requests
from bs4 import BeautifulSoup
import csv
import re

# Define the categories you want to scrape
categories = [
    "golden_lion",
    "mussolini_cup",
    "grand_jury_prize",
    "silver_lion",
    "special_jury_prize",
    "best_actress",
    "best_actor",
    "best_supporting_actress",
    "best_supporting_actor",
    "best_screenplay",
    "best_cinematography",
    "best_technical",
    "new_young_actor",
    "fipresci",
    "orizzonti_best_film",
    "orizzonti_extra_best_film",
    "orizzonti_best_direction",
    "orizzonti_best_screenplay",
    "orizzonti_best_actor",
    "orizzonti_best_actress",
    "audience_award",
    "signis_award",
    "ocic_award",
    "unesco",
    "fedic_award",
    "special_lion",
    "best_first_work",
    "best_ensemble_cast",
    "best_foreign_film",
    "best_italian_film",
    "best_short_film"
]

def remove_text_in_parentheses(text):
    # Remove text within parentheses
    text = re.sub(r'\([^)]*\)', '', text)
    # Remove text within single quotes
    text = re.sub(r"'[^']*'", '', text)
    # Remove text within double quotes
    text = re.sub(r'"[^"]*"', '', text)
    return text.strip()

# Function to scrape the winners of a specific category
def scrape_winners(cat_id):
    url = f"https://www.filmaffinity.com/en/awards-history.php?cat-id=venice_{cat_id}"
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL for category {cat_id}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the wrapper with the awards
    wrapper = soup.find('div', {'class': 'wrapper'})

    # Check if the wrapper was found
    if wrapper is None:
        print(f"Error: Couldn't find a wrapper with class 'wrapper' for category {cat_id}")
        return []

    # Find all the list items in the wrapper
    list_items = wrapper.find_all('li', class_=lambda x: x != 'empty-box')

    # Extract the information from each list item
    winners = []
    for item in list_items:
        year_elem = item.find('div', {'class': 'year'})
        movie_title_elem = item.find('span', {'class': 'movie-title-link'})
        director_elem = item.find('div', {'class': 'director'})
        cast_elem = item.find('div', {'class': 'cast'})
        
        if year_elem and movie_title_elem and director_elem and cast_elem:
            year = year_elem.text.strip()
            movie_title = movie_title_elem.text.strip()
            # Remove text within parentheses and quotation marks from the movie title
            movie_title = remove_text_in_parentheses(movie_title)
            director = director_elem.text.strip()
            cast = cast_elem.text.strip()
            winners.append([year, movie_title, director, cast])

    return winners

# Loop through all the categories and scrape the winners
with open('venice_film_affinity_winners.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Category', 'Year', 'Movie Title', 'Director', 'Cast'])

    for cat_id in categories:
        winners = scrape_winners(cat_id)
        for winner in winners:
            writer.writerow([cat_id.capitalize().replace("_", " "), *winner])

In [37]:
import requests
from bs4 import BeautifulSoup
import csv
import re

# Define the categories you want to scrape
categories = [
    "golden_bear",
    "honourable_mention",
    "grand_jury_prize",
    "special_jury_prize",
    "special_prize",
    "best_director",
    "best_leading_performance",
    "best_actress",
    "best_acting_team",
    "best_screenplay",
    "silver_bear",
    "best_actor",
    "jury_prize",
    "outstanding_single_archievement",
    "art_contribution",
    "best_film_music",
    "bronze_bear",
    "best_debut",
    "fipresci",
    "fipresci_prize_honourable",
    "ocic_award",
    "ocic_award_honourable",
    "ocic_award_recommendation",
    "alfred_bauer_prize",
    "blue_angel_award",
    "piper_heidsieck",
    "piper_heidsieck_actress",
    "cicae_award",
    "cidalc_award",
    "interfilm_award",
    "interfilm_otto_award",
    "interfilm_award_recommendation",
    "interfilm_award_honorable",
    "unicrit_award",
    "unicrit_award_honorable",
    "special_mention",
    "big_gold_medal",
    "big_silver_medal",
    "big_bronze_medal",
    "panorama_audience_film"
    "panorama_audience_doc",
    "generation_kplus_grand_prix",
    "generation_kplus_special_prize_short",
    "generation_kplus_crystal_bear",
    "international_best_short_film",
    "short_film_jury_prize",
    "best_documentary",
    "documentary_special_mention"
]

def remove_text_in_parentheses(text):
    # Remove text within parentheses
    text = re.sub(r'\([^)]*\)', '', text)
    # Remove text within single quotes
    text = re.sub(r"'[^']*'", '', text)
    # Remove text within double quotes
    text = re.sub(r'"[^"]*"', '', text)
    return text.strip()

# Function to scrape the winners of a specific category
def scrape_winners(cat_id):
    url = f"https://www.filmaffinity.com/en/awards-history.php?cat-id=berlin_{cat_id}"
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL for category {cat_id}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the wrapper with the awards
    wrapper = soup.find('div', {'class': 'wrapper'})

    # Check if the wrapper was found
    if wrapper is None:
        print(f"Error: Couldn't find a wrapper with class 'wrapper' for category {cat_id}")
        return []

    # Find all the list items in the wrapper
    list_items = wrapper.find_all('li', class_=lambda x: x != 'empty-box')

    # Extract the information from each list item
    winners = []
    for item in list_items:
        year_elem = item.find('div', {'class': 'year'})
        movie_title_elem = item.find('span', {'class': 'movie-title-link'})
        director_elem = item.find('div', {'class': 'director'})
        cast_elem = item.find('div', {'class': 'cast'})
        
        if year_elem and movie_title_elem and director_elem and cast_elem:
            year = year_elem.text.strip()
            movie_title = movie_title_elem.text.strip()
            # Remove text within parentheses and quotation marks from the movie title
            movie_title = remove_text_in_parentheses(movie_title)
            director = director_elem.text.strip()
            cast = cast_elem.text.strip()
            winners.append([year, movie_title, director, cast])

    return winners

# Loop through all the categories and scrape the winners
with open('berlin_film_affinity_winners.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Category', 'Year', 'Movie Title', 'Director', 'Cast'])

    for cat_id in categories:
        winners = scrape_winners(cat_id)
        for winner in winners:
            writer.writerow([cat_id.capitalize().replace("_", " "), *winner])

In [31]:
import requests
from bs4 import BeautifulSoup
import csv
import re

# Define the categories you want to scrape
categories = [
    "best_feature",
    "special_distinction",
    "best_film_contrechamp",
    "audience_award",
    "cristal_best_short_film",
    "special_jury_award",
    "best_first_film",
    "special_distinction_short_film"
    "best_music",
    "audience_award_short_film",
    "fipresci",
    "junior_jury_award_short_film",
    "off-limits_award",
    "best_tv_production",
    "best_graduation_film",
    "best_commissioned_film",
    "city_of_annecy_award",
    "paul_grimault_award",
    "gan_foundation"
]

def remove_text_in_parentheses(text):
    # Remove text within parentheses
    text = re.sub(r'\([^)]*\)', '', text)
    # Remove text within single quotes
    text = re.sub(r"'[^']*'", '', text)
    # Remove text within double quotes
    text = re.sub(r'"[^"]*"', '', text)
    return text.strip()

# Function to scrape the winners of a specific category
def scrape_winners(cat_id):
    url = f"https://www.filmaffinity.com/en/awards-history.php?cat-id=annecy_{cat_id}"
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL for category {cat_id}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the wrapper with the awards
    wrapper = soup.find('div', {'class': 'wrapper'})

    # Check if the wrapper was found
    if wrapper is None:
        print(f"Error: Couldn't find a wrapper with class 'wrapper' for category {cat_id}")
        return []

    # Find all the list items in the wrapper
    list_items = wrapper.find_all('li', class_=lambda x: x != 'empty-box')

    # Extract the information from each list item
     winners = []
    for item in list_items:
        year_elem = item.find('div', {'class': 'year'})
        movie_title_elem = item.find('span', {'class': 'movie-title-link'})
        director_elem = item.find('div', {'class': 'director'})
        cast_elem = item.find('div', {'class': 'cast'})
        
        if year_elem and movie_title_elem and director_elem and cast_elem:
            year = year_elem.text.strip()
            movie_title = movie_title_elem.text.strip()
            # Remove text within parentheses and quotation marks from the movie title
            movie_title = remove_text_in_parentheses(movie_title)
            director = director_elem.text.strip()
            cast = cast_elem.text.strip()
            winners.append([year, movie_title, director, cast])

    return winners

# Loop through all the categories and scrape the winners
with open('annecy_film_affinity_winners.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Category', 'Year', 'Movie Title', 'Director', 'Cast'])

    for cat_id in categories:
        winners = scrape_winners(cat_id)
        for winner in winners:
            writer.writerow([cat_id.capitalize().replace("_", " "), *winner])

In [38]:
import requests
from bs4 import BeautifulSoup
import csv
import re

# Define the categories you want to scrape
categories = [
    "best_picture",
    "special_jury_award",
    "best_director",
    "best_actor",
    "best_actress",
    "best_screenplay",
    "new_director",
    "best_cinematography",
    "best_production_design",
    "best_special_effects",
    "best_make_up_fx",
    "best_ost",
    "special_mention",
    "audience_award",
    "fipresci",
    "best_european_film",
    "best_film_panorama",
    "best_animated_feature",
    "best_animated_kids",
    "best_short_film_animated",
    "best_short_film",
    "best_new_director",
    "best_film_focus_asia",
    "best_film_novesvisions",
    "best_film_novesvisions_one",
    "best_director_novesvisions",
    "special_mention_noves_visions",
    "best_short_film_novesvisions",
    "best_film_novesvisions_plus",
    "best_film_midnight",
    "best_film_orbita",
    "serial_sitges",
    "best_film_silver_melies",
    "best_film_blood_window",
    "best_short_film_silver_melies",
    "best_short_film_brigadoon",
    "sgae_nova_autoria_best_direction",
    "sgae_nova_autoria_best_screenplay",
    "sgae_nova_autoria_best_music",
    "best_film_jove_juryaward"
]

def remove_text_in_parentheses(text):
    # Remove text within parentheses
    text = re.sub(r'\([^)]*\)', '', text)
    # Remove text within single quotes
    text = re.sub(r"'[^']*'", '', text)
    # Remove text within double quotes
    text = re.sub(r'"[^"]*"', '', text)
    return text.strip()

# Function to scrape the winners of a specific category
def scrape_winners(cat_id):
    url = f"https://www.filmaffinity.com/en/awards-history.php?cat-id=sitges_{cat_id}"
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL for category {cat_id}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the wrapper with the awards
    wrapper = soup.find('div', {'class': 'wrapper'})

    # Check if the wrapper was found
    if wrapper is None:
        print(f"Error: Couldn't find a wrapper with class 'wrapper' for category {cat_id}")
        return []

    # Find all the list items in the wrapper
    list_items = wrapper.find_all('li', class_=lambda x: x != 'empty-box')

    # Extract the information from each list item
    winners = []
    for item in list_items:
        year_elem = item.find('div', {'class': 'year'})
        movie_title_elem = item.find('span', {'class': 'movie-title-link'})
        director_elem = item.find('div', {'class': 'director'})
        cast_elem = item.find('div', {'class': 'cast'})
        
        if year_elem and movie_title_elem and director_elem and cast_elem:
            year = year_elem.text.strip()
            movie_title = movie_title_elem.text.strip()
            # Remove text within parentheses and quotation marks from the movie title
            movie_title = remove_text_in_parentheses(movie_title)
            director = director_elem.text.strip()
            cast = cast_elem.text.strip()
            winners.append([year, movie_title, director, cast])

    return winners

# Loop through all the categories and scrape the winners
with open('sitges_film_affinity_winners.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Category', 'Year', 'Movie Title', 'Director', 'Cast'])

    for cat_id in categories:
        winners = scrape_winners(cat_id)
        for winner in winners:
            writer.writerow([cat_id.capitalize().replace("_", " "), *winner])

In [39]:
import requests
from bs4 import BeautifulSoup
import csv
import re

# Define the categories you want to scrape
categories = [
    "best_feature",
    "special_jury_prize",
    "best_director",
    "best_screenplay",
    "best_actor",
    "best_actress",
    "audience_award",
    "best_artistic_contribution"
]

def remove_text_in_parentheses(text):
    # Remove text within parentheses
    text = re.sub(r'\([^)]*\)', '', text)
    # Remove text within single quotes
    text = re.sub(r"'[^']*'", '', text)
    # Remove text within double quotes
    text = re.sub(r'"[^"]*"', '', text)
    return text.strip()

# Function to scrape the winners of a specific category
def scrape_winners(cat_id):
    url = f"https://www.filmaffinity.com/en/awards-history.php?cat-id=kyoto_{cat_id}"
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL for category {cat_id}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the wrapper with the awards
    wrapper = soup.find('div', {'class': 'wrapper'})

    # Check if the wrapper was found
    if wrapper is None:
        print(f"Error: Couldn't find a wrapper with class 'wrapper' for category {cat_id}")
        return []

    # Find all the list items in the wrapper
    list_items = wrapper.find_all('li', class_=lambda x: x != 'empty-box')

    # Extract the information from each list item
    winners = []
    for item in list_items:
        year_elem = item.find('div', {'class': 'year'})
        movie_title_elem = item.find('span', {'class': 'movie-title-link'})
        director_elem = item.find('div', {'class': 'director'})
        cast_elem = item.find('div', {'class': 'cast'})
        
        if year_elem and movie_title_elem and director_elem and cast_elem:
            year = year_elem.text.strip()
            movie_title = movie_title_elem.text.strip()
            # Remove text within parentheses and quotation marks from the movie title
            movie_title = remove_text_in_parentheses(movie_title)
            director = director_elem.text.strip()
            cast = cast_elem.text.strip()
            winners.append([year, movie_title, director, cast])

    return winners

# Loop through all the categories and scrape the winners
with open('tokyo_film_affinity_winners.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Category', 'Year', 'Movie Title', 'Director', 'Cast'])

    for cat_id in categories:
        winners = scrape_winners(cat_id)
        for winner in winners:
            writer.writerow([cat_id.capitalize().replace("_", " "), *winner])

In [40]:
import requests
from bs4 import BeautifulSoup
import csv
import re


# Define the categories you want to scrape
categories = [
    "best_film",
    "special_jury_award",
    "special_mention",
    "best_director",
    "best_performance",
    "best_actor",
    "best_actress",
    "best_screenplay",
    "best_iberoamerican",
    "iberoamerican_mention",
    "special_jury_mention",
    "iberoamerican_special_jury_award",
    "best_argentinian_film",
    "best_argentinian_director",
    "argentinian_mention",
    "fipresci",
    "audience_award",
    "best_short_film",
    "best_argentinian_short_film"
]

def remove_text_in_parentheses(text):
    # Remove text within parentheses
    text = re.sub(r'\([^)]*\)', '', text)
    # Remove text within single quotes
    text = re.sub(r"'[^']*'", '', text)
    # Remove text within double quotes
    text = re.sub(r'"[^"]*"', '', text)
    return text.strip()

# Function to scrape the winners of a specific category
def scrape_winners(cat_id):
    url = f"https://www.filmaffinity.com/en/awards-history.php?cat-id=mar_{cat_id}"
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL for category {cat_id}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the wrapper with the awards
    wrapper = soup.find('div', {'class': 'wrapper'})

    # Check if the wrapper was found
    if wrapper is None:
        print(f"Error: Couldn't find a wrapper with class 'wrapper' for category {cat_id}")
        return []

    # Find all the list items in the wrapper
    list_items = wrapper.find_all('li', class_=lambda x: x != 'empty-box')

    # Extract the information from each list item
    winners = []
    for item in list_items:
        year_elem = item.find('div', {'class': 'year'})
        movie_title_elem = item.find('span', {'class': 'movie-title-link'})
        director_elem = item.find('div', {'class': 'director'})
        cast_elem = item.find('div', {'class': 'cast'})
        
        if year_elem and movie_title_elem and director_elem and cast_elem:
            year = year_elem.text.strip()
            movie_title = movie_title_elem.text.strip()
            # Remove text within parentheses and quotation marks from the movie title
            movie_title = remove_text_in_parentheses(movie_title)
            director = director_elem.text.strip()
            cast = cast_elem.text.strip()
            winners.append([year, movie_title, director, cast])

    return winners

# Loop through all the categories and scrape the winners
with open('mar_film_affinity_winners.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Category', 'Year', 'Movie Title', 'Director', 'Cast'])

    for cat_id in categories:
        winners = scrape_winners(cat_id)
        for winner in winners:
            writer.writerow([cat_id.capitalize().replace("_", " "), *winner])

In [41]:
import requests
from bs4 import BeautifulSoup
import csv
import re


# Define the categories you want to scrape
categories = [
    "best_film",
    "second_best_film",
    "jury_grand_prix",
    "special_jury_award",
    "jury_prix",
    "best_director",
    "best_actor",
    "best_actress",
    "best_screenplay",
    "best_cinematography",
    "best_music",
    "best_technology",
    "artistic_achievement",
    "best_animated_film",
    "best_documentary",
    "best_short_film",
    "best_animated_short_film"
]

def remove_text_in_parentheses(text):
    # Remove text within parentheses
    text = re.sub(r'\([^)]*\)', '', text)
    # Remove text within single quotes
    text = re.sub(r"'[^']*'", '', text)
    # Remove text within double quotes
    text = re.sub(r'"[^"]*"', '', text)
    return text.strip()

# Function to scrape the winners of a specific category
def scrape_winners(cat_id):
    url = f"https://www.filmaffinity.com/en/awards-history.php?cat-id=shanghai_{cat_id}"
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL for category {cat_id}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the wrapper with the awards
    wrapper = soup.find('div', {'class': 'wrapper'})

    # Check if the wrapper was found
    if wrapper is None:
        print(f"Error: Couldn't find a wrapper with class 'wrapper' for category {cat_id}")
        return []

    # Find all the list items in the wrapper
    list_items = wrapper.find_all('li', class_=lambda x: x != 'empty-box')

    # Extract the information from each list item
    winners = []
    for item in list_items:
        year_elem = item.find('div', {'class': 'year'})
        movie_title_elem = item.find('span', {'class': 'movie-title-link'})
        director_elem = item.find('div', {'class': 'director'})
        cast_elem = item.find('div', {'class': 'cast'})
        
        if year_elem and movie_title_elem and director_elem and cast_elem:
            year = year_elem.text.strip()
            movie_title = movie_title_elem.text.strip()
            # Remove text within parentheses and quotation marks from the movie title
            movie_title = remove_text_in_parentheses(movie_title)
            director = director_elem.text.strip()
            cast = cast_elem.text.strip()
            winners.append([year, movie_title, director, cast])

    return winners

# Loop through all the categories and scrape the winners
with open('shanghai_film_affinity_winners.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Category', 'Year', 'Movie Title', 'Director', 'Cast'])

    for cat_id in categories:
        winners = scrape_winners(cat_id)
        for winner in winners:
            writer.writerow([cat_id.capitalize().replace("_", " "), *winner])

In [12]:
file_path = "C:/Users/frivo/Downloads/title.akas.tsv/title.akas.tsv"

# Open the file in read mode
with open(file_path, 'r', encoding='utf-8') as file:
    # Iterate over the first 10 lines in the file
    for line_number, line in enumerate(file):
        if line_number >= 10:
            break
        # Print each line
        print(line.strip())


titleId	ordering	title	region	language	types	attributes	isOriginalTitle
tt0000001	1	Carmencita	\N	\N	original	\N	1
tt0000001	2	Carmencita	DE	\N	\N	literal title	0
tt0000001	3	Carmencita	US	\N	imdbDisplay	\N	0
tt0000001	4	Carmencita - spanyol tánc	HU	\N	imdbDisplay	\N	0
tt0000001	5	Καρμενσίτα	GR	\N	imdbDisplay	\N	0
tt0000001	6	Карменсита	RU	\N	imdbDisplay	\N	0
tt0000001	7	Карменсіта	UA	\N	imdbDisplay	\N	0
tt0000001	8	カルメンチータ	JP	ja	imdbDisplay	\N	0
tt0000002	1	Le clown et ses chiens	\N	\N	original	\N	1


In [14]:
import pandas as pd

# List of CSV file paths
csv_files = [
    "C:/Users/frivo/Documents/csv coding project/annecy_film_affinity_winners.csv",
    "C:/Users/frivo/Documents/csv coding project/berlin_film_affinity_winners.csv",
    "C:/Users/frivo/Documents/csv coding project/cannes_film_affinity_winners.csv",
    "C:/Users/frivo/Documents/csv coding project/mar_film_affinity_winners.csv",
    "C:/Users/frivo/Documents/csv coding project/shanghai_film_affinity_winners.csv",
    "C:/Users/frivo/Documents/csv coding project/stiges_film_affinity_winners.csv",
    "C:/Users/frivo/Documents/csv coding project/tokyo_film_affinity_winners.csv",
    "C:/Users/frivo/Documents/csv coding project/toronto_film_affinity_winners.csv",
    "C:/Users/frivo/Documents/csv coding project/venice_film_affinity_winners.csv"
]

# TSV file path
tsv_file = "C:/Users/frivo/Downloads/title.akas.tsv/title.akas.tsv"

# Load TSV file
tsv_df = pd.read_csv(tsv_file, sep='\t')

# List to store lines from TSV file
tsv_lines = []

# Iterate over each CSV file
for csv_file in csv_files:
    # Load CSV file
    csv_df = pd.read_csv(csv_file)
    
    # Ensure column name consistency
    if 'Movie Title' in csv_df.columns:
        csv_df.rename(columns={'Movie Title': 'title'}, inplace=True)

    # Extract movie titles
    movie_titles = csv_df['title'].tolist()

    # Search for movie titles in TSV file
    for title in movie_titles:
        found_lines = tsv_df[tsv_df['title'] == title]
        if not found_lines.empty:
            tsv_lines.extend(found_lines.values.tolist())

# Create DataFrame from found lines
result_df = pd.DataFrame(tsv_lines, columns=tsv_df.columns)

# Write result to CSV file
result_df.to_csv('result.csv', index=False)


KeyboardInterrupt: 

In [43]:
import pandas as pd
import re

# CSV file path
csv_file = "C:/Users/frivo/Documents/csv coding project/annecy_film_affinity_winners.csv"

# TSV file path
tsv_file_path = "C:/Users/frivo/Downloads/title.akas.tsv/title.akas.tsv"

# Read all lines from the CSV file starting from the second line
with open(csv_file, 'r', encoding='utf-8') as file:
    csv_lines = file.readlines()[1:]

# Preprocess TSV file into a dictionary for faster lookups
tsv_lookup = {}
with open(tsv_file_path, 'r', encoding='utf-8') as tsv_file:
    for line in tsv_file:
        # Extract title from the TSV line
        movie_title_tsv = line.split('\t')[2].strip()  # Assuming the title is the third column
        
        # Remove spaces and special characters from the TSV title
        movie_title_tsv = re.sub(r'\W+', '', movie_title_tsv)
        
        # Remove (S) from the title if present
        movie_title_tsv = movie_title_tsv.replace('(S)', '')

        # Store line in the dictionary
        tsv_lookup[movie_title_tsv] = line.strip()

# List to store found lines
found_lines = []

# List to store line numbers not found in TSV file
lines_not_found = []

# Search for each line from the CSV file in the TSV dictionary
for line_num, csv_line in enumerate(csv_lines, start=2):  # Start from line 2
    if line_num % 10 == 0:  # Print line number every 10 lines
        print(f"Processing line {line_num}...")

    # Extract movie title from the CSV line
    movie_title_csv = csv_line.strip().split(',')[2].strip()

    # Remove spaces and special characters from movie title
    movie_title_csv = re.sub(r'\W+', '', movie_title_csv)
    
    # Remove (S) from the title if present
    movie_title_csv = movie_title_csv.replace('(S)', '')

    # Search for the movie title in the TSV dictionary
    tsv_line = tsv_lookup.get(movie_title_csv)
    if tsv_line:
        found_lines.append(tsv_line)
    else:
        lines_not_found.append((line_num, csv_line.strip()))
        print(f"Line {line_num}: '{csv_line.strip()}' not found in TSV file.")

# Write found lines to a CSV file
found_df = pd.DataFrame(found_lines, columns=['TSV Line'])
found_df.to_csv('found_lines_annecy.csv', index=False)

print("Total lines not found in TSV file:", len(lines_not_found))
print("Lines not found in TSV file:")
for line_num, line_content in lines_not_found:
    print(f"Line {line_num}: {line_content}")


Processing line 10...
Processing line 20...
Line 23: 'Best feature,2002,"My Beautiful Girl, Mari",Lee Seong-Kang,Animation' not found in TSV file.
Processing line 30...
Line 39: 'Special distinction,2013,"Ma maman est en Amérique, elle a rencontré Buffalo Bill","Marc Boreal, Thibaut Chatel","Animation,  Voice: Marc Lavoine, Julie Depardieu"' not found in TSV file.
Processing line 40...
Line 42: 'Special distinction,2010,Eleonore's Secret,Dominique Monfery,"Animation,  Voice: Jeanne Moreau, Julie Gayet ..."' not found in TSV file.
Processing line 50...
Processing line 60...
Line 64: 'Special jury award,2023,Scavengers Reign  Second Jury Award for a TV Series,"Joseph Bennett (Creator), Charles Huettner (Creator) ...","Animation,  Voice: Sunita Mani, Wunmi Mosaku ..."' not found in TSV file.
Line 67: 'Special jury award,2020,The Nose or the Conspiracy of Mavericks,Andrei Khrzhanovsky,Animation' not found in TSV file.
Processing line 70...
Processing line 80...
Processing line 90...
Proces

In [44]:
import pandas as pd

# CSV file path
csv_file = "C:/Users/frivo/Documents/csv coding project/berlin_film_affinity_winners.csv"

# TSV file path
tsv_file_path = "C:/Users/frivo/Downloads/title.akas.tsv/title.akas.tsv"

# Read all lines from the CSV file starting from the second line
with open(csv_file, 'r', encoding='utf-8') as file:
    csv_lines = file.readlines()[1:]

# Preprocess TSV file into a dictionary for faster lookups
tsv_lookup = {}
with open(tsv_file_path, 'r', encoding='utf-8') as tsv_file:
    for line in tsv_file:
        # Extract title from the TSV line
        movie_title_tsv = line.split('\t')[2].strip()  # Assuming the title is the third column

        # Remove spaces from the TSV title
        movie_title_tsv = movie_title_tsv.replace(' ', '')
        
        # Store line in the dictionary
        tsv_lookup[movie_title_tsv] = line.strip()

# List to store found lines
found_lines = []

# List to store line numbers not found in TSV file
lines_not_found = []

# Search for each line from the CSV file in the TSV dictionary
for line_num, csv_line in enumerate(csv_lines, start=2):  # Start from line 2
    if line_num % 10 == 0:  # Print line number every 10 lines
        print(f"Processing line {line_num}...")

    # Extract movie title from the CSV line
    movie_title_csv = csv_line.strip().split(',')[2].strip()

    # Remove spaces from movie title
    movie_title_csv = movie_title_csv.replace(' ', '')

    # Search for the movie title in the TSV dictionary
    tsv_line = tsv_lookup.get(movie_title_csv)
    if tsv_line:
        found_lines.append(tsv_line)
    else:
        lines_not_found.append((line_num, csv_line.strip()))
        print(f"Line {line_num}: '{csv_line.strip()}' not found in TSV file.")

# Write found lines to a CSV file
found_df = pd.DataFrame(found_lines, columns=['TSV Line'])
found_df.to_csv('found_lines_berlin.csv', index=False)

print("Total lines not found in TSV file:", len(lines_not_found))
print("Lines not found in TSV file:")
for line_num, line_content in lines_not_found:
    print(f"Line {line_num}: {line_content}")

Processing line 10...
Line 12: 'Golden bear,2014,"Black Coal, Thin Ice",Diao Yinan,"Liao Fan, Gwei Lun-Mei, Wang Xuebing ..."' not found in TSV file.
Processing line 20...
Line 21: 'Golden bear,2005,U-Carmen e-Khayelitsha,Mark Dornford-May,"Pauline Malefane, Andile Tshoni, Zweilungile Sidloyi ..."' not found in TSV file.
Processing line 30...
Line 39: 'Golden bear,1987,The Teme,Gleb Panfilov,"Mikhail Ulyanov, Inna Churikova, Stanislav Lyubshin ..."' not found in TSV file.
Processing line 40...
Line 45: 'Golden bear,1981,"Fast, Fast",Carlos Saura,"Berta Socuéllamos, José Antonio Valdelomar, Jesús Arias ..."' not found in TSV file.
Processing line 50...
Line 50: 'Golden bear,1976,"Buffalo Bill and the Indians, or Sitting Bull's History Lesson",Robert Altman,"Paul Newman, Burt Lancaster, Geraldine Chaplin ..."' not found in TSV file.
Processing line 60...
Processing line 70...
Processing line 80...
Line 83: 'Honourable mention,1991,Li Lianying: The Imperial Eunuch,Tian Zhuangzhuang,"Jiang

In [45]:
import pandas as pd
import re

# CSV file path
csv_file = "C:/Users/frivo/Documents/csv coding project/cannes_film_affinity_winners.csv"

# TSV file path
tsv_file_path = "C:/Users/frivo/Downloads/title.akas.tsv/title.akas.tsv"

# Read all lines from the CSV file starting from the second line
with open(csv_file, 'r', encoding='utf-8') as file:
    csv_lines = file.readlines()[1:]

# Preprocess TSV file into a dictionary for faster lookups
tsv_lookup = {}
with open(tsv_file_path, 'r', encoding='utf-8') as tsv_file:
    for line in tsv_file:
        # Extract title from the TSV line
        movie_title_tsv = line.split('\t')[2].strip()  # Assuming the title is the third column
        
        # Remove non-alphanumeric characters from the title
        movie_title_tsv = re.sub(r'[^a-zA-Z0-9]', '', movie_title_tsv)

        # Store line in the dictionary
        tsv_lookup[movie_title_tsv] = line.strip()

# List to store found lines
found_lines = []

# List to store line numbers not found in TSV file
lines_not_found = []

# Search for each line from the CSV file in the TSV dictionary
for line_num, csv_line in enumerate(csv_lines, start=2):  # Start from line 2
    if line_num % 10 == 0:  # Print line number every 10 lines
        print(f"Processing line {line_num}...")

    # Extract movie title from the CSV line
    movie_title_csv = csv_line.strip().split(',')[2].strip()

    # Remove non-alphanumeric characters from the title
    movie_title_csv = re.sub(r'[^a-zA-Z0-9]', '', movie_title_csv)

    # Search for the movie title in the TSV dictionary
    tsv_line = tsv_lookup.get(movie_title_csv)
    if tsv_line:
        found_lines.append(tsv_line)
    else:
        lines_not_found.append((line_num, csv_line.strip()))
        print(f"Line {line_num}: '{csv_line.strip()}' not found in TSV file.")

# Write found lines to a CSV file
found_df = pd.DataFrame(found_lines, columns=['TSV Line'])
found_df.to_csv('found_lines_cannes.csv', index=False)

print("Total lines not found in TSV file:", len(lines_not_found))
print("Lines not found in TSV file:")
for line_num, line_content in lines_not_found:
    print(f"Line {line_num}: {line_content}")


Processing line 10...
Line 17: 'Golden palm,2007,"4 Months, 3 Weeks & 2 Days",Cristian Mungiu,"Anamaria Marinca, Vlad Ivanov, Laura Vasiliu, Alexandru Potocean"' not found in TSV file.
Processing line 20...
Processing line 30...
Processing line 40...
Processing line 50...
Processing line 60...
Line 65: 'Golden palm,1958,The Cranes are Flying,Mikhail Kalatozov,"Tatyana Samojlova, Aleksey Batalov, Vasiliy Merkurev ..."' not found in TSV file.
Processing line 70...
Processing line 80...
Processing line 90...
Processing line 100...
Processing line 110...
Processing line 120...
Processing line 130...
Processing line 140...
Processing line 150...
Processing line 160...
Processing line 170...
Processing line 180...
Processing line 190...
Processing line 200...
Processing line 210...
Line 214: 'Best actor,2005,The Three Burials of Melquíades Estrada,Tommy Lee Jones,"Tommy Lee Jones, Barry Pepper, Julio Cedillo ..."' not found in TSV file.
Processing line 220...
Processing line 230...
Processin

In [46]:
import pandas as pd
import re

# CSV file path
csv_file = "C:/Users/frivo/Documents/csv coding project/mar_film_affinity_winners.csv"

# TSV file path
tsv_file_path = "C:/Users/frivo/Downloads/title.akas.tsv/title.akas.tsv"

# Read all lines from the CSV file starting from the second line
# Read all lines from the CSV file starting from the second line
with open(csv_file, 'r', encoding='utf-8') as file:
    csv_lines = file.readlines()[1:]

# Preprocess TSV file into a dictionary for faster lookups
tsv_lookup = {}
with open(tsv_file_path, 'r', encoding='utf-8') as tsv_file:
    for line in tsv_file:
        # Extract title from the TSV line
        movie_title_tsv = line.split('\t')[2].strip()  # Assuming the title is the third column
        
        # Remove non-alphanumeric characters from the title
        movie_title_tsv = re.sub(r'[^a-zA-Z0-9]', '', movie_title_tsv)

        # Store line in the dictionary
        tsv_lookup[movie_title_tsv] = line.strip()

# List to store found lines
found_lines = []

# List to store line numbers not found in TSV file
lines_not_found = []

# Search for each line from the CSV file in the TSV dictionary
for line_num, csv_line in enumerate(csv_lines, start=2):  # Start from line 2
    if line_num % 10 == 0:  # Print line number every 10 lines
        print(f"Processing line {line_num}...")

    # Extract movie title from the CSV line
    movie_title_csv = csv_line.strip().split(',')[2].strip()

    # Remove non-alphanumeric characters from the title
    movie_title_csv = re.sub(r'[^a-zA-Z0-9]', '', movie_title_csv)

    # Search for the movie title in the TSV dictionary
    tsv_line = tsv_lookup.get(movie_title_csv)
    if tsv_line:
        found_lines.append(tsv_line)
    else:
        lines_not_found.append((line_num, csv_line.strip()))
        print(f"Line {line_num}: '{csv_line.strip()}' not found in TSV file.")

# Write found lines to a CSV file
found_df = pd.DataFrame(found_lines, columns=['TSV Line'])
found_df.to_csv('found_lines_mar.csv', index=False)

print("Total lines not found in TSV file:", len(lines_not_found))
print("Lines not found in TSV file:")
for line_num, line_content in lines_not_found:
    print(f"Line {line_num}: {line_content}")

Processing line 10...
Line 11: 'Best film,2014,Come to my Voice,Huseyin Karabey,"Tuncay Akdemir, Selim Bulut, Sabahettin DAG ..."' not found in TSV file.
Processing line 20...
Processing line 30...
Line 34: 'Best film,1963,Angyalok Földje,György Révész,"Tamás Végvári, Klári Tolnay, Ferenc Zenthe ..."' not found in TSV file.
Processing line 40...
Processing line 50...
Line 59: 'Special jury award,1996,Warshot,Heiner Stadler,"Billy Clarke, Calvin Burke, Claude Channice ..."' not found in TSV file.
Processing line 60...
Line 66: 'Best director,2019,"I Was at Home, But",Angela Schanelec,"Maren Eggert, Jakob Lassalle, Clara Möller ..."' not found in TSV file.
Line 67: 'Best director,2018,What You Gonna Do When The World's On Fire?,Roberto Minervini,"Documentary,  Self: Judy Hill, Dorothy Hill, Michael Nelson ..."' not found in TSV file.
Processing line 70...
Processing line 80...
Line 86: 'Best director,1998,You laugh,"Paolo Taviani, Vittorio Taviani","Turi Ferro, Antonio Albanese, Lello Ar

In [47]:
import pandas as pd
import re 
# CSV file path
csv_file = "C:/Users/frivo/Documents/csv coding project/shanghai_film_affinity_winners.csv"

# TSV file path
tsv_file_path = "C:/Users/frivo/Downloads/title.akas.tsv/title.akas.tsv"

# Read all lines from the CSV file starting from the second line
# Read all lines from the CSV file starting from the second line
with open(csv_file, 'r', encoding='utf-8') as file:
    csv_lines = file.readlines()[1:]

# Preprocess TSV file into a dictionary for faster lookups
tsv_lookup = {}
with open(tsv_file_path, 'r', encoding='utf-8') as tsv_file:
    for line in tsv_file:
        # Extract title from the TSV line
        movie_title_tsv = line.split('\t')[2].strip()  # Assuming the title is the third column
        
        # Remove non-alphanumeric characters from the title
        movie_title_tsv = re.sub(r'[^a-zA-Z0-9]', '', movie_title_tsv)

        # Store line in the dictionary
        tsv_lookup[movie_title_tsv] = line.strip()

# List to store found lines
found_lines = []

# List to store line numbers not found in TSV file
lines_not_found = []

# Search for each line from the CSV file in the TSV dictionary
for line_num, csv_line in enumerate(csv_lines, start=2):  # Start from line 2
    if line_num % 10 == 0:  # Print line number every 10 lines
        print(f"Processing line {line_num}...")

    # Extract movie title from the CSV line
    movie_title_csv = csv_line.strip().split(',')[2].strip()

    # Remove non-alphanumeric characters from the title
    movie_title_csv = re.sub(r'[^a-zA-Z0-9]', '', movie_title_csv)

    # Search for the movie title in the TSV dictionary
    tsv_line = tsv_lookup.get(movie_title_csv)
    if tsv_line:
        found_lines.append(tsv_line)
    else:
        lines_not_found.append((line_num, csv_line.strip()))
        print(f"Line {line_num}: '{csv_line.strip()}' not found in TSV file.")

# Write found lines to a CSV file
found_df = pd.DataFrame(found_lines, columns=['TSV Line'])
found_df.to_csv('found_lines_shanghai.csv', index=False)

print("Total lines not found in TSV file:", len(lines_not_found))
print("Lines not found in TSV file:")
for line_num, line_content in lines_not_found:
    print(f"Line {line_num}: {line_content}")

Processing line 10...
Line 12: 'Best film,2011,Hayde bre,Orhan Oguz,"Sevket Emrulla, Nilüfer Açikalin, Ilker Inanoglu ..."' not found in TSV file.
Line 16: 'Best film,2007,According to the plan,Franziska Meletzky,"Dagmar Manzel, Corinna Harfouch, Kirsten Block ..."' not found in TSV file.
Processing line 20...
Processing line 30...
Processing line 40...
Processing line 50...
Processing line 60...
Processing line 70...
Line 75: 'Best actor,2023,Dust To Dust,Jonathan Li,"Da Peng, Yao Chang, Lan Ke ..."' not found in TSV file.
Processing line 80...
Line 85: 'Best actor,2011,Hayde bre,Orhan Oguz,"Sevket Emrulla, Nilüfer Açikalin, Ilker Inanoglu ..."' not found in TSV file.
Processing line 90...
Processing line 100...
Processing line 110...
Line 113: 'Best actress,2007,According to the plan,Franziska Meletzky,"Dagmar Manzel, Corinna Harfouch, Kirsten Block ..."' not found in TSV file.
Processing line 120...
Processing line 130...
Processing line 140...
Processing line 150...
Processing line

In [48]:
import pandas as pd
import re
# CSV file path
csv_file = "C:/Users/frivo/Documents/csv coding project/sitges_film_affinity_winners.csv"

# TSV file path
tsv_file_path = "C:/Users/frivo/Downloads/title.akas.tsv/title.akas.tsv"

# Read all lines from the CSV file starting from the second line
with open(csv_file, 'r', encoding='utf-8') as file:
    csv_lines = file.readlines()[1:]

# Preprocess TSV file into a dictionary for faster lookups
tsv_lookup = {}
with open(tsv_file_path, 'r', encoding='utf-8') as tsv_file:
    for line in tsv_file:
        # Extract title from the TSV line
        movie_title_tsv = line.split('\t')[2].strip()  # Assuming the title is the third column
        
        # Remove non-alphanumeric characters from the title
        movie_title_tsv = re.sub(r'[^a-zA-Z0-9]', '', movie_title_tsv)

        # Store line in the dictionary
        tsv_lookup[movie_title_tsv] = line.strip()

# List to store found lines
found_lines = []

# List to store line numbers not found in TSV file
lines_not_found = []

# Search for each line from the CSV file in the TSV dictionary
for line_num, csv_line in enumerate(csv_lines, start=2):  # Start from line 2
    if line_num % 10 == 0:  # Print line number every 10 lines
        print(f"Processing line {line_num}...")

    # Extract movie title from the CSV line
    movie_title_csv = csv_line.strip().split(',')[2].strip()

    # Remove non-alphanumeric characters from the title
    movie_title_csv = re.sub(r'[^a-zA-Z0-9]', '', movie_title_csv)

    # Search for the movie title in the TSV dictionary
    tsv_line = tsv_lookup.get(movie_title_csv)
    if tsv_line:
        found_lines.append(tsv_line)
    else:
        lines_not_found.append((line_num, csv_line.strip()))
        print(f"Line {line_num}: '{csv_line.strip()}' not found in TSV file.")

# Write found lines to a CSV file
found_df = pd.DataFrame(found_lines, columns=['TSV Line'])
found_df.to_csv('found_lines_sitges.csv', index=False)

print("Total lines not found in TSV file:", len(lines_not_found))
print("Lines not found in TSV file:")
for line_num, line_content in lines_not_found:
    print(f"Line {line_num}: {line_content}")

Processing line 10...
Processing line 20...
Processing line 30...
Processing line 40...
Processing line 50...
Processing line 60...
Processing line 70...
Processing line 80...
Processing line 90...
Line 98: 'Best director,1986,The Legend Of Suram Fortress,"Sergei Parajanov, Dodo Abashidze","Veriko Andjaparidze, Tamari Tsitsishvili, Dodo Abashidze ..."' not found in TSV file.
Processing line 100...
Processing line 110...
Processing line 120...
Processing line 130...
Processing line 140...
Processing line 150...
Line 153: 'Best actor,1984,The Brother From Another Planet,John Sayles,"Joe Morton, Tom Wright, Caroline Aaron ..."' not found in TSV file.
Processing line 160...
Processing line 170...
Processing line 180...
Processing line 190...
Processing line 200...
Processing line 210...
Processing line 220...
Processing line 230...
Line 236: 'Best screenplay,2007,Is OK,Park Chan-wook,"Lim Soo-jung, Rain, Choi Hee-jin ..."' not found in TSV file.
Processing line 240...
Line 247: 'Best scree

In [25]:
import pandas as pd
import re

# CSV file path
csv_file = "C:/Users/frivo/Documents/csv coding project/tokyo_film_affinity_winners.csv"

# TSV file path
tsv_file_path = "C:/Users/frivo/Downloads/title.akas.tsv/title.akas.tsv"

# Read all lines from the CSV file starting from the second line
with open(csv_file, 'r', encoding='utf-8') as file:
    csv_lines = file.readlines()[1:]

# Preprocess TSV file into a dictionary for faster lookups
tsv_lookup = {}
with open(tsv_file_path, 'r', encoding='utf-8') as tsv_file:
    for line in tsv_file:
        # Extract title from the TSV line
        movie_title_tsv = line.split('\t')[2].strip()  # Assuming the title is the third column
        
        # Remove non-alphanumeric characters from the title
        movie_title_tsv = re.sub(r'[^a-zA-Z0-9]', '', movie_title_tsv)

        # Store line in the dictionary
        tsv_lookup[movie_title_tsv] = line.strip()

# List to store found lines
found_lines = []

# List to store line numbers not found in TSV file
lines_not_found = []

# Search for each line from the CSV file in the TSV dictionary
for line_num, csv_line in enumerate(csv_lines, start=2):  # Start from line 2
    if line_num % 10 == 0:  # Print line number every 10 lines
        print(f"Processing line {line_num}...")

    # Extract movie title from the CSV line
    movie_title_csv = csv_line.strip().split(',')[2].strip()

    # Remove non-alphanumeric characters from the title
    movie_title_csv = re.sub(r'[^a-zA-Z0-9]', '', movie_title_csv)

    # Search for the movie title in the TSV dictionary
    tsv_line = tsv_lookup.get(movie_title_csv)
    if tsv_line:
        found_lines.append(tsv_line)
    else:
        lines_not_found.append((line_num, csv_line.strip()))
        print(f"Line {line_num}: '{csv_line.strip()}' not found in TSV file.")

# Write found lines to a CSV file
found_df = pd.DataFrame(found_lines, columns=['TSV Line'])
found_df.to_csv('found_lines_tokyo.csv', index=False)

print("Total lines not found in TSV file:", len(lines_not_found))
print("Lines not found in TSV file:")
for line_num, line_content in lines_not_found:
    print(f"Line {line_num}: {line_content}")

Total lines not found in TSV file: 0
Lines not found in TSV file:


In [49]:
import pandas as pd
import re

# CSV file path
csv_file = "C:/Users/frivo/Documents/csv coding project/toronto_film_affinity_winners.csv"

# TSV file path
tsv_file_path = "C:/Users/frivo/Downloads/title.akas.tsv/title.akas.tsv"

# Read all lines from the CSV file starting from the second line
with open(csv_file, 'r', encoding='utf-8') as file:
    csv_lines = file.readlines()[1:]

# Preprocess TSV file into a dictionary for faster lookups
tsv_lookup = {}
with open(tsv_file_path, 'r', encoding='utf-8') as tsv_file:
    for line in tsv_file:
        # Extract title from the TSV line
        movie_title_tsv = line.split('\t')[2].strip()  # Assuming the title is the third column
        
        # Remove non-alphanumeric characters from the title
        movie_title_tsv = re.sub(r'[^a-zA-Z0-9]', '', movie_title_tsv)

        # Store line in the dictionary
        tsv_lookup[movie_title_tsv] = line.strip()

# List to store found lines
found_lines = []

# List to store line numbers not found in TSV file
lines_not_found = []

# Search for each line from the CSV file in the TSV dictionary
for line_num, csv_line in enumerate(csv_lines, start=2):  # Start from line 2
    if line_num % 10 == 0:  # Print line number every 10 lines
        print(f"Processing line {line_num}...")

    # Extract movie title from the CSV line
    movie_title_csv = csv_line.strip().split(',')[2].strip()

    # Remove non-alphanumeric characters from the title
    movie_title_csv = re.sub(r'[^a-zA-Z0-9]', '', movie_title_csv)

    # Search for the movie title in the TSV dictionary
    tsv_line = tsv_lookup.get(movie_title_csv)
    if tsv_line:
        found_lines.append(tsv_line)
    else:
        lines_not_found.append((line_num, csv_line.strip()))
        print(f"Line {line_num}: '{csv_line.strip()}' not found in TSV file.")

# Write found lines to a CSV file
found_df = pd.DataFrame(found_lines, columns=['TSV Line'])
found_df.to_csv('found_lines_toronto.csv', index=False)

print("Total lines not found in TSV file:", len(lines_not_found))
print("Lines not found in TSV file:")
for line_num, line_content in lines_not_found:
    print(f"Line {line_num}: {line_content}")

Line 8: 'Choice award,2017,"Three Billboards Outside Ebbing, Missouri",Martin McDonagh,"Frances McDormand, Woody Harrelson, Sam Rockwell ..."' not found in TSV file.
Processing line 10...
Processing line 20...
Processing line 30...
Processing line 40...
Processing line 50...
Processing line 60...
Processing line 70...
Line 70: 'Best canadian film,2014,Félix and Meira,Maxime Giroux,"Martin Dubreuil, Hadas Yaron, Luzer Twersky ..."' not found in TSV file.
Processing line 80...
Processing line 90...
Processing line 100...
Line 106: 'Best canadian first film,2017,LukI,Wayne Wapeemukwa,"Angel Gates, Joe Buffalo, Ken Harrower ..."' not found in TSV file.
Processing line 110...
Processing line 120...
Line 129: 'Platform mention,2019,"Anne at 13,000 Ft.",Kazik Radwanski,"Deragh Campbell, Lawrene Denkers, Matt Johnson ..."' not found in TSV file.
Processing line 130...
Processing line 140...
Processing line 150...
Processing line 160...
Processing line 170...
Processing line 180...
Total lines 

Processing line 10...
Processing line 20...
Processing line 30...
Line 35: 'Golden lion,1990,Rosencrantz and Guildenstern Are Dead,Tom Stoppard,"Gary Oldman, Tim Roth, Richard Dreyfuss ..."' not found in TSV file.
Processing line 40...
Line 46: 'Golden lion,1968,Artists under the Big Top: Perplexed,Alexander Kluge,"Hannelore Hoger, Sigi Graue, Alfred Edel ..."' not found in TSV file.
Line 49: 'Golden lion,1965,Sandra of a Thousand Delights,Luchino Visconti,"Claudia Cardinale, Jean Sorel, Michael Craig ..."' not found in TSV file.
Processing line 50...
Processing line 60...
Processing line 70...
Processing line 80...
Line 88: 'Silver lion,2023,Me Captain,Matteo Garrone,"Seydou Sarr, Moustapha Fall, Issaka Sawadogo ..."' not found in TSV file.
Processing line 90...
Processing line 100...
Line 100: 'Silver lion,2011,"People Mountain, People Sea",Cai Shangjun,"Chen Jianbin, Tao Hong, Wu Xiubo ..."' not found in TSV file.
Processing line 110...
Processing line 120...
Processing line 130...


In [64]:
import pandas as pd
import re

# CSV file path
csv_file = "C:/Users/frivo/Downloads/academy_awards_film_affinity_winners.csv"

# TSV file path
tsv_file_path = "C:/Users/frivo/Downloads/title.akas.tsv/title.akas.tsv"

# Read all lines from the CSV file starting from the second line
with open(csv_file, 'r', encoding='utf-8') as file:
    csv_lines = file.readlines()[1:]

# Preprocess TSV file into a dictionary for faster lookups
tsv_lookup = {}
with open(tsv_file_path, 'r', encoding='utf-8') as tsv_file:
    for line in tsv_file:
        # Extract title from the TSV line
        movie_title_tsv = line.split('\t')[2].strip()  # Assuming the title is the third column
        
        # Remove non-alphanumeric characters from the title
        movie_title_tsv = re.sub(r'[^a-zA-Z0-9]', '', movie_title_tsv)

        # Store line in the dictionary
        tsv_lookup[movie_title_tsv] = line.strip()

# List to store found lines
found_lines = []

# List to store line numbers not found in TSV file
lines_not_found = []

# Search for each line from the CSV file in the TSV dictionary
for line_num, csv_line in enumerate(csv_lines, start=2):  # Start from line 2
    if line_num % 10 == 0:  # Print line number every 10 lines
        print(f"Processing line {line_num}...")

    # Extract movie title from the CSV line excluding anything within parentheses
    movie_title_csv = re.sub(r'\([^()]*\)', '', csv_line.strip().split(',')[2].strip())

    # Remove non-alphanumeric characters from the title
    movie_title_csv = re.sub(r'[^a-zA-Z0-9]', '', movie_title_csv)

    # Search for the movie title in the TSV dictionary
    tsv_line = tsv_lookup.get(movie_title_csv)
    if tsv_line:
        found_lines.append(','.join(tsv_line.split('\t')))  # Joining the data with commas
    else:
        lines_not_found.append((line_num, csv_line.strip()))
        print(f"Line {line_num}: '{csv_line.strip()}' not found in TSV file.")

# Write found lines to a CSV file
found_df = pd.DataFrame(found_lines, columns=['TSV Line'])
found_df.to_csv('found_lines_academy_award.csv', index=False)

print("Total lines not found in TSV file:", len(lines_not_found))
print("Lines not found in TSV file:")
for line_num, line_content in lines_not_found:
    print(f"Line {line_num}: {line_content}")


Processing line 10...
Processing line 20...
Processing line 30...
Processing line 40...
Processing line 50...
Processing line 60...
Processing line 70...
Processing line 80...
Processing line 90...
Processing line 100...
Processing line 110...
Processing line 120...
Processing line 130...
Processing line 140...
Processing line 150...
Processing line 160...
Processing line 170...
Processing line 180...
Processing line 190...
Processing line 200...
Line 200: 'Best leading actress,2018,"Three Billboards Outside Ebbing, Missouri",Martin McDonagh,"Frances McDormand, Woody Harrelson, Sam Rockwell ..."' not found in TSV file.
Processing line 210...
Processing line 220...
Processing line 230...
Processing line 240...
Processing line 250...
Processing line 260...
Processing line 270...
Line 277: 'Best leading actress,1941,Kitty Foyle: The Natural History of a Woman,Sam Wood,"Ginger Rogers, Dennis Morgan, James Craig ..."' not found in TSV file.
Processing line 280...
Processing line 290...
Proc

In [63]:
import pandas as pd
import re

# CSV file path
csv_file = "C:/Users/frivo/Downloads/golden_globes_film_affinity_winners.csv"

# TSV file path
tsv_file_path = "C:/Users/frivo/Downloads/title.akas.tsv/title.akas.tsv"

# Read all lines from the CSV file starting from the second line
with open(csv_file, 'r', encoding='utf-8') as file:
    csv_lines = file.readlines()[1:]

# Preprocess TSV file into a dictionary for faster lookups
tsv_lookup = {}
with open(tsv_file_path, 'r', encoding='utf-8') as tsv_file:
    for line in tsv_file:
        # Extract title from the TSV line
        movie_title_tsv = line.split('\t')[2].strip()  # Assuming the title is the third column
        
        # Remove non-alphanumeric characters from the title
        movie_title_tsv = re.sub(r'[^a-zA-Z0-9]', '', movie_title_tsv)

        # Store line in the dictionary
        tsv_lookup[movie_title_tsv] = line.strip()

# List to store found lines
found_lines = []

# List to store line numbers not found in TSV file
lines_not_found = []

# Search for each line from the CSV file in the TSV dictionary
for line_num, csv_line in enumerate(csv_lines, start=2):  # Start from line 2
    if line_num % 10 == 0:  # Print line number every 10 lines
        print(f"Processing line {line_num}...")

    # Extract movie title from the CSV line excluding anything within parentheses
    movie_title_csv = re.sub(r'\([^()]*\)', '', csv_line.strip().split(',')[2].strip())

    # Remove non-alphanumeric characters from the title
    movie_title_csv = re.sub(r'[^a-zA-Z0-9]', '', movie_title_csv)

    # Search for the movie title in the TSV dictionary
    tsv_line = tsv_lookup.get(movie_title_csv)
    if tsv_line:
        found_lines.append(','.join(tsv_line.split('\t')))  # Joining the data with commas
    else:
        lines_not_found.append((line_num, csv_line.strip()))
        print(f"Line {line_num}: '{csv_line.strip()}' not found in TSV file.")

# Write found lines to a CSV file
found_df = pd.DataFrame(found_lines, columns=['TSV Line'])
found_df.to_csv('found_lines_golden_globes_award.csv', index=False)

print("Total lines not found in TSV file:", len(lines_not_found))
print("Lines not found in TSV file:")
for line_num, line_content in lines_not_found:
    print(f"Line {line_num}: {line_content}")


Line 8: 'Best picture drama,2018,"Three Billboards Outside Ebbing, Missouri",Martin McDonagh,"Frances McDormand, Woody Harrelson, Sam Rockwell ..."' not found in TSV file.
Processing line 10...
Processing line 20...
Processing line 30...
Processing line 40...
Processing line 50...
Processing line 60...
Processing line 70...
Processing line 80...
Processing line 90...
Processing line 100...
Processing line 110...
Processing line 120...
Processing line 130...
Processing line 140...
Processing line 150...
Processing line 160...
Processing line 170...
Processing line 180...
Processing line 190...
Processing line 200...
Processing line 210...
Processing line 220...
Processing line 230...
Processing line 240...
Processing line 250...
Processing line 260...
Processing line 270...
Processing line 280...
Processing line 290...
Processing line 300...
Processing line 310...
Processing line 320...
Line 324: 'Best leading actress drama,2018,"Three Billboards Outside Ebbing, Missouri",Martin McDonag

In [73]:
import pandas as pd

def search_and_save(csv_file1, csv_column1, csv_file2, csv_column2, output_csv):
    # Read the first CSV file and extract values from the specified column
    print("Reading first CSV file and extracting values...")
    csv_df1 = pd.read_csv(csv_file1)
    csv_values1 = csv_df1.iloc[:, csv_column1].astype(str).str.replace(r'\W', '').str.lower()
    print("Values extracted from the first CSV.")

    # Read the second CSV file and extract values from the specified column
    print("Reading second CSV file and extracting values...")
    csv_df2 = pd.read_csv(csv_file2)
    csv_values2 = csv_df2.iloc[:, csv_column2].astype(str).str.replace(r'\W', '').str.lower()
    print("Values extracted from the second CSV.")

    # Initialize a list to store lines where matches are found
    found_lines = []

    # Iterate through each value from the first CSV and search in the second CSV
    print("Searching for matches...")
    for i, csv_value1 in enumerate(csv_values1):
        # Check if csv_value1 exists in the second CSV
        if csv_value1 in csv_values2.values:
            print(f"Match found for value {csv_value1} at index {i}.")
            # Add the corresponding line from the first CSV to the list
            found_lines.append(csv_df1.iloc[i, :])

    # Write the found lines to a new CSV file
    found_df = pd.DataFrame(found_lines)
    found_df.to_csv(output_csv, index=False)
    print("Found lines saved to CSV.")

# Paths to the CSV files
csv_file1_path = "C:/Users/frivo/Downloads/camera_movies.csv"
csv_file2_path = "C:/Users/frivo/Downloads/found_lines_combined_film_festival_modified.csv"

# Output CSV file path
output_csv_path = 'found_lines_again.csv'

# Call the function
search_and_save(csv_file1_path, 0, csv_file2_path, 2, output_csv_path)


Reading first CSV file and extracting values...
Values extracted from the first CSV.
Reading second CSV file and extracting values...
Values extracted from the second CSV.
Searching for matches...
Match found for value excalibur at index 52.
Match found for value fitzcarraldo at index 84.
Match found for value birdy at index 163.
Match found for value police at index 196.
Match found for value platoon at index 206.
Match found for value caravaggio at index 218.
Match found for value robocop at index 244.
Match found for value brightness at index 271.
Match found for value alice at index 376.
Match found for value goodfellas at index 385.
Match found for value cyclo at index 600.
Match found for value shine at index 609.
Match found for value fargo at index 625.
Match found for value crash at index 630.
Match found for value gattaca at index 730.
Match found for value tango at index 762.
Match found for value topsy-turvy at index 829.
Match found for value magnolia at index 875.
Match f

In [68]:
import pandas as pd

# Read the CSV file with UTF-8 encoding
csv_file = "C:/Users/frivo/found_lines_golden_globes_award.csv"
with open(csv_file, 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Split each line by comma and create a list of lists
data = [line.strip().split(',') for line in lines]

# Define column names
column_names = ['index', 'number', 'title', 'letter', 'letter_again', 'something', 'something_else', 'number_again','random']

# Create a DataFrame from the list of lists with specified column names
df = pd.DataFrame(data, columns=column_names)

# Save the DataFrame to the same CSV file, overwriting the existing file
df.to_csv(csv_file, index=False)


In [69]:
import pandas as pd

# Read the CSV file with UTF-8 encoding
csv_file = "C:/Users/frivo/found_lines_academy_award.csv"
with open(csv_file, 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Split each line by comma and create a list of lists
data = [line.strip().split(',') for line in lines]

# Define column names
column_names = ['index', 'number', 'title', 'letter', 'letter_again', 'something', 'something_else', 'number_again','random']

# Create a DataFrame from the list of lists with specified column names
df = pd.DataFrame(data, columns=column_names)

# Save the DataFrame to the same CSV file, overwriting the existing file
df.to_csv(csv_file, index=False)


In [70]:
import pandas as pd

# Paths to the CSV files
csv_file1_path = "C:/Users/frivo/found_lines_golden_globes_award.csv"
csv_file2_path = "C:/Users/frivo/found_lines_academy_award.csv"

# Read both CSV files
csv_df1 = pd.read_csv(csv_file1_path, sep=',', header=None)
csv_df2 = pd.read_csv(csv_file2_path, sep=',', header=None)

# Concatenate both dataframes
combined_df = pd.concat([csv_df1, csv_df2], ignore_index=True)

# Output CSV file path
output_csv_path = 'C:/Users/frivo/found_lines_combined.csv'

# Save the combined dataframe to a new CSV file with comma-separated values
combined_df.to_csv(output_csv_path, index=False, header=False)

print("Combined CSV file saved successfully with comma-separated values on each line.")


Combined CSV file saved successfully with comma-separated values on each line.


In [71]:
import pandas as pd

def search_and_save(csv_file1, csv_column1, csv_file2, csv_column2, output_csv):
    # Read the first CSV file and extract values from the specified column
    print("Reading first CSV file and extracting values...")
    csv_df1 = pd.read_csv(csv_file1)
    csv_values1 = csv_df1.iloc[:, csv_column1].astype(str).str.replace(r'\W', '').str.lower()
    print("Values extracted from the first CSV.")

    # Read the second CSV file and extract values from the specified column
    print("Reading second CSV file and extracting values...")
    csv_df2 = pd.read_csv(csv_file2)
    csv_values2 = csv_df2.iloc[:, csv_column2].astype(str).str.replace(r'\W', '').str.lower()
    print("Values extracted from the second CSV.")

    # Initialize a list to store lines where matches are found
    found_lines = []

    # Iterate through each value from the first CSV and search in the second CSV
    print("Searching for matches...")
    for i, csv_value1 in enumerate(csv_values1):
        # Check if csv_value1 exists in the second CSV
        if csv_value1 in csv_values2.values:
            print(f"Match found for value {csv_value1} at index {i}.")
            # Add the corresponding line from the first CSV to the list
            found_lines.append(csv_df1.iloc[i, :])

    # Write the found lines to a new CSV file
    found_df = pd.DataFrame(found_lines)
    found_df.to_csv(output_csv, index=False)
    print("Found lines saved to CSV.")

# Paths to the CSV files
csv_file1_path = "C:/Users/frivo/Downloads/camera_movies.csv"
csv_file2_path = "C:/Users/frivo/found_lines_combined.csv"

# Output CSV file path
output_csv_path = 'found_lines.csv'

# Call the function
search_and_save(csv_file1_path, 0, csv_file2_path, 2, output_csv_path)


Reading first CSV file and extracting values...
Values extracted from the first CSV.
Reading second CSV file and extracting values...
Values extracted from the second CSV.
Searching for matches...
Match found for value fame at index 21.
Match found for value reds at index 63.
Match found for value tootsie at index 98.
Match found for value gandhi at index 105.
Match found for value dune at index 151.
Match found for value amadeus at index 166.
Match found for value platoon at index 206.
Match found for value castaway at index 216.
Match found for value aliens at index 240.
Match found for value robocop at index 244.
Match found for value buster at index 303.
Match found for value batman at index 318.
Match found for value seinfeld at index 321.
Match found for value ghost at index 363.
Match found for value alice at index 376.
Match found for value goodfellas at index 385.
Match found for value jfk at index 395.
Match found for value unforgiven at index 460.
Match found for value speed

In [74]:
import pandas as pd

# Paths to the CSV files
csv_file1_path = "C:/Users/frivo/found_lines.csv"
csv_file2_path = "C:/Users/frivo/found_lines_again.csv"

# Read both CSV files
csv_df1 = pd.read_csv(csv_file1_path, sep=',', header=None)
csv_df2 = pd.read_csv(csv_file2_path, sep=',', header=None)

# Concatenate both dataframes
combined_df = pd.concat([csv_df1, csv_df2], ignore_index=True)

# Output CSV file path
output_csv_path = 'C:/Users/frivo/grouped_merged_combined_festival_test'

# Save the combined dataframe to a new CSV file with comma-separated values
combined_df.to_csv(output_csv_path, index=False, header=False)

print("Combined CSV file saved successfully with comma-separated values on each line.")

Combined CSV file saved successfully with comma-separated values on each line.
