In [None]:
#ADVENTURE

import time
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import numpy as np
import re

# Function to convert raw votes(eg: 2k) to integer(eg: 2000)
def parse_votes(text):
    if not text:
        return 0
    text = text.strip("()").replace(",", "").upper()
    match = re.match(r"([\d\.]+)([KM]?)", text)
    if not match:
        return 0
    number, suffix = match.groups()
    number = float(number)
    if suffix == "K":
        return int(number * 1_000)
    elif suffix == "M":
        return int(number * 1_000_000)
    else:
        return int(number)
# Convert duration string(eg: 1h 50m) to minutes of integer datatype
def duration_to_minutes(duration_str):
    hours = minutes = 0
    if 'h' in duration_str:
        h_match = re.search(r'(\d+)\s*h', duration_str)
        if h_match:
            hours = int(h_match.group(1))
    if 'm' in duration_str:
        m_match = re.search(r'(\d+)\s*m', duration_str)
        if m_match:
            minutes = int(m_match.group(1))
    return hours * 60 + minutes

# Setup WebDriver
driver = webdriver.Chrome()
driver.get('https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&genres=adventure')
time.sleep(5)

# Click more button several times to display all the movies on the IMDb search results page
for _ in range(12):
    try:
        load_more = driver.find_element(By.XPATH, "//span[contains(text(),'more')]")
        driver.execute_script("arguments[0].click();", load_more)
        time.sleep(2.5)
    except:
        break

# Data collection
movie_details = {
    'movie_names': [],
    'ratings': [],
    'voting_counts': [],
    'duration_minutes': []
}
# Finds all movie blocks on the IMDb search results page
movie_blocks = driver.find_elements(By.CSS_SELECTOR, "li[class='ipc-metadata-list-summary-item']")

for block in movie_blocks:
# Title
    title = block.find_element(By.CSS_SELECTOR, 'h3').text.split('. ', 1)[-1].strip()
# Duration
    duration_str = np.nan
    spans = block.find_elements(By.TAG_NAME, "span")
    for span in spans:
        text = span.text.strip()
        if ('h' in text or 'm' in text) and any(char.isdigit() for char in text):
            duration_str = text
            break
    if isinstance(duration_str, str):
        duration_minutes = duration_to_minutes(duration_str)
    else:
        duration_minutes = np.nan
# Ratings
    try:
        rating = block.find_element(By.CSS_SELECTOR, "span[class ='ipc-rating-star--rating']").text.strip()
    except:
        rating = np.nan
# Votes
    try:
        vote_element = block.find_element(By.CSS_SELECTOR, "span[class ='ipc-rating-star--voteCount']")
        raw_votes_text = vote_element.text.strip() 
        votes = parse_votes(raw_votes_text)
    except:
        votes = 0

# Store movie data and print it
    movie_details['movie_names'].append(title)
    movie_details['ratings'].append(rating)
    movie_details['voting_counts'].append(votes)
    movie_details['duration_minutes'].append(duration_minutes)
    print(f"{title} | Rating: {rating} | Votes: {votes} | Duration: {duration_minutes}")
    
print("Total movies scraped:", len(movie_details['movie_names']))


In [None]:
# Data cleaning

import pandas as pd

adventure = pd.DataFrame(movie_details)
# Adds a new column named "genre" and sets every row's value to "adventure"
adventure['genre'] = "adventure"
# Checks for missing values (NaNs) in each column.
adventure.isna().sum()
# Converts the 'ratings' column to numeric (float) values.
adventure['ratings'] = pd.to_numeric(adventure['ratings'])
# Fills missing values
adventure = adventure.fillna(value={
                'ratings':float(adventure['ratings'].mean()),
                'duration_minutes':int(adventure['duration_minutes'].mean())
                })
# Makes sure the duration_minutes column is of integer type (not float) — because duration is a whole number
adventure['duration_minutes'] = adventure['duration_minutes'].astype(int)
# Prints a summary of the DataFrame:
adventure.info()
# Saves the cleaned DataFrame to a CSV file
adventure.to_csv("adventure.csv")

In [None]:
#Animation

import time
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import numpy as np
import re

# Function to convert raw votes(eg: 2k) to integer(eg: 2000)
def parse_votes(text):
    if not text:
        return 0
    text = text.strip("()").replace(",", "").upper()
    match = re.match(r"([\d\.]+)([KM]?)", text)
    if not match:
        return 0
    number, suffix = match.groups()
    number = float(number)
    if suffix == "K":
        return int(number * 1_000)
    elif suffix == "M":
        return int(number * 1_000_000)
    else:
        return int(number)
# Convert duration string(eg: 1h 50m) to minutes of integer datatype
def duration_to_minutes(duration_str):
    hours = minutes = 0
    if 'h' in duration_str:
        h_match = re.search(r'(\d+)\s*h', duration_str)
        if h_match:
            hours = int(h_match.group(1))
    if 'm' in duration_str:
        m_match = re.search(r'(\d+)\s*m', duration_str)
        if m_match:
            minutes = int(m_match.group(1))
    return hours * 60 + minutes

# Setup WebDriver
driver = webdriver.Chrome()
driver.get('https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&genres=animation')
time.sleep(5)

# Click more button several times to display all the movies on the IMDb search results page
for _ in range(8):
    try:
        load_more = driver.find_element(By.XPATH, "//span[contains(text(),'more')]")
        driver.execute_script("arguments[0].click();", load_more)
        time.sleep(2.5)
    except:
        break
# Data collection
movie_details = {
    'movie_names': [],
    'ratings': [],
    'voting_counts': [],
    'duration_minutes': []
}
# Finds all movie blocks on the IMDb search results page
movie_blocks = driver.find_elements(By.CSS_SELECTOR, "li[class='ipc-metadata-list-summary-item']")

for block in movie_blocks:
# Title
    title = block.find_element(By.CSS_SELECTOR, 'h3').text.split('. ', 1)[-1].strip()

# Duration
    duration_str = np.nan
    spans = block.find_elements(By.TAG_NAME, "span")
    for span in spans:
        text = span.text.strip()
        if ('h' in text or 'm' in text) and any(char.isdigit() for char in text):
            duration_str = text
            break
    if isinstance(duration_str, str):
        duration_minutes = duration_to_minutes(duration_str)
    else:
        duration_minutes = np.nan
# Ratings 
    try:
        rating = block.find_element(By.CSS_SELECTOR, "span[class ='ipc-rating-star--rating']").text.strip()
    except:
        rating = np.nan
# Votes
    try:
        vote_element = block.find_element(By.CSS_SELECTOR, "span[class ='ipc-rating-star--voteCount']")
        raw_votes_text = vote_element.text.strip()  # e.g. "246K"
        votes = parse_votes(raw_votes_text)
    except:
        votes = 0
# Store movie data and print it
    movie_details['movie_names'].append(title)
    movie_details['ratings'].append(rating)
    movie_details['voting_counts'].append(votes)
    movie_details['duration_minutes'].append(duration_minutes)
    
    print(f"{title} | Rating: {rating} | Votes: {votes} | Duration: {duration_minutes}")

print("Total movies scraped:", len(movie_details['movie_names']))


In [None]:
# Data cleaning

import pandas as pd

animation= pd.DataFrame(movie_details)
# Adds a new column named "genre" and sets every row's value to "animation"
animation['genre'] = "animation"
# Checks for missing values (NaNs) in each column.
animation.isna().sum()
# Converts the 'ratings' column to numeric (float) values.
animation['ratings'] = pd.to_numeric(animation['ratings'])
# Fills missing values
animation= animation.fillna(value={
                'ratings':float(animation['ratings'].mean()),
                'duration_minutes':int(animation['duration_minutes'].mean())
                })
# Makes sure the duration_minutes column is of integer type (not float) — because duration is a whole number
animation['duration_minutes'] = animation['duration_minutes'].astype(int)
# Prints a summary of the DataFrame:
animation.info()
# Saves the cleaned DataFrame to a CSV file
animation.to_csv("animation.csv")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   movie_names       420 non-null    object 
 1   ratings           420 non-null    float64
 2   voting_counts     420 non-null    int64  
 3   duration_minutes  420 non-null    int64  
 4   genre             420 non-null    object 
dtypes: float64(1), int64(2), object(2)
memory usage: 16.5+ KB


In [None]:
#Biography

import time
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import numpy as np
import re

# Function to convert raw votes(eg: 2k) to integer(eg: 2000)
def parse_votes(text):
    if not text:
        return 0
    text = text.strip("()").replace(",", "").upper()
    match = re.match(r"([\d\.]+)([KM]?)", text)
    if not match:
        return 0
    number, suffix = match.groups()
    number = float(number)
    if suffix == "K":
        return int(number * 1_000)
    elif suffix == "M":
        return int(number * 1_000_000)
    else:
        return int(number)
    
# Convert duration string(eg: 1h 50m) to minutes of integer datatype
def duration_to_minutes(duration_str):
    hours = minutes = 0
    if 'h' in duration_str:
        h_match = re.search(r'(\d+)\s*h', duration_str)
        if h_match:
            hours = int(h_match.group(1))
    if 'm' in duration_str:
        m_match = re.search(r'(\d+)\s*m', duration_str)
        if m_match:
            minutes = int(m_match.group(1))
    return hours * 60 + minutes

# Setup WebDriver
driver = webdriver.Chrome()
driver.get('https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&genres=biography')
time.sleep(5)

# Click more button several times to display all the movies on the IMDb search results page
for _ in range(9):
    try:
        load_more = driver.find_element(By.XPATH, "//span[contains(text(),'more')]")
        driver.execute_script("arguments[0].click();", load_more)
        time.sleep(2.5)
    except:
        break

# Data collection
movie_details = {
    'movie_names': [],
    'ratings': [],
    'voting_counts': [],
    'duration_minutes': []
}
# Finds all movie blocks on the IMDb search results page
movie_blocks = driver.find_elements(By.CSS_SELECTOR, "li[class='ipc-metadata-list-summary-item']")

for block in movie_blocks:
# Title
    title = block.find_element(By.CSS_SELECTOR, 'h3').text.split('. ', 1)[-1].strip()
# Duration
    duration_str = np.nan
    spans = block.find_elements(By.TAG_NAME, "span")
    for span in spans:
        text = span.text.strip()
        if ('h' in text or 'm' in text) and any(char.isdigit() for char in text):
            duration_str = text
            break
    if isinstance(duration_str, str):
        duration_minutes = duration_to_minutes(duration_str)
    else:
        duration_minutes = np.nan
# Ratings
    try:
        rating = block.find_element(By.CSS_SELECTOR, "span[class ='ipc-rating-star--rating']").text.strip()
    except:
        rating = np.nan
# Votes
    try:
        vote_element = block.find_element(By.CSS_SELECTOR, "span[class ='ipc-rating-star--voteCount']")
        raw_votes_text = vote_element.text.strip()  # e.g. "246K"
        votes = parse_votes(raw_votes_text)
    except:
        votes = 0
# Store movie data and print it
    movie_details['movie_names'].append(title)
    movie_details['ratings'].append(rating)
    movie_details['voting_counts'].append(votes)
    movie_details['duration_minutes'].append(duration_minutes)

    print(f"{title} | Rating: {rating} | Votes: {votes} | Duration: {duration_minutes}")

print("Total movies scraped:", len(movie_details['movie_names']))


A Complete Unknown | Rating: 7.4 | Votes: 88000 | Duration: 141
I'm Still Here | Rating: 8.2 | Votes: 111000 | Duration: 137
Saturday Night | Rating: 6.9 | Votes: 32000 | Duration: 109
The Apprentice | Rating: 7.1 | Votes: 59000 | Duration: 122
Better Man | Rating: 7.6 | Votes: 30000 | Duration: 135
Queer | Rating: 6.4 | Votes: 23000 | Duration: 137
Queen of the Ring | Rating: 6.4 | Votes: 1900 | Duration: 129
Maria | Rating: 6.4 | Votes: 20000 | Duration: 124
Reagan | Rating: 6.1 | Votes: 11000 | Duration: 141
The Fire Inside | Rating: 6.7 | Votes: 2900 | Duration: 109
Nr. 24 | Rating: 7.5 | Votes: 16000 | Duration: 111
William Tell | Rating: 5.7 | Votes: 2900 | Duration: 133
Pavements | Rating: 7.2 | Votes: 253 | Duration: 128
Joy | Rating: 7.1 | Votes: 9800 | Duration: 115
Ordinary Angels | Rating: 7.4 | Votes: 16000 | Duration: 118
Back to Black | Rating: 6.3 | Votes: 27000 | Duration: 122
Bonhoeffer: Pastor. Spy. Assassin. | Rating: 6.2 | Votes: 3800 | Duration: 133
Young Woman an

In [None]:

# Data cleaning
import pandas as pd

biography= pd.DataFrame(movie_details)
# Adds a new column named "genre" and sets every row's value to "biography"
biography['genre'] = "biography"
# Checks for missing values (NaNs) in each column.
biography.isna().sum()
# Converts the 'ratings' column to numeric (float) values.
biography['ratings'] = pd.to_numeric(biography['ratings'])
# Fills missing values
biography= biography.fillna(value={
                'ratings':float(biography['ratings'].mean()),
                'duration_minutes':int(biography['duration_minutes'].mean())
                })
# Makes sure the duration_minutes column is of integer type (not float) — because duration is a whole number
biography['duration_minutes'] = biography['duration_minutes'].astype(int)
# Prints a summary of the DataFrame:
biography.info()
# Saves the cleaned DataFrame to a CSV file
biography.to_csv("biography.csv")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 483 entries, 0 to 482
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   movie_names       483 non-null    object 
 1   ratings           483 non-null    float64
 2   voting_counts     483 non-null    int64  
 3   duration_minutes  483 non-null    int64  
 4   genre             483 non-null    object 
dtypes: float64(1), int64(2), object(2)
memory usage: 19.0+ KB


In [None]:
#History

import time
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import numpy as np
import re
# Function to convert raw votes(eg: 2k) to integer(eg: 2000
def parse_votes(text):
    if not text:
        return 0
    text = text.strip("()").replace(",", "").upper()
    match = re.match(r"([\d\.]+)([KM]?)", text)
    if not match:
        return 0
    number, suffix = match.groups()
    number = float(number)
    if suffix == "K":
        return int(number * 1_000)
    elif suffix == "M":
        return int(number * 1_000_000)
    else:
        return int(number)
# Convert duration string(eg: 1h 50m) to minutes of integer datatype
def duration_to_minutes(duration_str):
    hours = minutes = 0
    if 'h' in duration_str:
        h_match = re.search(r'(\d+)\s*h', duration_str)
        if h_match:
            hours = int(h_match.group(1))
    if 'm' in duration_str:
        m_match = re.search(r'(\d+)\s*m', duration_str)
        if m_match:
            minutes = int(m_match.group(1))
    return hours * 60 + minutes

# Setup WebDriver
driver = webdriver.Chrome()
driver.get('https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&genres=history')
time.sleep(5)

# Click more button several times to display all the movies on the IMDb search results page
for _ in range(6):
    try:
        load_more = driver.find_element(By.XPATH, "//span[contains(text(),'more')]")
        driver.execute_script("arguments[0].click();", load_more)
        time.sleep(2.5)
    except:
        break

# Data collection
movie_details = {
    'movie_names': [],
    'ratings': [],
    'voting_counts': [],
    'duration_minutes': []
}
# Finds all movie blocks on the IMDb search results page
movie_blocks = driver.find_elements(By.CSS_SELECTOR, "li[class='ipc-metadata-list-summary-item']")

for block in movie_blocks:
# Title
    title = block.find_element(By.CSS_SELECTOR, 'h3').text.split('. ', 1)[-1].strip()
# Duration
    duration_str = np.nan
    spans = block.find_elements(By.TAG_NAME, "span")
    for span in spans:
        text = span.text.strip()
        if ('h' in text or 'm' in text) and any(char.isdigit() for char in text):
            duration_str = text
            break
    if isinstance(duration_str, str):
        duration_minutes = duration_to_minutes(duration_str)
    else:
        duration_minutes = np.nan
# Ratings
    try:
        rating = block.find_element(By.CSS_SELECTOR, "span[class ='ipc-rating-star--rating']").text.strip()
    except:
        rating = np.nan
# Votes
    try:
        vote_element = block.find_element(By.CSS_SELECTOR, "span[class ='ipc-rating-star--voteCount']")
        raw_votes_text = vote_element.text.strip()  # e.g. "246K"
        votes = parse_votes(raw_votes_text)
    except:
        votes = 0
# Store movie data and print it
    movie_details['movie_names'].append(title)
    movie_details['ratings'].append(rating)
    movie_details['voting_counts'].append(votes)
    movie_details['duration_minutes'].append(duration_minutes)

    print(f"{title} | Rating: {rating} | Votes: {votes} | Duration: {duration_minutes}")

print("Total movies scraped:", len(movie_details['movie_names']))


Small Things Like These | Rating: 6.7 | Votes: 26000 | Duration: 98
I'm Still Here | Rating: 8.2 | Votes: 111000 | Duration: 137
Saturday Night | Rating: 6.9 | Votes: 32000 | Duration: 109
The Apprentice | Rating: 7.1 | Votes: 59000 | Duration: 122
September 5 | Rating: 7.1 | Votes: 26000 | Duration: 95
Queer | Rating: 6.4 | Votes: 23000 | Duration: 137
The Return | Rating: 6.2 | Votes: 9800 | Duration: 116
The Six Triple Eight | Rating: 6.7 | Votes: 27000 | Duration: 127
The Girl with the Needle | Rating: 7.5 | Votes: 15000 | Duration: 123
Reagan | Rating: 6.1 | Votes: 11000 | Duration: 141
Blitz | Rating: 6.2 | Votes: 17000 | Duration: 120
William Tell | Rating: 5.7 | Votes: 2900 | Duration: 133
Joy | Rating: 7.1 | Votes: 9800 | Duration: 115
Bonhoeffer: Pastor. Spy. Assassin. | Rating: 6.2 | Votes: 3800 | Duration: 133
The Children's Train | Rating: 7.4 | Votes: 7600 | Duration: 106
Unfrosted | Rating: 5.5 | Votes: 35000 | Duration: 97
Vermiglio | Rating: 6.9 | Votes: 4600 | Duratio

In [None]:
# Data cleaning
import pandas as pd

history= pd.DataFrame(movie_details)
# Adds a new column named "genre" and sets every row's value to "history"
history['genre'] = "history"
# Checks for missing values (NaNs) in each column.
history.isna().sum()
# Converts the 'ratings' column to numeric (float) values.
history['ratings'] = pd.to_numeric(history['ratings'])
# Fills missing values
history= history.fillna(value={
                'ratings':float(history['ratings'].mean()),
                'duration_minutes':int(history['duration_minutes'].mean())
                })
# Makes sure the duration_minutes column is of integer type (not float) — because duration is a whole number
history['duration_minutes'] = history['duration_minutes'].astype(int)
# Prints a summary of the DataFrame:
history.info()
# Saves the cleaned DataFrame to a CSV file
history.to_csv("history.csv")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   movie_names       344 non-null    object 
 1   ratings           344 non-null    float64
 2   voting_counts     344 non-null    int64  
 3   duration_minutes  344 non-null    int64  
 4   genre             344 non-null    object 
dtypes: float64(1), int64(2), object(2)
memory usage: 13.6+ KB


In [1]:
#Sport

import time
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import numpy as np
import re

# Function to convert raw votes(eg: 2k) to integer(eg: 2000)
def parse_votes(text):
    if not text:
        return 0
    text = text.strip("()").replace(",", "").upper()
    match = re.match(r"([\d\.]+)([KM]?)", text)
    if not match:
        return 0
    number, suffix = match.groups()
    number = float(number)
    if suffix == "K":
        return int(number * 1_000)
    elif suffix == "M":
        return int(number * 1_000_000)
    else:
        return int(number)

# Convert duration string(eg: 1h 50m) to minutes of integer datatype
def duration_to_minutes(duration_str):
    hours = minutes = 0
    if 'h' in duration_str:
        h_match = re.search(r'(\d+)\s*h', duration_str)
        if h_match:
            hours = int(h_match.group(1))
    if 'm' in duration_str:
        m_match = re.search(r'(\d+)\s*m', duration_str)
        if m_match:
            minutes = int(m_match.group(1))
    return hours * 60 + minutes

# Setup WebDriver
driver = webdriver.Chrome()
driver.get('https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&genres=sport')
time.sleep(5)
# Click more button several times to display all the movies on the IMDb search results page
for _ in range(5):
    try:
        load_more = driver.find_element(By.XPATH, "//span[contains(text(),'more')]")
        driver.execute_script("arguments[0].click();", load_more)
        time.sleep(2.5)
    except:
        break

# Data collection
movie_details = {
    'movie_names': [],
    'ratings': [],
    'voting_counts': [],
    'duration_minutes': []
}
# Finds all movie blocks on the IMDb search results page
movie_blocks = driver.find_elements(By.CSS_SELECTOR, "li[class='ipc-metadata-list-summary-item']")

for block in movie_blocks:
# Title
    title = block.find_element(By.CSS_SELECTOR, 'h3').text.split('. ', 1)[-1].strip()
# Duration
    duration_str = np.nan
    spans = block.find_elements(By.TAG_NAME, "span")
    for span in spans:
        text = span.text.strip()
        if ('h' in text or 'm' in text) and any(char.isdigit() for char in text):
            duration_str = text
            break
    if isinstance(duration_str, str):
        duration_minutes = duration_to_minutes(duration_str)
    else:
        duration_minutes = np.nan
# Ratings
    try:
        rating = block.find_element(By.CSS_SELECTOR, "span[class ='ipc-rating-star--rating']").text.strip()
    except:
        rating = np.nan
# Votes
    try:
        vote_element = block.find_element(By.CSS_SELECTOR, "span[class ='ipc-rating-star--voteCount']")
        raw_votes_text = vote_element.text.strip()  # e.g. "246K"
        votes = parse_votes(raw_votes_text)
    except:
        votes = 0
# Store movie data and print it
    movie_details['movie_names'].append(title)
    movie_details['ratings'].append(rating)
    movie_details['voting_counts'].append(votes)
    movie_details['duration_minutes'].append(duration_minutes)

    print(f"{title} | Rating: {rating} | Votes: {votes} | Duration: {duration_minutes}")

print("Total movies scraped:", len(movie_details['movie_names']))


Challengers | Rating: 7.0 | Votes: 158000 | Duration: 131
Queen of the Ring | Rating: 6.4 | Votes: 2000 | Duration: 129
The Fire Inside | Rating: 6.7 | Votes: 2900 | Duration: 109
Under Paris | Rating: 5.2 | Votes: 37000 | Duration: 104
Young Woman and the Sea | Rating: 7.5 | Votes: 18000 | Duration: 129
Unstoppable | Rating: 6.8 | Votes: 4400 | Duration: 116
Arthur the King | Rating: 7.0 | Votes: 35000 | Duration: 107
Lal Salaam | Rating: 4.4 | Votes: 1400 | Duration: 150
Kill the Jockey | Rating: 6.4 | Votes: 1800 | Duration: 96
Chandu Champion | Rating: 7.8 | Votes: 34000 | Duration: 142
You Gotta Believe | Rating: 5.8 | Votes: 1400 | Duration: 104
Haikyu!! The Dumpster Battle | Rating: 7.7 | Votes: 6500 | Duration: 85
Rez Ball | Rating: 6.3 | Votes: 2600 | Duration: 111
Diving Into the Darkness | Rating: 8.9 | Votes: 88 | Duration: 96
Sweet Dreams | Rating: 5.4 | Votes: 1500 | Duration: 95
Race for Glory: Audi vs. Lancia | Rating: 5.8 | Votes: 5000 | Duration: 109
Maidaan | Rating:

In [4]:
# Data cleaning
import pandas as pd

sport= pd.DataFrame(movie_details)
# Adds a new column named "genre" and sets every row's value to "sport"
sport['genre'] = "sport"
# Checks for missing values (NaNs) in each column.
sport.isna().sum()
# Converts the 'ratings' column to numeric (float) values.
sport['ratings'] = pd.to_numeric(sport['ratings'])

# Fills missing values
sport= sport.fillna(value={
                'ratings':float(sport['ratings'].mean()),
                'duration_minutes':int(sport['duration_minutes'].mean())
                })
# Makes sure the duration_minutes column is of integer type (not float) — because duration is a whole number
sport['duration_minutes'] = sport['duration_minutes'].astype(int)
# Prints a summary of the DataFrame:
sport.info()
# Saves the cleaned DataFrame to a CSV file
sport.to_csv("sport.csv")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266 entries, 0 to 265
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   movie_names       266 non-null    object 
 1   ratings           266 non-null    float64
 2   voting_counts     266 non-null    int64  
 3   duration_minutes  266 non-null    int64  
 4   genre             266 non-null    object 
dtypes: float64(1), int64(2), object(2)
memory usage: 10.5+ KB


In [None]:
# Reading csv file into dataframe and concatenates multiple dataframes

path = r"E:/AI and ML/Project imdb/adventure.csv"
# index_col=0 :  Uses the first column (0th) of the CSV as the index
df1= pd.read_csv(path ,index_col=0)
path = r"E:/AI and ML/Project imdb/animation.csv"
df2= pd.read_csv(path ,index_col=0)
path = r"E:/AI and ML/Project imdb/biography.csv"
df3= pd.read_csv(path ,index_col=0)
path = r"E:/AI and ML/Project imdb/history.csv"
df4= pd.read_csv(path ,index_col=0)
path = r"E:/AI and ML/Project imdb/sport.csv"
df5= pd.read_csv(path ,index_col=0)

imdb =pd.concat([df1,df2,df3,df4,df5])
# Resets the index of the combined DataFrame. Discards the old index (from the CSV), so it won’t be added as a new column.
imdb = imdb.reset_index(drop=True)
imdb

Unnamed: 0,movie_names,ratings,voting_counts,duration_minutes,genre
0,Gladiator II,6.500000,246000,148,adventure
1,Dune: Part Two,8.500000,637000,166,adventure
2,Twisters,6.500000,174000,122,adventure
3,Kraven the Hunter,5.500000,61000,127,adventure
4,Deadpool & Wolverine,7.500000,503000,128,adventure
...,...,...,...,...,...
2120,11 Brothers: The Movie,6.891566,0,88,sport
2121,Kick Start,6.891566,0,88,sport
2122,Natural Selection Tour 2024,6.891566,0,88,sport
2123,Leuchtend Schwarz - Sturm und der Weg zum Double,6.891566,0,88,sport


In [40]:
imdb.to_csv("imdb.csv")


In [49]:
path = r"E:/AI and ML/Project imdb/imdb.csv"
imdb= pd.read_csv(path ,index_col=0)
imdb.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2125 entries, 0 to 2124
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   movie_names       2125 non-null   object 
 1   ratings           2125 non-null   float64
 2   voting_counts     2125 non-null   int64  
 3   duration_minutes  2125 non-null   int64  
 4   genre             2125 non-null   object 
dtypes: float64(1), int64(2), object(2)
memory usage: 99.6+ KB


In [None]:
#Connection to mysql server

import mysql.connector
connection = mysql.connector.connect(
    host="localhost",
    user="root",
    password="12345678",
   
)
cursor = connection.cursor()
cursor

<mysql.connector.cursor_cext.CMySQLCursor at 0x171e3739400>

In [44]:
query = "create database imdb"
cursor.execute(query)

In [50]:
query = "use imdb"
cursor.execute(query)

In [56]:
connection.commit()

In [54]:
query = "create table imdb(movie_names varchar(1000),ratings float, voting_counts int, duration_minutes int,genre varchar(100))"
cursor.execute(query)

In [None]:
# Inserting values inside imdb table

data = []
for index in imdb.index:
    row = imdb.loc[index].values
    row = (row[0]),float(row[1]),int(row[2]),int(row[3]),(row[4])
    print(row)
    data.append(row)
query = "insert into imdb values(%s,%s,%s,%s,%s)"
cursor.executemany(query,data)

('Gladiator II', 6.5, 246000, 148, 'adventure')
('Dune: Part Two', 8.5, 637000, 166, 'adventure')
('Twisters', 6.5, 174000, 122, 'adventure')
('Kraven the Hunter', 5.5, 61000, 127, 'adventure')
('Deadpool & Wolverine', 7.5, 503000, 128, 'adventure')
('Venom: The Last Dance', 6.0, 126000, 110, 'adventure')
('Mufasa: The Lion King', 6.6, 69000, 118, 'adventure')
('Moana 2', 6.6, 107000, 100, 'adventure')
('Flow', 7.9, 81000, 85, 'adventure')
('The Count of Monte-Cristo', 7.6, 37000, 178, 'adventure')
('Sonic the Hedgehog 3', 6.9, 63000, 110, 'adventure')
('Furiosa: A Mad Max Saga', 7.5, 289000, 148, 'adventure')
('Paddington in Peru', 6.7, 24000, 106, 'adventure')
('Freaky Tales', 6.2, 6400, 107, 'adventure')
('Inside Out 2', 7.5, 220000, 96, 'adventure')
('The Unholy Trinity', 6.069212962962963, 0, 93, 'adventure')
('Borderlands', 4.7, 50000, 101, 'adventure')
('Transformers One', 7.6, 53000, 104, 'adventure')
('Bad Boys: Ride or Die', 6.5, 102000, 115, 'adventure')
('Despicable Me 4', 

In [None]:
# Printing each row of imdb table
query = "select * from imdb"
cursor.execute(query)
for data in cursor:
    print(data)

('Gladiator II', 6.5, 246000, 148, 'adventure')
('Dune: Part Two', 8.5, 637000, 166, 'adventure')
('Twisters', 6.5, 174000, 122, 'adventure')
('Kraven the Hunter', 5.5, 61000, 127, 'adventure')
('Deadpool & Wolverine', 7.5, 503000, 128, 'adventure')
('Venom: The Last Dance', 6.0, 126000, 110, 'adventure')
('Mufasa: The Lion King', 6.6, 69000, 118, 'adventure')
('Moana 2', 6.6, 107000, 100, 'adventure')
('Flow', 7.9, 81000, 85, 'adventure')
('The Count of Monte-Cristo', 7.6, 37000, 178, 'adventure')
('Sonic the Hedgehog 3', 6.9, 63000, 110, 'adventure')
('Furiosa: A Mad Max Saga', 7.5, 289000, 148, 'adventure')
('Paddington in Peru', 6.7, 24000, 106, 'adventure')
('Freaky Tales', 6.2, 6400, 107, 'adventure')
('Inside Out 2', 7.5, 220000, 96, 'adventure')
('The Unholy Trinity', 6.06921, 0, 93, 'adventure')
('Borderlands', 4.7, 50000, 101, 'adventure')
('Transformers One', 7.6, 53000, 104, 'adventure')
('Bad Boys: Ride or Die', 6.5, 102000, 115, 'adventure')
('Despicable Me 4', 6.2, 67000