In [163]:
import pandas as pd

# Load the dataset
df = pd.read_csv("MovieWithWeatherV3.csv", quotechar='"', on_bad_lines='skip', engine="python")

pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_rows', 20)      # Set to a high number, adjust as needed


In [164]:
print("DataFrame shape (rows, columns):", df.shape)
# Check for missing values in each column
missing_values = df.isnull().sum()
print("Missing values per column:\n", missing_values)


DataFrame shape (rows, columns): (10057, 30)
Missing values per column:
 title             0
overview        383
release_date    643
runtime           0
genres            0
               ... 
Shower Rain       0
Rain              0
Thunderstorm      0
Snow              0
Mist              0
Length: 30, dtype: int64


In [165]:
dropped_df = df.drop(["status", "original_language","poster", "revenue" , "budget" ,"production_countries", "release_date", "runtime" , "cinematographer" ,"producer"],  axis=1)


In [166]:
missing_genres_rows = dropped_df[dropped_df['genres'].isna()]
print("Rows with missing 'genres':")
print(missing_genres_rows)
dropped_df = dropped_df.dropna(subset=['genres'])


Rows with missing 'genres':
Empty DataFrame
Columns: [title, overview, genres, tagline, popularity, vote_average, vote_count, cast, director, keywords, production_companies, Clear Sky, Few Clouds, Scattered Clouds, Broken Clouds, Shower Rain, Rain, Thunderstorm, Snow, Mist]
Index: []


In [167]:
missing_values = dropped_df.isnull().sum()

# Print the number of missing values per column
with pd.option_context('display.max_rows', None):
    print("Missing values per column:\n", missing_values)
print("DataFrame shape (rows, columns):", dropped_df.shape)


Missing values per column:
 title                      0
overview                 383
genres                     0
tagline                 4063
popularity                 0
vote_average               0
vote_count                 0
cast                       0
director                 230
keywords                   0
production_companies       0
Clear Sky                  0
Few Clouds                 0
Scattered Clouds           0
Broken Clouds              0
Shower Rain                0
Rain                       0
Thunderstorm               0
Snow                       0
Mist                       0
dtype: int64
DataFrame shape (rows, columns): (10057, 20)


In [168]:
import numpy as np
import pandas as pd
import ast


def fill_in_nulls(df):
    # Fill scalar values
    df['overview'] = df['overview'].fillna("None")
    df['tagline'] = df['tagline'].fillna("None")
    df['vote_count'] = df['vote_count'].fillna(0)
    df['director'] = df['director'].fillna("Unknown")

    # Define helper function for replacing NaN or empty lists
    
    # Fill weather association columns with 0
    weather_columns = ["Clear Sky", "Few Clouds", "Scattered Clouds", "Broken Clouds", 
                       "Shower Rain", "Rain", "Thunderstorm", "Snow", "Mist"]
    df[weather_columns] = df[weather_columns].fillna(0)

    return df

# Apply the function
filled_df = fill_in_nulls(dropped_df)


In [169]:
missing_values = filled_df.isnull().sum()


missing_values_df = missing_values.reset_index()
missing_values_df.columns = ['Column', 'Missing Values']

# Display all rows without truncation
with pd.option_context('display.max_rows', None):
    print("Missing values per column:\n", missing_values_df)
print("DataFrame shape (rows, columns):", filled_df.shape)
print(filled_df)

Missing values per column:
                   Column  Missing Values
0                  title               0
1               overview               0
2                 genres               0
3                tagline               0
4             popularity               0
5           vote_average               0
6             vote_count               0
7                   cast               0
8               director               0
9               keywords               0
10  production_companies               0
11             Clear Sky               0
12            Few Clouds               0
13      Scattered Clouds               0
14         Broken Clouds               0
15           Shower Rain               0
16                  Rain               0
17          Thunderstorm               0
18                  Snow               0
19                  Mist               0
DataFrame shape (rows, columns): (10057, 20)
                                            title  \
0            

In [None]:
import ast
def CleanForExtraction(df):
    df['genres'] = df['genres'].astype(str)

    # Step 2: Use ast.literal_eval to convert back to lists with error reporting
    def parse_genre_list(entry, index):
        try:
            # Convert string representation of list back to list
            parsed_entry = ast.literal_eval(entry)
            # Return the parsed entry, or ["Unknown"] if it’s an empty list
            return parsed_entry if parsed_entry else ["Unknown"]
        except (ValueError, SyntaxError):
            # Print the row index and the problematic entry
            print(f"Error parsing 'genres' at row {index}: {entry}")
            # Return ["Unknown"] or other default as needed
            return ["Unknown"]

# Apply the function with index reporting
    df['genres'] = df.apply(lambda row: parse_genre_list(row['genres'], row.name), axis=1)

    df['genres'] = df['genres'].apply(lambda x: ["Unknown"] if x == [] else x)

   

# Function to convert string representation of cast to a list of actor names with error reporting
    def parse_cast(entry, index):
        if isinstance(entry, str) and entry.startswith('['):  # Check if it's a string starting with [
            try:
                # Parse the string to a list of dictionaries
                cast_list = ast.literal_eval(entry)
                # Extract names from each dictionary in the list
                return [actor['name'] for actor in cast_list if 'name' in actor]
            except (ValueError, SyntaxError):
                # Print the row index and problematic entry if parsing fails
                print(f"Parsing failed for 'cast' at row {index}: {entry}")
                return ["Unknown"]
        elif entry == []:
            # Handle empty lists
            return ["Unknown"]
        return entry  # Return as is if it’s already in the correct format

    # Apply the function with row index for error reporting
    df['cast'] = df.apply(lambda row: parse_cast(row['cast'], row.name), axis=1)

    df['keywords'] = df['keywords'].astype(str)

    # Step 2: Use ast.literal_eval to convert back to lists with error reporting
    def parse_keyword_list(entry, index):
        try:
            # Convert string representation of list back to list
            parsed_entry = ast.literal_eval(entry)
            # Return the parsed entry, or ["Unknown"] if it’s an empty list
            return parsed_entry if parsed_entry else ["Unknown"]
        except (ValueError, SyntaxError):
            # Print the row index and the problematic entry
            print(f"Error parsing 'genres' at row {index}: {entry}")
            # Return ["Unknown"] or other default as needed
            return ["Unknown"]

# Apply the function with index reporting
    df['keywords'] = df.apply(lambda row: parse_keyword_list(row['keywords'], row.name), axis=1)

    df['keywords'] = df['keywords'].apply(lambda x: ["Unknown"] if x == [] else x)


    df['production_companies'] = df['production_companies'].astype(str)
    def parse_production_list(entry, index):
        try:
            # Convert string representation of list back to list
            parsed_entry = ast.literal_eval(entry)
            # Return the parsed entry, or ["Unknown"] if it’s an empty list
            return parsed_entry if parsed_entry else ["Unknown"]
        except (ValueError, SyntaxError):
            # Print the row index and the problematic entry
            print(f"Error parsing 'genres' at row {index}: {entry}")
            # Return ["Unknown"] or other default as needed
            return ["Unknown"]

# Apply the function with index reporting
    df['production_companies'] = df.apply(lambda row: parse_production_list(row['production_companies'], row.name), axis=1)

    df['production_companies'] = df['production_companies'].apply(lambda x: ["Unknown"] if x == [] else x)





    return df 

filled_df = CleanForExtraction(filled_df)

In [171]:
print(filled_df is None)  # This should print False if `filled_df` exists


False


In [172]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Apply TF-IDF to the 'overview' column
tfidf_overview = TfidfVectorizer(max_features=200 , ngram_range=(1, 3))  # Adjust max_features as needed
overview_tfidf = tfidf_overview.fit_transform(filled_df['overview'])

# Add the TF-IDF result to the DataFrame
filled_df = filled_df.join(pd.DataFrame(overview_tfidf.toarray(), columns=tfidf_overview.get_feature_names_out()))

# Drop the original 'overview' column
filled_df.drop(columns=['overview'], inplace=True)

In [173]:
print("DataFrame shape (rows, columns):", filled_df.shape)
print(filled_df.head())
empty_genres = filled_df['genres'].apply(lambda x: len(x) == 0)
print(filled_df[empty_genres])

DataFrame shape (rows, columns): (10057, 219)
                                       title                    genres  \
0                                      Radio                   [Drama]   
1                              Porch Pirates   [Comedy, Family, Drama]   
2  On the Sly: In Search of the Family Stone             [Documentary]   
3                             Chasing Ghosts   [Family, Comedy, Drama]   
4                             Clocking The T  [Comedy, Romance, Drama]   

                                             tagline  popularity  \
0                   His courage made them champions.      26.728   
1  They know when you are sleeping.  The know whe...       6.683   
2                                               None       3.304   
3                                Death is overrated.       2.853   
4              A Romantic Comedy With Romance Issues       2.662   

   vote_average  vote_count  \
0           7.2         712   
1           0.0           0   
2      

In [174]:
import ast
from sklearn.preprocessing import MultiLabelBinarizer


# Convert genre lists to multi-hot encoding
mlb_genres = MultiLabelBinarizer()
genres_encoded = mlb_genres.fit_transform(filled_df['genres'])

# Add the encoded genres to the DataFrame
encoded_genres_df = pd.DataFrame(genres_encoded, columns=mlb_genres.classes_)
filled_df = filled_df.join(encoded_genres_df)

# Drop the original 'genres' column to avoid redundancy
filled_df.drop(columns=['genres'], inplace=True)


In [175]:
print(filled_df)

                                            title  \
0                                           Radio   
1                                   Porch Pirates   
2       On the Sly: In Search of the Family Stone   
3                                  Chasing Ghosts   
4                                  Clocking The T   
...                                           ...   
10052                           The Shadow People   
10053  The Hat Man: Documented Cases of Pure Evil   
10054                                     A Febre   
10055      The Things With The Glowing Green Eyes   
10056         Sealed Video 15: Ruined Dead Spirit   

                                                 tagline  popularity  \
0                       His courage made them champions.      26.728   
1      They know when you are sleeping.  The know whe...       6.683   
2                                                   None       3.304   
3                                    Death is overrated.       2.853   
4  

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
from collections import Counter
import pickle
import pandas as pd

# Step 1: Determine top actors and fit MultiLabelBinarizer
# Flatten all actors in a single list to get their frequencies
all_actors = [actor for sublist in df['cast'] for actor in sublist if actor != "Unknown"]
top_actors = [actor for actor, count in Counter(all_actors).most_common(100)]  # Top 100 actors

# Filter cast to only include top actors
df['top_cast'] = df['cast'].apply(lambda actors: [actor for actor in actors if actor in top_actors])

# Fit MultiLabelBinarizer and save it
mlb_cast = MultiLabelBinarizer(classes=top_actors)
cast_encoded = mlb_cast.fit_transform(df['top_cast'])
df = df.join(pd.DataFrame(cast_encoded, columns=[f"cast_{actor}" for actor in mlb_cast.classes_]))

# Drop the original 'cast' and 'top_cast' columns
df.drop(columns=['cast', 'top_cast'], inplace=True)

# Save the fitted MultiLabelBinarizer to use on new data
with open('mlb_cast.pkl', 'wb') as f:
    pickle.dump(mlb_cast, f)

# Check the result
print(df.head())


                                       title  \
0                                      Radio   
1                              Porch Pirates   
2  On the Sly: In Search of the Family Stone   
3                             Chasing Ghosts   
4                             Clocking The T   

                                            overview release_date  runtime  \
0  In the racially divided town of Anderson, Sout...   2003-10-24      109   
1  In "Porch Pirates," three detectives—Jack Tyle...   2024-11-28      107   
2  One man's search for the prolific funk legend,...   2017-01-22      104   
3  Lucas Simons, an 11-year-old filmmaker, is obs...   2014-04-06       93   
4  Dave is a professional internet troll who lies...   2024-07-11      105   

                           genres         status original_language  \
0                       ['Drama']       Released                en   
1   ['Comedy', 'Family', 'Drama']  In Production                en   
2                 ['Documentary'

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert keywords list to a string for each movie
df['keywords_str'] = df['keywords'].apply(lambda x: ' '.join(eval(x)) if isinstance(x, str) else '')

# Apply TF-IDF vectorization
tfidf = TfidfVectorizer(max_features=200)  # Adjust max_features based on your needs
keywords_tfidf = tfidf.fit_transform(df['keywords_str'])

# Convert TF-IDF matrix to DataFrame and join
df = df.join(pd.DataFrame(keywords_tfidf.toarray(), columns=tfidf.get_feature_names_out()))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Convert production companies list to a single string for each movie
df['companies_str'] = df['production_companies'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')

# Step 2: Apply TF-IDF Vectorization
# Set max_features to limit the dimensionality (e.g., 100 most relevant companies)
tfidf_vectorizer = TfidfVectorizer(max_features=200)  # Adjust max_features based on needs
companies_tfidf = tfidf_vectorizer.fit_transform(df['companies_str'])

# Step 3: Convert TF-IDF matrix to DataFrame and join with the original DataFrame
df_tfidf = pd.DataFrame(companies_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
df = df.join(df_tfidf)

# Step 4: Drop the original 'production_companies' and 'companies_str' columns
df.drop(columns=['production_companies', 'companies_str'], inplace=True)
