In [None]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import NMF, PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
import numpy as np


class WeatherRecommender:
    """
    Cluster a movie feature table and label every cluster with a weather type.

    The pipeline reduces the feature matrix (VarianceThreshold + PCA), fits
    KMeans, a cosine NearestNeighbors index and an NMF factorisation on it,
    and then combines the three models into a one-to-one cluster -> weather
    mapping.
    """

    # Genre indicator columns used by run_pipeline when none are supplied.
    DEFAULT_GENRE_COLUMNS = (
        'Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary',
        'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery',
        'Romance', 'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western', 'Unknown',
    )

    def __init__(self, file_path):
        """
        Initialize the Weather Recommender with a dataset.

        Parameters:
        - file_path: path of the CSV file loaded into ``self.data``.
        """
        self.data = pd.read_csv(file_path)
        self.scaler = StandardScaler()
        self.min_max_scaler = MinMaxScaler()
        self.n_clusters = 9
        self.weather_cluster_mapping = None
        self.integrated_mapping = None
        self.reduced_features = None
        # Fitted models / factors; populated by the train_* methods.
        self.kmeans = None
        self.nn_model = None
        self.nmf = None
        self.weather_factors = None

    def reduce_features(self, variance_threshold, n_components):
        """
        Reduce features using VarianceThreshold and PCA.

        The full feature set gives a very noisy weather mapping, so
        low-variance columns are dropped and the remainder is projected onto
        ``n_components`` principal components.

        Parameters:
        - variance_threshold: minimum variance a column needs to be kept.
        - n_components: number of PCA components to keep.

        Returns:
        - The reduced feature matrix (also stored in ``self.reduced_features``).
        """
        print("Reducing features...")

        # A previous run adds a 'Cluster' label column to self.data; it must
        # not be fed back in as an input feature when the pipeline is re-run.
        feature_frame = self.data.drop(columns=['Cluster'], errors='ignore')

        # Variance Threshold
        selector = VarianceThreshold(threshold=variance_threshold)
        reduced_data = selector.fit_transform(feature_frame)

        # PCA for dimensionality reduction
        pca = PCA(n_components=n_components, random_state=42)
        reduced_data = pca.fit_transform(reduced_data)

        print(f"Reduced dataset shape: {reduced_data.shape}")
        self.reduced_features = reduced_data
        return reduced_data

    def train_kmeans(self):
        """
        Train KMeans clustering model.

        Stores the fitted model in ``self.kmeans`` and writes the cluster
        label of every row into ``self.data['Cluster']``.
        """
        print("Training KMeans clustering...")
        kmeans = KMeans(n_clusters=self.n_clusters, random_state=42, n_init=10)
        clusters = kmeans.fit_predict(self.reduced_features)
        self.data['Cluster'] = clusters
        self.kmeans = kmeans

    def train_nearest_neighbors(self):
        """
        Train Nearest Neighbors model (10 neighbours, cosine metric) on the
        reduced features and store it in ``self.nn_model``.
        """
        print("Training Nearest Neighbors model...")
        nn_model = NearestNeighbors(n_neighbors=10, metric='cosine')
        nn_model.fit(self.reduced_features)
        self.nn_model = nn_model

    def train_nmf(self):
        """
        Train Non-Negative Matrix Factorization (NMF) model.

        Stores the per-row factor matrix in ``self.weather_factors`` and the
        fitted model in ``self.nmf``.
        """
        print("Training Matrix Factorization (NMF) model...")

        # Scale to non-negative for NMF
        non_negative_data = self.min_max_scaler.fit_transform(self.reduced_features)

        nmf = NMF(n_components=self.n_clusters, random_state=42, max_iter=300)
        self.weather_factors = nmf.fit_transform(non_negative_data)
        self.nmf = nmf

    def _assign_leftovers(self, mapping, weather_columns):
        """
        Pair weather types missing from ``mapping`` with clusters missing from it.

        Pairing follows ``weather_columns`` order and ascending cluster id so
        that the result is reproducible (iterating over sets of strings is
        not stable between interpreter runs).

        Returns:
        - list of ``(weather, cluster)`` pairs that were added to ``mapping``.
        """
        taken_weather = set(mapping.values())
        free_weather = [w for w in dict.fromkeys(weather_columns) if w not in taken_weather]
        free_clusters = [c for c in range(self.n_clusters) if c not in mapping]

        added = list(zip(free_weather, free_clusters))
        for weather, cluster in added:
            mapping[cluster] = weather
        return added

    def assign_weather_to_clusters(self, weather_columns):
        """
        Assign weather types to clusters based on frequency within clusters.

        Each cluster first claims its most frequent weather type unless an
        earlier cluster already claimed it; remaining weather types are then
        handed to the remaining clusters. The result is stored in
        ``self.weather_cluster_mapping``.

        Parameters:
        - weather_columns: names of the weather columns in ``self.data``.
        """
        print("\nMapping weather to clusters...")
        # Per-cluster totals of every weather column; clusters without rows get 0.
        cluster_weather = (
            self.data.groupby('Cluster')[list(weather_columns)]
            .sum()
            .reindex(range(self.n_clusters), fill_value=0)
        )

        # Initial Mapping
        print("Creating initial weather-cluster mapping...")
        weather_cluster_mapping = {}
        assigned_weather = set()

        for cluster, most_common_weather in cluster_weather.idxmax(axis=1).items():
            if most_common_weather not in assigned_weather:
                weather_cluster_mapping[int(cluster)] = most_common_weather
                assigned_weather.add(most_common_weather)

        # Handle Unassigned Weather Types
        unassigned_weather = set(weather_columns) - assigned_weather
        print(f"Unassigned Weather Types: {unassigned_weather}")
        for weather, cluster in self._assign_leftovers(weather_cluster_mapping, weather_columns):
            print(f"Assigned {weather} to Cluster {cluster}")

        self.weather_cluster_mapping = weather_cluster_mapping

    def integrate_models(self, weather_columns):
        """
        Integrate results from KMeans, Nearest Neighbors, and NMF to refine the mapping.

        Every (cluster, weather) pair collects a score:
        - +2 when the frequency-based mapping already pairs them,
        - +1 for each nearest neighbour (of a row tagged with that weather)
          that belongs to the cluster,
        - +1 for the cluster with the highest mean loading on the NMF
          component paired with that weather.

        Clusters then pick their best still-available weather in cluster
        order. The result is stored in ``self.integrated_mapping``.

        Parameters:
        - weather_columns: names of the weather columns in ``self.data``.
        """
        print("\nIntegrating results from all models...")
        cluster_scores = {
            cluster: dict.fromkeys(weather_columns, 0)
            for cluster in range(self.n_clusters)
        }

        # KMeans weights
        for cluster, weather in self.weather_cluster_mapping.items():
            cluster_scores[cluster][weather] += 2

        # Positional cluster labels: kneighbors returns row positions, not labels.
        cluster_labels = self.data['Cluster'].to_numpy(dtype=int)

        # Nearest Neighbors contributions
        for weather in weather_columns:
            positions = np.flatnonzero(self.data[weather].to_numpy() > 0)
            if positions.size == 0:
                continue
            _, neighbor_idx = self.nn_model.kneighbors(self.reduced_features[positions])
            votes = np.bincount(cluster_labels[neighbor_idx.ravel()], minlength=self.n_clusters)
            for cluster, count in enumerate(votes):
                if count:
                    cluster_scores[cluster][weather] += int(count)

        # NMF contributions
        # NOTE(review): component i is paired with weather_columns[i] purely by
        # position, as in the original code - confirm this pairing is intended.
        cluster_sizes = np.bincount(cluster_labels, minlength=self.n_clusters)
        n_components = self.weather_factors.shape[1]
        for weather_idx, weather in enumerate(weather_columns):
            if weather_idx >= n_components:
                break  # more weather types than NMF components
            # Mean loading of this component inside every cluster. Averaging
            # the whole column to a single scalar (as done previously) made
            # argmax always return cluster 0.
            loading_sums = np.bincount(
                cluster_labels,
                weights=self.weather_factors[:, weather_idx],
                minlength=self.n_clusters,
            )
            mean_loading = np.divide(
                loading_sums,
                cluster_sizes,
                out=np.zeros_like(loading_sums),
                where=cluster_sizes > 0,
            )
            dominant_cluster = int(np.argmax(mean_loading))
            cluster_scores[dominant_cluster][weather] += 1

        # Finalize Mapping
        final_mapping = {}
        assigned_weather = set()

        for cluster in range(self.n_clusters):
            # Stable sort: ties keep the order of weather_columns.
            ranked = sorted(cluster_scores[cluster].items(), key=lambda item: item[1], reverse=True)
            for weather, _ in ranked:
                if weather not in assigned_weather:
                    final_mapping[cluster] = weather
                    assigned_weather.add(weather)
                    break

        # Handle Unassigned
        self._assign_leftovers(final_mapping, weather_columns)

        self.integrated_mapping = final_mapping
        print("\nFinal Integrated Weather-Cluster Mapping:")
        for cluster, weather in self.integrated_mapping.items():
            print(f"Cluster {cluster}: {weather}")

    def run_pipeline(self, weather_columns, variance_threshold=0.005, n_components=70,
                     genre_columns=None):
        """
        Full pipeline: reduce features, train models, and map weather to clusters.

        Parameters:
        - weather_columns: names of the weather columns in ``self.data``.
        - variance_threshold: threshold passed to ``reduce_features``.
        - n_components: number of PCA components passed to ``reduce_features``.
        - genre_columns: genre indicator columns; defaults to
          ``DEFAULT_GENRE_COLUMNS``.

        Returns:
        - pd.DataFrame with a 'Genres' column (list of genres whose indicator
          is > 0) and a 'Weather' column (weather mapped from the row's cluster).
        """
        reduced_features = self.reduce_features(variance_threshold, n_components)
        self.reduced_features = self.scaler.fit_transform(reduced_features)

        self.train_kmeans()
        self.train_nearest_neighbors()
        self.train_nmf()
        self.assign_weather_to_clusters(weather_columns)
        self.integrate_models(weather_columns)

        # Generate final output with genres and weather
        if genre_columns is None:
            genre_columns = self.DEFAULT_GENRE_COLUMNS
        genre_columns = list(genre_columns)

        genre_flags = self.data[genre_columns].to_numpy() > 0
        genres = [
            [genre for genre, present in zip(genre_columns, row) if present]
            for row in genre_flags
        ]

        final_df = pd.DataFrame(index=self.data.index)
        final_df['Genres'] = pd.Series(genres, index=self.data.index, dtype=object)
        final_df['Weather'] = self.data['Cluster'].map(self.integrated_mapping)

        return final_df



# Example run: cluster the movie dataset and label each cluster with a weather type.
weather_columns = [
    "Clear Sky",
    "Few Clouds",
    "Scattered Clouds",
    "Broken Clouds",
    "Shower Rain",
    "Rain",
    "Thunderstorm",
    "Snow",
    "Mist",
]
file_path = '/content/drive/MyDrive/reduced_cleaned.csv'

recommender = WeatherRecommender(file_path)
final_df = recommender.run_pipeline(weather_columns)

# Preview the first rows of the resulting genre/weather table.
preview = final_df.head()
print(preview)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Reducing features...
Reduced dataset shape: (10057, 70)
Training KMeans clustering...
Training Nearest Neighbors model...
Training Matrix Factorization (NMF) model...

Mapping weather to clusters...
Creating initial weather-cluster mapping...
Unassigned Weather Types: {'Rain'}
Assigned Rain to Cluster 7

Integrating results from all models...

Final Integrated Weather-Cluster Mapping:
Cluster 0: Few Clouds
Cluster 1: Mist
Cluster 2: Broken Clouds
Cluster 3: Scattered Clouds
Cluster 4: Clear Sky
Cluster 5: Snow
Cluster 6: Thunderstorm
Cluster 7: Shower Rain
Cluster 8: Rain
                     Genres    Weather
0                   [Drama]  Clear Sky
1   [Comedy, Drama, Family]  Clear Sky
2             [Documentary]  Clear Sky
3   [Comedy, Drama, Family]  Clear Sky
4  [Comedy, Drama, Romance]  Clear Sky


In [None]:
from collections import Counter

def analyze_top_genres_by_weather(data, weather_column, genres_column, top_n=3):
    """
    Analyze and return the most frequent genres for each weather type.

    Parameters:
    - data: pd.DataFrame, the dataset with weather and genres columns.
    - weather_column: str, the column with final weather mapping.
    - genres_column: str, the column containing genre lists.
    - top_n: int, how many genres to keep per weather type (default 3).

    Returns:
    - dict: A dictionary mapping each weather type to its ``top_n`` most
      frequent genres, most frequent first. Weather types appear in order of
      first occurrence; rows without a weather label are ignored.
    """
    # Rows with a missing weather label cannot be attributed to any weather
    # type, so drop them instead of emitting an empty entry keyed by NaN.
    labelled = data.dropna(subset=[weather_column])

    # Single pass over the rows: one Counter of genre occurrences per weather.
    genre_counts = {}
    for weather, genres_list in zip(labelled[weather_column], labelled[genres_column]):
        genre_counts.setdefault(weather, Counter()).update(genres_list)

    return {
        weather: [genre for genre, _ in counts.most_common(top_n)]
        for weather, counts in genre_counts.items()
    }

# Columns of final_df consumed by the analysis.
genres_column = "Genres"    # Column with genre lists
weather_column = "Weather"  # Column with final weather mapping

# Top genres for each weather type
top_genres_by_weather = analyze_top_genres_by_weather(final_df, weather_column, genres_column)

# Report one line per weather type.
print("Top Genres by Weather\n")
for weather in top_genres_by_weather:
    genres = top_genres_by_weather[weather]
    print(f"Weather: {weather} - Top Genres: {', '.join(genres)}")

Top Genres by Weather

Weather: Clear Sky - Top Genres: Comedy, Drama, Romance
Weather: Shower Rain - Top Genres: Drama, Documentary, Horror
Weather: Snow - Top Genres: Western, Drama, Adventure
Weather: Few Clouds - Top Genres: Comedy, Drama, Romance
Weather: Scattered Clouds - Top Genres: Drama, Thriller, Comedy
Weather: Broken Clouds - Top Genres: Drama, Horror, Thriller
Weather: Rain - Top Genres: Drama, Comedy, Documentary
Weather: Thunderstorm - Top Genres: Drama, Thriller, Horror
Weather: Mist - Top Genres: Horror, Drama, Thriller


In [None]:
# Load the full movie table that will receive the weather labels.
data_path = '/content/drive/MyDrive/MovieWithWeatherV3.csv'
movie_df = pd.read_csv(data_path)

# Attach the mapped weather by row index.
# NOTE(review): assumes movie_df rows line up with the rows of the clustered dataset - confirm.
final_df = movie_df.merge(final_df['Weather'], how='left', left_index=True, right_index=True)

# Drop the per-weather columns now that the single 'Weather' label is attached.
final_df = final_df.drop(columns=weather_columns)

final_df.head()

Unnamed: 0,title,overview,release_date,runtime,genres,status,original_language,tagline,popularity,vote_average,...,director,producer,cinematographer,poster,keywords,production_companies,production_countries,budget,revenue,Weather
0,Radio,"In the racially divided town of Anderson, Sout...",2003-10-24,109,['Drama'],Released,en,His courage made them champions.,26.728,7.2,...,Michael Tollin,Brian Robbins,Don Burgess,https://image.tmdb.org/t/p/w500/uQ6ci4iFHhB6TW...,"['mentally disabled', 'high school', 'friendsh...","['Tollin/Robbins Productions', 'Revolution Stu...",['United States of America'],35000000,53293628,Clear Sky
1,Porch Pirates,"In ""Porch Pirates,"" three detectives—Jack Tyle...",2024-11-28,107,"['Comedy', 'Family', 'Drama']",In Production,en,They know when you are sleeping. The know whe...,6.683,0.0,...,Jeff Hamm,Ken Ray Monts,,https://image.tmdb.org/t/p/w500/6pimaWdX5hnno4...,"['chase', 'holiday', 'mayor', 'thanksgiving', ...","['Ichthys Films', 'Pine Line Studios', 'Red Pl...",[],0,0,Clear Sky
2,On the Sly: In Search of the Family Stone,"One man's search for the prolific funk legend,...",2017-01-22,104,['Documentary'],Released,en,,3.304,0.0,...,Michael Rubenstone,Patrick Sheehan,,https://image.tmdb.org/t/p/w500/pwRHW2onfOn1CU...,"['drug abuse', '1970s', 'search', 'rhythm and ...",[],['United States of America'],0,0,Clear Sky
3,Chasing Ghosts,"Lucas Simons, an 11-year-old filmmaker, is obs...",2014-04-06,93,"['Family', 'Comedy', 'Drama']",Released,en,Death is overrated.,2.853,6.4,...,Joshua Shreve,Molly M. Mayeux,,https://image.tmdb.org/t/p/w500/5uTRycUM5MjTH8...,"['friendship', 'life and death', 'loss of love...",['Ichthys Films'],['United States of America'],0,0,Clear Sky
4,Clocking The T,Dave is a professional internet troll who lies...,2024-07-11,105,"['Comedy', 'Romance', 'Drama']",Released,en,A Romantic Comedy With Romance Issues,2.662,0.0,...,Michael Thibault,Erika Thibault,Michael Thibault,https://image.tmdb.org/t/p/w500/iSpwGXX7v0Jh13...,"['atlanta', 'romantic comedy', 'behind the sce...",['Visible Evidence LLC'],[],88000,0,Clear Sky


In [None]:
# Persist the weather-labelled movie table without the index column.
output_path = '/content/drive/MyDrive/data_weather_mapped.csv'
final_df.to_csv(output_path, index=False)