In [4]:
# first solution - 

import zipfile


file_path = r'C:\Users\akank\Downloads\ml-1m.zip'

# Accumulators filled while streaming the two archive members.
rating_distribution = dict()  # bucket label ('High' / 'Medium' / 'Low') -> number of ratings
movieId_count = dict()        # movie id -> number of ratings it received
movieId_name = dict()         # movie id -> title

with zipfile.ZipFile(file_path, 'r') as zip_ref:
    # Each member is opened in its own `with` block so its handle is closed as
    # soon as it has been read instead of lingering in the notebook namespace.
    with zip_ref.open('ml-1m/ratings.dat') as ratings_file:
        for line in ratings_file:
            line = line.decode('latin-1').strip()  # Decode using Latin-1
            if not line:
                continue  # tolerate blank / trailing lines
            columns = list(map(int, line.split('::')))
            movie_id, rating = columns[1], columns[2]

            # Bucket the star rating: 5 -> High, 3 or 4 -> Medium, anything else -> Low.
            if rating == 5:
                bucket = 'High'
            elif rating in (3, 4):
                bucket = 'Medium'
            else:
                bucket = 'Low'

            movieId_count[movie_id] = movieId_count.get(movie_id, 0) + 1
            rating_distribution[bucket] = rating_distribution.get(bucket, 0) + 1

    with zip_ref.open('ml-1m/movies.dat') as movies_file:
        for line in movies_file:
            line = line.decode('latin-1').strip()
            if not line:
                continue
            columns = line.split('::')
            movieId_name[int(columns[0])] = columns[1]

# Share of each bucket; the grand total is computed once, outside the loop.
total_ratings = sum(rating_distribution.values())
for rating_range, bucket_count in rating_distribution.items():
    percentage = bucket_count / total_ratings * 100
    print(f'{rating_range}: {percentage:.2f}%')

# Ten movies with the most ratings (ties keep first-seen order because the sort is stable).
sorted_counted_data = sorted(movieId_count.items(), key=lambda x: x[1], reverse=True)[:10]
for movieId, count in sorted_counted_data:
    # A rated movie that is absent from movies.dat is reported by id instead of raising KeyError.
    print(movieId_name.get(movieId, f'<unknown movie {movieId}>'), count)


High: 22.63%
Medium: 61.00%
Low: 16.37%
American Beauty (1999) 3428
Star Wars: Episode IV - A New Hope (1977) 2991
Star Wars: Episode V - The Empire Strikes Back (1980) 2990
Star Wars: Episode VI - Return of the Jedi (1983) 2883
Jurassic Park (1993) 2672
Saving Private Ryan (1998) 2653
Terminator 2: Judgment Day (1991) 2649
Matrix, The (1999) 2590
Back to the Future (1985) 2583
Silence of the Lambs, The (1991) 2578


In [5]:
# second execution - genre Insights

import zipfile
from collections import defaultdict

file_path = r'C:\Users\akank\Downloads\ml-1m.zip'

movie_ratings = defaultdict(list)  # movie id -> every rating given to that movie
genre_counts = defaultdict(int)    # genre -> number of ratings on movies tagged with it
genre_ratings = defaultdict(list)  # genre -> every rating on movies tagged with it
movie_genres = {}                  # movie id -> list of genre names

with zipfile.ZipFile(file_path, 'r') as zip_ref:
    # Members are opened with `with` so their handles are closed once read.
    with zip_ref.open('ml-1m/ratings.dat') as ratings_file:
        for line in ratings_file:
            line = line.decode('latin-1').strip()
            if not line:
                continue  # tolerate blank / trailing lines
            userId, movieId, rating, timestamp = map(int, line.split('::'))
            movie_ratings[movieId].append(rating)

    with zip_ref.open('ml-1m/movies.dat') as movies_file:
        for line in movies_file:
            line = line.decode('latin-1').strip()
            if not line:
                continue
            movieId, title, genres = line.split('::')
            movieId = int(movieId)
            genre_list = genres.split('|')  # split once, reuse below
            movie_genres[movieId] = genre_list

            # `in` does not create an entry in the defaultdict, so unrated
            # movies are skipped without polluting movie_ratings.
            if movieId in movie_ratings:
                ratings_for_movie = movie_ratings[movieId]
                for genre in genre_list:
                    # A movie with several genres contributes its ratings to each of them.
                    genre_counts[genre] += len(ratings_for_movie)
                    genre_ratings[genre].extend(ratings_for_movie)


print("Most Frequently Rated Genres:")
sorted_genre_counts = sorted(genre_counts.items(), key=lambda x: x[1], reverse=True)
for genre, count in sorted_genre_counts:
    print(f"{genre}: {count} ratings")


print("\nAverage Ratings by Genre:")
# Every list in genre_ratings is non-empty: entries are only created for rated movies.
genre_avg_ratings = {genre: sum(ratings) / len(ratings) for genre, ratings in genre_ratings.items()}
sorted_avg_ratings = sorted(genre_avg_ratings.items(), key=lambda x: x[1], reverse=True)
for genre, avg_rating in sorted_avg_ratings:
    print(f"{genre}: {avg_rating:.2f}")


Most Frequently Rated Genres:
Comedy: 356580 ratings
Drama: 354529 ratings
Action: 257457 ratings
Thriller: 189680 ratings
Sci-Fi: 157294 ratings
Romance: 147523 ratings
Adventure: 133953 ratings
Crime: 79541 ratings
Horror: 76386 ratings
Children's: 72186 ratings
War: 68527 ratings
Animation: 43293 ratings
Musical: 41533 ratings
Mystery: 40178 ratings
Fantasy: 36301 ratings
Western: 20683 ratings
Film-Noir: 18261 ratings
Documentary: 7910 ratings

Average Ratings by Genre:
Film-Noir: 4.08
Documentary: 3.93
War: 3.89
Drama: 3.77
Crime: 3.71
Animation: 3.68
Mystery: 3.67
Musical: 3.67
Western: 3.64
Romance: 3.61
Thriller: 3.57
Comedy: 3.52
Action: 3.49
Adventure: 3.48
Sci-Fi: 3.47
Fantasy: 3.45
Children's: 3.42
Horror: 3.22


In [64]:
import zipfile

# Define paths to the dataset files inside the zip archive
zip_file_path = r'C:\Users\akank\Downloads\ml-1m.zip'
ratings_file_path = 'ml-1m/ratings.dat'
users_file_path = 'ml-1m/users.dat'

# Lookup tables that turn the numeric codes in users.dat into readable labels
profession_codes = {
    0: 'other', 1: 'academic/educator', 2: 'artist', 3: 'clerical/admin', 
    4: 'college/grad student', 5: 'customer service', 6: 'doctor/healthcare', 
    7: 'executive/managerial', 8: 'farmer', 9: 'homemaker', 10: 'K-12 student', 
    11: 'lawyer', 12: 'programmer', 13: 'retired', 14: 'sales/marketing', 
    15: 'scientist', 16: 'self-employed', 17: 'technician/engineer', 
    18: 'tradesman/craftsman', 19: 'unemployed', 20: 'writer'
}

age_groups = {1: "Under 18", 18: "18-24", 25: "25-34", 35: "35-44", 
              45: "45-49", 50: "50-55", 56: "56+"}

# Per-user accumulators
userid_count = dict()  # user id -> number of ratings submitted
user_rating = dict()   # user id -> sum of the rating values submitted
demographic = dict()   # user id -> {'gender', 'age', 'occupation', 'ratings', 'movies_rated'}

# Open the zip file to extract data
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.printdir()  # Check the files inside the zip to confirm paths

    # Read ratings and process. The member is opened with `with` so the handle
    # is closed once read, and it no longer shares a name with loop variables.
    with zip_ref.open(ratings_file_path) as ratings_file:
        for line in ratings_file:
            line = line.decode('latin-1').strip()  # same decoding as the other cells
            if not line:
                continue  # tolerate blank / trailing lines
            user_id, movie_id, rating, _ = map(int, line.split('::'))
            userid_count[user_id] = userid_count.get(user_id, 0) + 1
            user_rating[user_id] = user_rating.get(user_id, 0) + rating

    # Read users and demographic information
    with zip_ref.open(users_file_path) as users_file:
        for line in users_file:
            line = line.decode('latin-1').strip()
            if not line:
                continue
            user_id, gender, age, occupation, zipcode = line.split('::')
            user_id = int(user_id)
            demographic[user_id] = {
                'gender': gender,
                'age': age_groups[int(age)],
                'occupation': profession_codes[int(occupation)],
                # Users with no ratings get 0 instead of a missing key, so the
                # aggregation below cannot raise KeyError.
                'ratings': user_rating.get(user_id, 0),
                'movies_rated': userid_count.get(user_id, 0),
            }

# Analyzing profession and age-based ratings
# NOTE(review): these totals are sums of rating VALUES, not counts of ratings,
# although the printed label says "ratings" - confirm which was intended.
profession_rating = dict()
age_rating = dict()
for userid, data in demographic.items():
    profession = data['occupation']
    age = data['age']
    rating_sum = data['ratings']
    profession_rating[profession] = profession_rating.get(profession, 0) + rating_sum
    age_rating[age] = age_rating.get(age, 0) + rating_sum

# Sorting and displaying results
sorted_professions = sorted(profession_rating.items(), key=lambda x: x[1], reverse=True)
print("Ratings by Occupation:")
for occupation, total_ratings in sorted_professions:
    print(f"{occupation}: {total_ratings} ratings")

sorted_age = sorted(age_rating.items(), key=lambda x: x[1], reverse=True)
print("\nRatings by Age:")
for age, total_ratings in sorted_age:
    print(f"{age}: {total_ratings} ratings")

# Gender-based analysis
gender_ratings = {'M': 0, 'F': 0}     # gender -> sum of rating values
gender_user_count = {'M': 0, 'F': 0}  # gender -> users who submitted at least one rating
gender_movies = {'M': 0, 'F': 0}      # gender -> number of ratings submitted

for userid, data in demographic.items():
    gender = data['gender']
    # .get keeps the loop working if a gender code other than 'M'/'F' appears.
    gender_ratings[gender] = gender_ratings.get(gender, 0) + data['ratings']
    gender_movies[gender] = gender_movies.get(gender, 0) + data['movies_rated']
    gender_user_count[gender] = gender_user_count.get(gender, 0) + (1 if data['movies_rated'] else 0)

print("\nRatings by Gender:")
for gender, total_ratings in gender_ratings.items():
    print(f"{gender}: {total_ratings} ratings")

# Mean rating value per gender; NaN when a gender has no ratings (avoids ZeroDivisionError).
gender_avg_rating = {
    gender: round(gender_ratings[gender] / gender_movies[gender], 2) if gender_movies[gender] else float('nan')
    for gender in gender_ratings
}

print("\nMovies watched by Gender:")
for gender, total_ratings in gender_movies.items():
    print(f"{gender}: {total_ratings} movies")

print("\nNumber of users who gave ratings by gender:")
for gender, count in gender_user_count.items():
    print(f"{gender}: {count} users")

print("\nAverage rating by Gender:")
for gender, avg_rating in gender_avg_rating.items():
    print(f"{gender}: {avg_rating} average rating")


File Name                                             Modified             Size
ml-1m/                                         2016-01-29 14:39:34            0
ml-1m/movies.dat                               2003-03-26 15:18:14       171308
ml-1m/ratings.dat                              2003-02-28 15:53:08     24594131
ml-1m/README                                   2016-01-29 14:39:34         5577
ml-1m/users.dat                                2003-02-28 15:53:08       134368
Ratings by Occupation:
college/grad student: 463433 ratings
other: 461646 ratings
executive/managerial: 379506 ratings
academic/educator: 305270 ratings
technician/engineer: 263126 ratings
writer: 211232 ratings
programmer: 209060 ratings
artist: 178897 ratings
sales/marketing: 177700 ratings
self-employed: 165518 ratings
doctor/healthcare: 136229 ratings
clerical/admin: 115630 ratings
scientist: 84684 ratings
K-12 student: 82276 ratings
customer service: 77295 ratings
lawyer: 74384 ratings
retired: 52014 ratings
u

In [48]:
# fourth execution - rating distribution across genres by demographic

import pandas
# Small in-memory sample in the same '::'-delimited layout as the .dat files.
movies_data = [
    "1::Toy Story (1995)::Animation|Children's|Comedy",
    "2::Jumanji (1995)::Adventure|Children's|Fantasy"
]
users_data = [
    "1::M::25::12::48067",
    "2::F::35::17::70072"
]
ratings_data = [
    "1::1::5::978300760",
    "2::2::4::978302109"
]
profession_codes = {12: 'programmer', 17: 'technician/engineer'}
age_groups = {25: "25-34", 35: "35-44"}


# movie id -> list of genre names (each record is split exactly once)
movie_genres = {}
for line in movies_data:
    movie_id, _title, genres = line.split("::")
    movie_genres[int(movie_id)] = genres.split("|")


# user id -> {'gender', 'age', 'occupation'} with codes translated to labels
user_demographics = {}
for line in users_data:
    user_id, gender, age, occupation, _zipcode = line.split("::")
    user_demographics[int(user_id)] = {
        'gender': gender,
        'age': age_groups[int(age)],
        'occupation': profession_codes[int(occupation)]
    }


# demographic value -> {genre -> number of ratings}. The age and occupation
# buckets are derived from the lookup tables so they cannot drift out of sync
# with them when the sample data is extended.
gender_genre_counts = {'M': {}, 'F': {}}
age_genre_counts = {label: {} for label in age_groups.values()}
occupation_genre_rating_counts = {v: {} for v in profession_codes.values()}

for line in ratings_data:
    user_id, movie_id, _, _ = map(int, line.split("::"))
    profile = user_demographics.get(user_id)
    if profile is None:
        continue  # rating from a user we have no demographics for

    # The three tallies this rating contributes to, one per demographic axis.
    tallies = (
        gender_genre_counts.setdefault(profile['gender'], {}),
        age_genre_counts.setdefault(profile['age'], {}),
        occupation_genre_rating_counts.setdefault(profile['occupation'], {}),
    )
    # Movies absent from movie_genres contribute nothing.
    for genre in movie_genres.get(movie_id, []):
        for tally in tallies:
            tally[genre] = tally.get(genre, 0) + 1


print("\nTop Genres by Gender:")
for gender, counts in gender_genre_counts.items():
    print(f"{gender}: {sorted(counts.items(), key=lambda x: x[1], reverse=True)}")

print("\nTop Genres by Age Group:")
for age_group, counts in age_genre_counts.items():
    print(f"{age_group}: {sorted(counts.items(), key=lambda x: x[1], reverse=True)}")

print("\nTop Genres by Occupation:")
for occupation, counts in occupation_genre_rating_counts.items():
    print(f"{occupation}: {sorted(counts.items(), key=lambda x: x[1], reverse=True)}")



Top Genres by Gender:
M: [('Animation', 1), ("Children's", 1), ('Comedy', 1)]
F: [('Adventure', 1), ("Children's", 1), ('Fantasy', 1)]

Top Genres by Age Group:
25-34: [('Animation', 1), ("Children's", 1), ('Comedy', 1)]
35-44: [('Adventure', 1), ("Children's", 1), ('Fantasy', 1)]

Top Genres by Occupation:
programmer: [('Animation', 1), ("Children's", 1), ('Comedy', 1)]
technician/engineer: [('Adventure', 1), ("Children's", 1), ('Fantasy', 1)]


In [16]:
#fifth code - top performances

import zipfile
from collections import defaultdict

import re  # used below to pull the release year out of a title

file_path = r'C:\Users\akank\Downloads\ml-1m.zip'

min_ratings = 100  # a movie needs at least this many ratings to be ranked
top_n = 10         # how many movies to report

movie_ratings = defaultdict(list)  # movie id -> every rating given to that movie
movie_info = {}                    # movie id -> {'title': str, 'genres': [str, ...]}

with zipfile.ZipFile(file_path, 'r') as zip_ref:
    # Members are opened with `with` so their handles are closed once read.
    with zip_ref.open('ml-1m/ratings.dat') as ratings_file:
        for line in ratings_file:
            line = line.decode('latin-1').strip()
            if not line:
                continue  # tolerate blank / trailing lines
            userId, movieId, rating, timestamp = map(int, line.split('::'))
            movie_ratings[movieId].append(rating)

    with zip_ref.open('ml-1m/movies.dat') as movies_file:
        for line in movies_file:
            line = line.decode('latin-1').strip()
            if not line:
                continue
            movieId, title, genres = line.split('::')
            movieId = int(movieId)
            movie_info[movieId] = {
                'title': title,
                'genres': genres.split('|')
            }


# Mean rating per movie, restricted to movies with enough ratings to be meaningful.
movie_avg_ratings = {}
for movieId, ratings in movie_ratings.items():
    if len(ratings) >= min_ratings:
        avg_rating = sum(ratings) / len(ratings)
        movie_avg_ratings[movieId] = avg_rating


top_movies = sorted(movie_avg_ratings.items(), key=lambda x: x[1], reverse=True)


def _lookup(movieId):
    """Return the info dict for movieId, or a placeholder if movies.dat lacks it."""
    return movie_info.get(movieId, {'title': f'<unknown movie {movieId}>', 'genres': []})


# The header interpolates the actual thresholds so it stays correct when they are changed.
print(f"Top {top_n} Movies with the Highest Average Ratings (Minimum {min_ratings} ratings):")
for idx, (movieId, avg_rating) in enumerate(top_movies[:top_n], 1):
    info = _lookup(movieId)
    title = info['title']
    genres = ', '.join(info['genres'])
    print(f"{idx}. {title} - Avg Rating: {avg_rating:.2f}, Genres: {genres}")


print("\nCharacteristics of Top-Rated Movies:")
for movieId, avg_rating in top_movies[:top_n]:
    info = _lookup(movieId)
    title = info['title']
    genres = ', '.join(info['genres'])

    # Only a trailing "(YYYY)" counts as the release year; other parentheticals
    # such as alternate titles no longer leak into the year field.
    year_match = re.search(r'\((\d{4})\)\s*$', title)
    release_year = year_match.group(1) if year_match else 'Unknown'
    print(f"Title: {title}, Release Year: {release_year}, Genres: {genres}, Avg Rating: {avg_rating:.2f}")


Top 10 Movies with the Highest Average Ratings (Minimum 100 ratings):
1. Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954) - Avg Rating: 4.56, Genres: Action, Drama
2. Shawshank Redemption, The (1994) - Avg Rating: 4.55, Genres: Drama
3. Godfather, The (1972) - Avg Rating: 4.52, Genres: Action, Crime, Drama
4. Close Shave, A (1995) - Avg Rating: 4.52, Genres: Animation, Comedy, Thriller
5. Usual Suspects, The (1995) - Avg Rating: 4.52, Genres: Crime, Thriller
6. Schindler's List (1993) - Avg Rating: 4.51, Genres: Drama, War
7. Wrong Trousers, The (1993) - Avg Rating: 4.51, Genres: Animation, Comedy
8. Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) - Avg Rating: 4.49, Genres: Film-Noir
9. Raiders of the Lost Ark (1981) - Avg Rating: 4.48, Genres: Action, Adventure
10. Rear Window (1954) - Avg Rating: 4.48, Genres: Mystery, Thriller

Characteristics of Top-Rated Movies:
Title: Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954), Release Year: 1954, Genres

In [18]:
# sixth execution - Exploring Long Tail

import zipfile
from collections import defaultdict

file_path = r'C:\Users\akank\Downloads\ml-1m.zip'

min_ratings_for_popular = 100  # movies with fewer ratings than this are "long tail"


def _summarize_segment(movie_ids, movie_info, movie_ratings):
    """Collect titles, genre frequencies and per-movie mean ratings for movie_ids.

    Returns a dict with keys 'titles' (list of str, in iteration order),
    'genres' (defaultdict: genre -> number of movies) and 'avg_ratings'
    (list of float, one mean per movie).
    """
    summary = {
        'titles': [],
        'genres': defaultdict(int),
        'avg_ratings': []
    }
    for movieId in movie_ids:
        # Movies missing from movies.dat are reported by id instead of raising KeyError.
        info = movie_info.get(movieId, {'title': f'<unknown movie {movieId}>', 'genres': []})
        ratings = movie_ratings[movieId]
        summary['titles'].append(info['title'])
        for genre in info['genres']:
            summary['genres'][genre] += 1
        summary['avg_ratings'].append(sum(ratings) / len(ratings))
    return summary


def _print_segment_report(label, summary, n_titles=10, n_genres=5):
    """Print the report for one segment and return the mean of its per-movie averages.

    The mean is NaN when the segment is empty, which avoids a ZeroDivisionError.
    """
    print(f"\nCharacteristics of {label} Movies:")
    # NOTE: these are simply the first n_titles movies in first-seen order,
    # not a ranking by rating or popularity.
    print(f"Top {n_titles} {label} Movies:")
    for idx, title in enumerate(summary['titles'][:n_titles], 1):
        print(f"{idx}. {title}")

    averages = summary['avg_ratings']
    segment_avg = sum(averages) / len(averages) if averages else float('nan')
    print(f"\nAverage Rating of {label} Movies: {segment_avg:.2f}")
    top_genres = sorted(summary['genres'].items(), key=lambda x: x[1], reverse=True)[:n_genres]
    print(f"Genres most frequent in {label} Movies: {top_genres}")
    return segment_avg


movie_ratings = defaultdict(list)  # movie id -> every rating given to that movie
movie_info = {}                    # movie id -> {'title': str, 'genres': [str, ...]}

with zipfile.ZipFile(file_path, 'r') as zip_ref:
    # Members are opened with `with` so their handles are closed once read.
    with zip_ref.open('ml-1m/ratings.dat') as ratings_file:
        for line in ratings_file:
            line = line.decode('latin-1').strip()
            if not line:
                continue  # tolerate blank / trailing lines
            userId, movieId, rating, timestamp = map(int, line.split('::'))
            movie_ratings[movieId].append(rating)

    with zip_ref.open('ml-1m/movies.dat') as movies_file:
        for line in movies_file:
            line = line.decode('latin-1').strip()
            if not line:
                continue
            movieId, title, genres = line.split('::')
            movieId = int(movieId)
            movie_info[movieId] = {
                'title': title,
                'genres': genres.split('|')
            }


movie_rating_counts = {movieId: len(ratings) for movieId, ratings in movie_ratings.items()}

# Split the catalogue at the popularity threshold.
long_tail_movies = {movieId: count for movieId, count in movie_rating_counts.items() if count < min_ratings_for_popular}
popular_movies = {movieId: count for movieId, count in movie_rating_counts.items() if count >= min_ratings_for_popular}

print(f"Number of Long-Tail Movies (with < {min_ratings_for_popular} ratings): {len(long_tail_movies)}")
print(f"Number of Popular Movies (with >= {min_ratings_for_popular} ratings): {len(popular_movies)}")

# Both segments go through the same summary/report code path.
long_tail_characteristics = _summarize_segment(long_tail_movies, movie_info, movie_ratings)
avg_long_tail_rating = _print_segment_report("Long-Tail", long_tail_characteristics)

popular_characteristics = _summarize_segment(popular_movies, movie_info, movie_ratings)
avg_popular_rating = _print_segment_report("Popular", popular_characteristics)


Number of Long-Tail Movies (with < 100 ratings): 1687
Number of Popular Movies (with >= 100 ratings): 2019

Characteristics of Long-Tail Movies:
Top 10 Long-Tail Movies:
1. Ponette (1996)
2. Picnic (1955)
3. Prefontaine (1997)
4. Mr. Jones (1993)
5. Man with the Golden Arm, The (1955)
6. Hollywood Knights, The (1980)
7. Maybe, Maybe Not (Bewegte Mann, Der) (1994)
8. Mrs. Dalloway (1997)
9. Four Days in September (1997)
10. Naked (1993)

Average Rating of Long-Tail Movies: 3.05
Genres most frequent in Long-Tail Movies: [('Drama', 768), ('Comedy', 459), ('Romance', 182), ('Horror', 153), ('Thriller', 147)]

Characteristics of Popular Movies:
Top 10 Popular Movies:
1. One Flew Over the Cuckoo's Nest (1975)
2. James and the Giant Peach (1996)
3. My Fair Lady (1964)
4. Erin Brockovich (2000)
5. Bug's Life, A (1998)
6. Princess Bride, The (1987)
7. Ben-Hur (1959)
8. Christmas Story, A (1983)
9. Snow White and the Seven Dwarfs (1937)
10. Wizard of Oz, The (1939)

Average Rating of Popular Mov

In [31]:
# seventh execution - tag analysis

import zipfile


file_path = r'C:\Users\akank\Downloads\ml-1m.zip'


# genre tag -> number of movies in movies.dat carrying that tag
tag_analysis = {}


# One `with` statement manages both the archive and the member stream.
with zipfile.ZipFile(file_path, 'r') as zip_ref, zip_ref.open('ml-1m/movies.dat') as movies_file:
    for raw_line in movies_file:
        # Third '::'-separated field holds the '|'-separated genre list.
        genre_field = raw_line.decode('latin-1').strip().split("::")[2]
        for tag in genre_field.split("|"):
            tag_analysis[tag] = tag_analysis.get(tag, 0) + 1


# Most frequent tag first; the stable sort keeps first-seen order among ties.
sorted_tags = list(tag_analysis.items())
sorted_tags.sort(key=lambda pair: pair[1], reverse=True)


print("Frequently Used Tags (Genres):")
for tag, count in sorted_tags:
    print(f"{tag}: {count}")


Frequently Used Tags (Genres):
Drama: 1603
Comedy: 1200
Action: 503
Thriller: 492
Romance: 471
Horror: 343
Adventure: 283
Sci-Fi: 276
Children's: 251
Crime: 211
War: 143
Documentary: 127
Musical: 114
Mystery: 106
Animation: 105
Fantasy: 68
Western: 68
Film-Noir: 44


In [None]:
import zipfile
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Open the zip file and read the datasets
zip_file_path = r'C:\Users\akank\Downloads\ml-1m.zip'


def _read_dat(zip_ref, member, columns):
    """Read one '::'-delimited member of the archive into a DataFrame.

    The member handle is closed as soon as pandas has finished parsing it.
    """
    with zip_ref.open(member) as handle:
        return pd.read_csv(handle, sep='::', names=columns, engine='python', encoding='ISO-8859-1')


with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.printdir()
    ratings = _read_dat(zip_ref, 'ml-1m/ratings.dat', ['UserID', 'MovieID', 'Rating', 'Timestamp'])
    movies = _read_dat(zip_ref, 'ml-1m/movies.dat', ['MovieID', 'Title', 'Genres'])
    users = _read_dat(zip_ref, 'ml-1m/users.dat', ['UserID', 'Gender', 'Age', 'Occupation', 'ZipCode'])

# One-hot encode the 'Genres' column ONCE, on the small movies table, and let the
# merge broadcast the flags to every rating. Encoding the merged frame repeatedly
# is far more work for the same values.
movie_genre_flags = movies['Genres'].str.get_dummies('|')
genre_names = list(movie_genre_flags.columns)
movies_with_genres = pd.concat([movies, movie_genre_flags], axis=1)

# Merge datasets for analysis (one row per rating, with movie, genre-flag and user columns)
merged_data = pd.merge(ratings, movies_with_genres, on='MovieID')
merged_data = pd.merge(merged_data, users, on='UserID')

# Extract release year from the movie title
merged_data['Year'] = merged_data['Title'].str.extract(r'\((\d{4})\)', expand=False).astype(float)

# Per-rating genre indicator columns (1 if the rated movie carries the genre)
genres_dummies = merged_data[genre_names]

# Distribution of Ratings by Genres
# NOTE(review): this sums rating VALUES per genre (not the number of ratings),
# while the axis label says 'Total Ratings' - confirm which was intended.
fig, ax = plt.subplots(figsize=(12, 6))
genre_ratings = genres_dummies.multiply(merged_data['Rating'], axis=0).sum()
genre_ratings.sort_values(ascending=False).plot(kind='bar', color='skyblue', rot=45, ax=ax)
ax.set_title('Distribution of Ratings by Genres', fontsize=16)
ax.set_xlabel('Genres', fontsize=12)
ax.set_ylabel('Total Ratings', fontsize=12)
fig.tight_layout()
plt.show()

# Ratings by Year
fig, ax = plt.subplots(figsize=(12, 6))
merged_data.groupby('Year')['Rating'].mean().plot(kind='line', color='orange', ax=ax)
ax.set_title('Average Ratings by Year', fontsize=16)
ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('Average Rating', fontsize=12)
fig.tight_layout()
plt.show()

# Gender vs Genre Preferences
# Rows are genders, columns are genres, values are the number of ratings.
# Summing the indicator columns per group gives real counts; joining the genre
# strings per group and re-encoding them only records whether a genre occurs at all.
gender_genre = merged_data.groupby('Gender')[genre_names].sum()
fig, ax = plt.subplots(figsize=(12, 6))
# ax= is required: DataFrame.plot would otherwise open a new figure and leave this one empty.
gender_genre.plot(kind='bar', stacked=True, colormap='viridis', rot=0, ax=ax)
ax.set_title('Popular Genres by Gender', fontsize=16)
ax.set_xlabel('Gender', fontsize=12)
ax.set_ylabel('Total Ratings', fontsize=12)
ax.legend(title='Genre', bbox_to_anchor=(1.02, 1), loc='upper left', fontsize=8)
fig.tight_layout()
plt.show()

# Age Group vs Genre Preferences
age_groups = {1: 'Under 18', 18: '18-24', 25: '25-34', 35: '35-44', 45: '45-49', 50: '50-55', 56: '56+'}
merged_data['AgeGroup'] = merged_data['Age'].map(age_groups)
age_genre = merged_data.groupby('AgeGroup')[genre_names].sum()
# Show the groups youngest-to-oldest rather than in alphabetical label order.
age_order = [label for label in age_groups.values() if label in age_genre.index]
age_genre = age_genre.loc[age_order]

fig, ax = plt.subplots(figsize=(12, 6))
age_genre.plot(kind='bar', stacked=True, colormap='coolwarm', rot=45, ax=ax)
ax.set_title('Popular Genres by Age Group', fontsize=16)
ax.set_xlabel('Age Group', fontsize=12)
ax.set_ylabel('Total Ratings', fontsize=12)
ax.legend(title='Genre', bbox_to_anchor=(1.02, 1), loc='upper left', fontsize=8)
fig.tight_layout()
plt.show()

# Prepare numerical columns for correlation analysis
numerical_columns = ['Rating', 'Age', 'Occupation']
correlation_data = pd.concat([merged_data[numerical_columns], genres_dummies], axis=1)

# Encode categorical features on the copy only, so merged_data is left untouched
# and the cell can be re-run safely.
# NOTE(review): occupation codes are category labels, so their linear correlation
# with other columns has no ordinal meaning - interpret that row/column with care.
correlation_data['Occupation'] = correlation_data['Occupation'].astype('category').cat.codes

# Compute the correlation matrix
corr_matrix = correlation_data.corr()

# Visualize the correlation matrix using a heatmap
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


File Name                                             Modified             Size
ml-1m/                                         2016-01-29 14:39:34            0
ml-1m/movies.dat                               2003-03-26 15:18:14       171308
ml-1m/ratings.dat                              2003-02-28 15:53:08     24594131
ml-1m/README                                   2016-01-29 14:39:34         5577
ml-1m/users.dat                                2003-02-28 15:53:08       134368



KeyboardInterrupt

Exception ignored in: 'zmq.backend.cython.message.Frame.__dealloc__'
Traceback (most recent call last):
  File "zmq\\backend\\cython\\checkrc.pxd", line 13, in zmq.backend.cython.checkrc._check_rc
KeyboardInterrupt: 


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Locations of the extracted MovieLens files
ratings_path = r'C:\Users\akank\Downloads\ml-1m\ml-1m\ratings.dat'
movies_path = r'C:\Users\akank\Downloads\ml-1m\ml-1m\movies.dat'

# Header names (the .dat files carry no header row)
ratings_columns = ["UserID", "MovieID", "Rating", "Timestamp"]
movies_columns = ["MovieID", "Title", "Genres"]

# Parser settings shared by both '::'-delimited files
dat_options = dict(sep="::", engine="python", encoding="ISO-8859-1")

# Only the first 5000 ratings are read; the movies table is read in full
ratings_df = pd.read_csv(
    ratings_path,
    names=ratings_columns,
    dtype={"UserID": "int32", "MovieID": "int32", "Rating": "float32"},
    nrows=5000,
    **dat_options,
)
movies_df = pd.read_csv(movies_path, names=movies_columns, dtype={"MovieID": "int32"}, **dat_options)

# Debug output: first rows of each input
print("Ratings Data Sample:\n", ratings_df.head())
print("Movies Data Sample:\n", movies_df.head())

# One row per sampled rating, with the movie's title and genres attached
merged_data = ratings_df.merge(movies_df, on="MovieID")

# Release year = first four-digit number in parentheses in the title; NaN if absent
year_text = merged_data["Title"].str.extract(r"\((\d{4})\)", expand=False)
merged_data["Year"] = pd.to_numeric(year_text, errors="coerce")

# Debug output: first rows of the merged frame
print("Merged Data Sample:\n", merged_data.head())

# Genre indicator columns, one per genre
genre_data = merged_data["Genres"].str.get_dummies(sep="|")

# Mean rating per genre = (sum of ratings on movies with the genre) / (number of such ratings)
rating_sum_by_genre = genre_data.mul(merged_data["Rating"], axis=0).sum()
ratings_by_genre = rating_sum_by_genre.div(genre_data.sum())

print("Ratings by Genre:\n", ratings_by_genre.head())

# Mean rating per release year
ratings_by_year = merged_data["Rating"].groupby(merged_data["Year"]).mean()

print("Ratings by Year:\n", ratings_by_year.head())

# Bar chart: average rating per genre
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    x=ratings_by_genre.index,
    y=ratings_by_genre.values,
    palette="viridis",
    ax=ax,
)
ax.tick_params(axis="x", labelrotation=90)
ax.set_title("Average Ratings by Genre")
ax.set_xlabel("Genre")
ax.set_ylabel("Average Rating")
fig.tight_layout()
plt.show()

# Line chart: average rating per release year
fig, ax = plt.subplots(figsize=(12, 6))
ratings_by_year.plot(kind="line", marker="o", color="b", ax=ax)
ax.set_title("Average Ratings by Year")
ax.set_xlabel("Year")
ax.set_ylabel("Average Rating")
ax.grid(True)
fig.tight_layout()
plt.show()
