In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

###  Movies per Genre

In [None]:
import plotly.express as px
# Load the movie catalogue ('::'-separated, no header row, Latin-1 encoded).
# NOTE(review): only two column names are supplied, yet later cells treat the index as the
# movie id (rename_axis('movie_id'), join(..., on='movie_id')). That presumably relies on
# pandas using the leading, unnamed column of the file as the index -- confirm.
movies_df = pd.read_csv(
    '/kaggle/input/movielens/ml-1m/movies.dat',
    sep='::',
    header=None,
    names=['movie_name', 'genre'],
    encoding='ISO-8859-1',
    engine='python',
)

# Turn the '|'-separated genre field into a list of genres per movie.
movies_df['genre'] = movies_df['genre'].map(lambda genre_field: genre_field.split('|'))

# One row per (movie, genre) pair, so a movie with several genres is counted once in each.
movies_df_exploded = movies_df.explode('genre')

# Histogram of movies per genre, bars ordered from the most to the least frequent genre.
movie_count_by_genre = px.histogram(
    movies_df_exploded,
    x='genre',
    height=400,
    title='Movie count by genre',
)
movie_count_by_genre = movie_count_by_genre.update_xaxes(categoryorder="total descending")
movie_count_by_genre

### Movies per year

In [None]:
import re
# Extract the release year from the movie title.
# The pattern is a raw string (no invalid '\(' / '\d' escapes), requires exactly four digits
# and is anchored to the end of the title, so an earlier parenthesised number or an empty
# "()" is never mistaken for the year. Titles without a trailing "(YYYY)" get NaN instead of
# crashing the cell with an AttributeError on a failed match.
# NOTE(review): assumes titles end in "(YYYY)" -- confirm against movies.dat.
movies_df['year'] = movies_df['movie_name'].str.extract(r'\((\d{4})\)\s*$', expand=False)

# 'year' stays a string, so the x axis is categorical; bars are ordered by movie count
# (most productive year first), not chronologically.
movie_count_by_year = px.histogram(
    movies_df, x='year', height=400, title='Movie count by year'
).update_xaxes(categoryorder="total descending")
movie_count_by_year

### Occupation distribution by Gender

In [None]:
# Build the occupation-code -> occupation-name lookup from the dataset README.
# The file is read inside a context manager so the handle is closed deterministically.
with open('/kaggle/input/movielens/ml-1m/README') as readme_file:
    readme_text = np.array(readme_file.read().splitlines())

# Index of the first README line containing each marker. np.char is the public home of the
# vectorised string helpers (np.core.defchararray is a private path deprecated in NumPy 2).
start_index = np.flatnonzero(np.char.find(readme_text, 'Occupation is chosen') != -1)[0]
end_index = np.flatnonzero(np.char.find(readme_text, 'MOVIES FILE DESCRIPTION') != -1)[0]

# Keep the lines between the two markers, dropping the first two and the last one, and take
# the text between the first pair of double quotes on each remaining line.
# NOTE(review): assumes exactly two non-entry lines precede the occupation entries and one
# follows them -- verify against the README layout.
occupation_list = [x.split('"')[1] for x in readme_text[start_index:end_index][2:-1].tolist()]

# Codes are assigned by position: the first listed occupation is 0, the next 1, and so on.
occupation_dict = dict(enumerate(occupation_list))

# Load the user table ('::'-separated, no header row).
# The path is a plain string without the doubled '/' so it matches the other loaders.
users_df = pd.read_csv('/kaggle/input/movielens/ml-1m/users.dat',
                       delimiter='::', engine='python', header=None,
                       names=['user_id', 'gender', 'age', 'occupation', 'zip_code'])

# Swap occupation codes for their names using the lookup built from the README; codes that
# are missing from occupation_dict are left unchanged.
users_df['occupation'] = users_df['occupation'].replace(occupation_dict)
users_df.head()

In [None]:
# Number of users for every (occupation, gender) pair.
users_df_occupation_by_gender = (
    users_df
    .groupby(['occupation', 'gender'])
    .size()
    .reset_index(name='occupation_by_gender_count')
)

# Total users per gender, used as the denominator below.
gender_count = users_df['gender'].value_counts()

# Convert counts into the share of each gender that has the occupation, so both genders
# are on the same scale regardless of how many users each has. The column keeps its
# original name even though it now holds a fraction.
users_df_occupation_by_gender['occupation_by_gender_count'] = (
    users_df_occupation_by_gender['occupation_by_gender_count']
    .div(users_df_occupation_by_gender['gender'].map(gender_count))
)

# Grouped bars: one bar per gender for each occupation. The axis label says what the
# values are (a per-gender share), not the raw column name.
occupation_by_gender_plot = px.bar(
    users_df_occupation_by_gender,
    x='occupation',
    y='occupation_by_gender_count',
    color='gender',
    barmode='group',
    title='Occupation distribution by gender',
    labels={'occupation_by_gender_count': 'share of users within gender'},
)
occupation_by_gender_plot

### About ratings 

In [None]:
# Load the ratings table ('::'-separated, no header row) and preview its first rows.
ratings_df = pd.read_csv(
    '/kaggle/input/movielens/ml-1m/ratings.dat',
    sep='::',
    header=None,
    names=['user_id', 'movie_id', 'rating', 'time'],
    engine='python',
)
ratings_df.head()

In [None]:
# Report how many distinct users and movies appear in the ratings table.
unique_user_total = ratings_df["user_id"].nunique(dropna=False)
unique_movie_total = ratings_df["movie_id"].nunique(dropna=False)
print(f'There are {unique_user_total} unique users in the dataset')
print(f'There are {unique_movie_total} unique movies in the dataset')

### Men vs. Women: share of ratings per genre

In [None]:
# Attach every rating to each genre of its movie and to the profile of the user who gave it.
# The index of movies_df_exploded is labelled 'movie_id' so merge can match it against the
# ratings column of the same name.
combined_ratings_df = (
    movies_df_exploded
    .rename_axis('movie_id')
    .merge(ratings_df, on='movie_id')
    .merge(users_df, on='user_id')
)

# Mean rating and number of ratings for every (genre, gender) pair.
combined_ratings_data = (
    combined_ratings_df
    .groupby(['genre', 'gender'])
    .agg({'rating': ['mean', 'count']})
    .reset_index()
)
# Flatten the two-level header: ('rating', 'mean') -> 'rating mean', ('genre', '') -> 'genre'.
combined_ratings_data.columns = [' '.join(col).strip() for col in combined_ratings_data.columns.values]

# Normalise the counts by the total number of rows for the same gender, so the bars show
# what share of a gender's ratings went to each genre. The whole column is replaced in one
# assignment: writing floats into part of the integer count column through .loc is an
# implicit upcast that recent pandas deprecates. The lookup also covers whatever gender
# values are present instead of hard-coding 'F' and 'M'.
ratings_per_gender = combined_ratings_df['gender'].value_counts()
combined_ratings_data['rating count'] = (
    combined_ratings_data['rating count']
    / combined_ratings_data['gender'].map(ratings_per_gender)
)

ratings_by_gender_and_genre = px.bar(
    combined_ratings_data,
    x='genre',
    y='rating count',
    color='gender',
    barmode='group',
    labels={'rating count': 'share of ratings within gender'},
)
ratings_by_gender_and_genre

### Genres by their Average rating

In [None]:
# Match each rating's movie_id against the index of the exploded movie table, so a rating
# contributes once to every genre of its movie.
ratings_with_genre = ratings_df.join(movies_df_exploded, on='movie_id')

# Mean rating and rating count per genre, ordered from the lowest to the highest mean.
rating_by_genre_df = (
    ratings_with_genre
    .groupby('genre')
    .agg({'rating': ['mean', 'count']})
    .sort_values(('rating', 'mean'))
    .reset_index()
)

# Flatten the two-level header with '_': ('rating', 'mean') -> 'rating_mean', while the
# genre column, whose second level is empty, becomes 'genre_'.
rating_by_genre_df.columns = [
    f'{top_level}_{sub_level}'.strip() for top_level, sub_level in rating_by_genre_df.columns
]

ratings_by_genre = px.bar(
    rating_by_genre_df,
    x='genre_',
    y='rating_mean',
    height=300,
)
ratings_by_genre

In [None]:
# Number of ratings received by each genre; bars follow the row order of
# rating_by_genre_df rather than being sorted by count.
px.bar(
    rating_by_genre_df,
    x='genre_',
    y='rating_count',
    height=300,
)