In [None]:
import polars as pl
import polars.selectors as cs

import altair as alt
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

import hvplot.polars

import datetime as dt

from matplotlib.figure import figaspect

In [None]:
# Location of the anime CSV read by the loading cell below.
# NOTE(review): machine-specific absolute path; the notebook only runs where this drive layout exists.
anime_path = r"F:\Datasets\CSV datasets\Anime Dataset\Anime.csv"

In [None]:
# Location of the ratings CSV read by the loading cell below.
# NOTE(review): machine-specific absolute path; the notebook only runs where this drive layout exists.
movie_rating_path = r"F:\Datasets\CSV datasets\Anime Dataset\Movie_Rating.csv"

In [None]:
# Read the CSV at `anime_path` eagerly into a Polars DataFrame (dtypes are inferred by the reader).
anime_df = pl.read_csv(anime_path)

In [None]:
anime_df

In [None]:
# Read the CSV at `movie_rating_path` eagerly into a Polars DataFrame (dtypes are inferred by the reader).
movie_rating_df = pl.read_csv(movie_rating_path)

In [None]:
movie_rating_df

In [None]:
# Left join on `anime_id`: every anime_df row is kept, with or without a match in movie_rating_df.
# Later cells read `rating_right`, i.e. a right-hand column renamed with Polars' default
# "_right" suffix (presumably both CSVs carry a `rating` column — confirm against the schema).
combined_df = anime_df.join(movie_rating_df, on="anime_id", how="left")

In [None]:
combined_df

In [None]:
combined_df.collect_schema()

In [None]:
# Narrow the numeric columns to smaller dtypes. Casts are strict by default,
# so a value that does not fit the target dtype raises instead of wrapping.
narrow_dtypes = {
    'anime_id': pl.UInt16,
    'rating': pl.Float32,
    'members': pl.UInt32,
    'user_id': pl.UInt32,
    'rating_right': pl.Int8,
}

combined_df = combined_df.with_columns(
    pl.col(column_name).cast(target_dtype)
    for column_name, target_dtype in narrow_dtypes.items()
)

In [None]:
# Split the comma-separated `genre` string into a list, then emit one row per
# list element. Every other column is repeated on each of those rows.
genre_as_list = pl.col('genre').str.split(',')
combined_df = combined_df.with_columns(genre_as_list).explode('genre')

In [None]:
# Remove leading/trailing whitespace from every genre value.
combined_df = combined_df.with_columns(genre=pl.col('genre').str.strip_chars())

In [None]:
# Row count per genre, largest first. The count is over rows of the joined,
# genre-exploded frame, not over distinct titles.
most_popular_genres = (
    combined_df
    .group_by('genre')
    .len()
    .sort('len', descending=True)
)

In [None]:
most_popular_genres

In [None]:
# Bar chart of the per-genre row counts held in `most_popular_genres`.
most_popular_genres_pd = most_popular_genres.to_pandas()

# Explicit figure/axes instead of the implicit pyplot "current figure".
fig, ax = plt.subplots(figsize=(10, 5), dpi=150)

sns.barplot(
    data=most_popular_genres_pd,
    x='genre',
    y='len',
    ax=ax,
)

# Give the figure a title and readable axis names; the raw column name `len`
# tells a reader nothing when the chart is viewed on its own.
ax.set(
    title='Rows per Genre',
    xlabel='Genre',
    ylabel='Row count',
)

# Tilt the genre labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
combined_df

In [None]:
# (user, anime, genre) combinations that appear on more than one row.
duplicate_key = ['user_id', 'anime_id', 'genre']

exact_duplicates = (
    combined_df
    .group_by(duplicate_key)
    .len()
    .filter(pl.col('len').gt(1))
)

In [None]:
exact_duplicates

In [None]:
# For each (user, anime) pair: distinct genre values vs. total rows.
# A difference between the two means a genre is repeated for that pair.
user_anime_genre_counts = combined_df.group_by('user_id', 'anime_id').agg(
    unique_genres=pl.col('genre').n_unique(),
    total_genre_rows=pl.col('genre').len(),
)

In [None]:
user_anime_genre_counts

In [None]:
# For each (user, anime) pair: how many distinct `rating_right` values exist,
# their min and max, and how many rows the pair occupies.
pair_rating = pl.col('rating_right')

combined_df.group_by('user_id', 'anime_id').agg(
    unique_ratings=pair_rating.n_unique(),
    min_rating=pair_rating.min(),
    max_rating=pair_rating.max(),
    genre_count=pl.col('genre').len(),
)

In [None]:
# Percentage of null values in every column, one output column per input column.
null_percentage_exprs = [
    (pl.col(column_name).is_null().sum() / pl.len() * 100).alias(f"{column_name}_null_percentage")
    for column_name in combined_df.columns
]

combined_df.select(null_percentage_exprs)

In [None]:
# Row count per genre in descending order (same computation as `most_popular_genres`).
genre_analysis = (
    combined_df
    .group_by('genre')
    .agg(pl.len())
    .sort('len', descending=True)
)

genre_analysis

In [None]:
# Distinct genre values in sorted order, kept as a one-column frame.
genre_variations = combined_df.select('genre').unique().sort('genre')

# Build the Python list straight from the Polars column; converting to pandas
# first adds a dependency and a copy without changing the result.
genre_variations['genre'].to_list()

In [None]:
# Distinct genre values as a plain Python list (order is not guaranteed).
combined_df.get_column('genre').unique().to_list()

In [None]:
# The whole `genre` column as {'genre': [...]} with plain Python values.
# NOTE(review): this materialises one list entry per row of combined_df in the
# cell output — confirm that a full dump (rather than a preview) is intended.
{'genre': combined_df.get_column('genre').to_list()}

In [None]:
import re

def normalize_genre_name(genre_name):
    """Return a canonical, lower-cased form of a genre label.

    Parameters
    ----------
    genre_name : str | None
        Raw genre label, possibly with surrounding whitespace or mixed case.

    Returns
    -------
    str | None
        ``None`` when ``genre_name`` is ``None``; otherwise the trimmed,
        lower-cased label, replaced by its canonical spelling when it is a
        known variant.
    """
    if genre_name is None:
        return None

    normalized = genre_name.lower().strip()

    # Known spelling variants -> canonical lower-case name.
    # NOTE(review): the original mapped 'sci-fi' to the truncated value 's';
    # 'science fiction' is assumed here because create_genre_taxonomy files
    # 'Sci-Fi' under 'Science Fiction' — confirm the intended target.
    variations = {
        'sci-fi': 'science fiction',
    }

    # The original built the table but never returned anything, so every
    # non-None input came back as None.
    return variations.get(normalized, normalized)


In [None]:
def create_genre_taxonomy():
    """Build the genre taxonomy used to group fine-grained genre labels.

    Returns
    -------
    dict[str, list[str]]
        A new dict on every call, mapping each broad family name to the list
        of labels filed under it, in the declaration order below.
    """
    # (family, member labels) pairs; the list literals are evaluated on each
    # call, so callers never share mutable state.
    family_members = (
        ('Action', ['Action', 'Fighting', 'Martial Arts', 'Military', 'Police']),
        ('Adventure', ['Adventure', 'Space']),
        ('Comedy', ['Comedy', 'Parody', 'Gag Humor']),
        ('Drama', ['Drama', 'Melodrama']),
        ('Fantasy', ['Fantasy', 'Magic', 'Supernatural', 'Mythology']),
        ('Romance', ['Romance', 'Romantic Comedy', 'Harem', 'Reverse Harem']),
        ('Science Fiction', ['Science Fiction', 'Sci-Fi', 'Mecha', 'Cyberpunk']),
        ('Horror', ['Horror', 'Thriller', 'Suspense']),
        ('Mystery', ['Mystery', 'Detective', 'Psychological']),
        ('Slice of Life', ['Slice of Life', 'Iyashikei', 'School', 'Workplace']),
        ('Sports', ['Sports', 'Racing', 'Competition']),
        ('Demographic', ['Shonen', 'Shojo', 'Seinen', 'Josei', 'Kids']),
        ('Mature', ['Ecchi', 'Mature', 'Adult']),
        ('Historical', ['Historical', 'Samurai', 'Period Drama']),
        ('Music', ['Music', 'Idol', 'Band']),
        ('Game', ['Game', 'Video Game', 'Card Game']),
        ('Other', ['Experimental', 'Avant Garde', 'Art House']),
    )

    return dict(family_members)


In [None]:
# Summary statistics for the anime-level columns. They are computed over the
# joined, genre-exploded rows, so each title contributes once per row it occupies.
combined_df.select(
    'anime_id', 'name', 'rating', 'type', 'episodes', 'genre'
).describe()

# User Behavior Analytics
## 4. User Engagement Segmentation:

In [None]:
# Per-user engagement profile.
#
# The left join leaves a null `user_id` on titles that matched no rating row;
# those rows are not a user and are excluded here.
rated_by_user = combined_df.filter(pl.col('user_id').is_not_null())

# The earlier genre explode repeats every rating once per genre of its title.
# Rating/title statistics are therefore taken from one row per (user, anime);
# otherwise titles with many genres would weigh more in mean/std/count.
user_title_stats = (
    rated_by_user
    .unique(subset=['user_id', 'anime_id'])
    .group_by('user_id')
    .agg(
        pl.col('anime_id').n_unique().alias('anime_count'),
        # The original had no aggregation here, which yields a list of every
        # rating per user. A count of non-null ratings is assumed to be the
        # intent — NOTE(review): switch to .sum() if a rating total was meant.
        pl.col('rating_right').count().alias('total_rating'),
        pl.col('rating_right').mean().alias('avg_user_rating'),
        # .std() is a standard deviation, so name it as one (the same alias is
        # used for this statistic in user_journey_df).
        pl.col('rating_right').std().alias('rating_std_dev'),
        pl.col('rating_right').min().alias('min_rating'),
        pl.col('rating_right').max().alias('max_rating'),
        pl.col('type').n_unique().alias('unique_types'),
    )
)

# Genre statistics are defined on the exploded rows (one genre per row).
user_genre_stats = rated_by_user.group_by('user_id').agg(
    pl.col('genre').n_unique().alias('unique_genres'),
    pl.col('genre').len().alias('total_genre_interactions'),
)

# Recombine and restore the original column order.
user_title_stats.join(user_genre_stats, on='user_id').select(
    'user_id',
    'anime_count',
    'total_rating',
    'avg_user_rating',
    'rating_std_dev',
    'min_rating',
    'max_rating',
    'unique_genres',
    'total_genre_interactions',
    'unique_types',
)

# Rating Bias Detection:

In [None]:
# Global mean of `rating_right`.
# The genre explode repeats each rating once per genre of its title, so the
# mean is taken over one row per (user, anime); otherwise ratings of
# multi-genre titles would be over-weighted. Null ratings are ignored by mean().
combined_df.unique(subset=['user_id', 'anime_id']).select(
    pl.col('rating_right').mean().alias('global_avg_rating'),
)

In [None]:
# Mean `rating_right` for every anime_id.
combined_df.group_by('anime_id').agg(
    anime_avg_rating=pl.col('rating_right').mean(),
)

In [None]:
# Mean `rating_right` for every genre value (one genre per exploded row).
combined_df.group_by('genre').agg(
    genre_avg_rating=pl.col('rating_right').mean(),
)

In [None]:
# Per-user rating behaviour and genre breadth, restricted to rows that carry a rating.
rated_rows = combined_df.filter(pl.col('rating_right').is_not_null())

# The genre explode repeats each rating once per genre of its title. Mean/std
# are computed from one row per (user, anime) so that titles with many genres
# do not weigh more than titles with few.
user_rating_stats = (
    rated_rows
    .unique(subset=['user_id', 'anime_id'])
    .group_by('user_id')
    .agg(
        pl.col('rating_right').mean().alias('avg_user_rating'),
        pl.col('rating_right').std().alias('rating_std_dev'),
    )
)

# Genre breadth is defined on the exploded rows (one genre per row).
user_genre_breadth = rated_rows.group_by('user_id').agg(
    pl.col('genre').n_unique().alias('unique_genres_count')
)

# Same columns, in the same order, as before:
# user_id, avg_user_rating, rating_std_dev, unique_genres_count.
user_journey_df = user_rating_stats.join(user_genre_breadth, on='user_id')

user_journey_df

In [None]:
# Mean number of distinct genres per user, extracted as a Python scalar.
mean_genres_per_user = pl.col('unique_genres_count').mean()
genre_exploration_threshold = user_journey_df.select(mean_genres_per_user).item()

genre_exploration_threshold


In [None]:
# Label each user: strictly above the threshold -> 'Explorer', otherwise 'Specialist'.
above_threshold = pl.col('unique_genres_count') > genre_exploration_threshold

user_journey_df = user_journey_df.with_columns(
    user_type=pl.when(above_threshold)
    .then(pl.lit('Explorer'))
    .otherwise(pl.lit('Specialist'))
)

user_journey_df


In [None]:
# Per segment: mean of the users' average ratings, mean of their rating
# standard deviations, and the number of users in the segment.
user_segment_analysis = user_journey_df.group_by('user_type').agg(
    mean_of_average_ratings=pl.col('avg_user_rating').mean(),
    mean_of_rating_std_dev=pl.col('rating_std_dev').mean(),
    user_count=pl.len(),
)

user_segment_analysis

In [None]:
# Grouped bars: one group per user type, one bar per rating metric.
segment_metrics = ['mean_of_average_ratings', 'mean_of_rating_std_dev']
segment_axis_labels = {
    'user_type': 'User Type',
    'value': 'Rating Metric',
    'variable': 'Metric',
}

fig = px.bar(
    user_segment_analysis.to_pandas(),
    x='user_type',
    y=segment_metrics,
    barmode='group',
    title='Rating Patterns: Genre Specialists vs. Explorers',
    labels=segment_axis_labels,
)
fig.show()

# Content Intelligence

In [None]:
# C: global mean rating, the prior used by the weighted popularity score.
# The genre explode repeats each rating once per genre of its title, so the
# mean is taken over one row per (user, anime) to keep multi-genre titles from
# dominating it. Null ratings are ignored by mean().
C = (
    combined_df
    .unique(subset=['user_id', 'anime_id'])
    .select(pl.col('rating_right').mean())
    .item()
)

print(f"Mean rating across all anime (C): {C:.2f}")

In [None]:
# Per-title vote count (v) and mean rating (R).
# The genre explode repeats each rating once per genre of its title; counting
# on the exploded rows would multiply v by the title's genre count. Reduce to
# one row per (user, anime) first so v is the real number of ratings.
anime_stats_df = (
    combined_df
    .unique(subset=['user_id', 'anime_id'])
    .group_by(['anime_id', 'name'])
    .agg(
        pl.col('rating_right').count().alias('v'),  # non-null ratings only
        pl.col('rating_right').mean().alias('R'),
    )
    # Drops rows with a null in any column, e.g. titles with no rating (R is null).
    .drop_nulls()
)

anime_stats_df

In [None]:
# m: the 0.9 quantile of the per-title vote counts, using the "higher"
# interpolation so the result is a value that occurs in the column.
vote_count_quantile = pl.col('v').quantile(0.9, "higher")
m = anime_stats_df.select(vote_count_quantile).item()
print(f"Minimum ratings required (m): {m}")

In [None]:
# Weighted rating: v/(v+m) * R + m/(v+m) * C.
# Titles with few votes are pulled toward the global mean C.
votes = pl.col('v')
own_weight = votes / (votes + m)
prior_weight = m / (votes + m)

anime_popularity_df = anime_stats_df.with_columns(
    popularity_score=own_weight * pl.col('R') + prior_weight * C
)

anime_popularity_df

In [None]:
# The ten rows with the highest popularity score.
top_10_popular_anime = (
    anime_popularity_df
    .sort(by='popularity_score', descending=True)
    .head(10)
)

top_10_popular_anime

In [None]:
# Bar chart of the top-10 titles, coloured by the same score that sets bar height.
top_10_labels = {'name': 'Anime', 'popularity_score': 'Popularity Score'}

fig = px.bar(
    top_10_popular_anime.to_pandas(),
    x='name',
    y='popularity_score',
    color='popularity_score',
    color_continuous_scale=px.colors.sequential.Viridis,
    title='Top 10 Most Popular Anime by Composite Score',
    labels=top_10_labels,
)
# Tilt the title labels so long names stay readable.
fig.update_layout(xaxis_tickangle=-45)
fig.show()

In [None]:
# One row per title with its mean `rating_right`.
# drop_nulls() removes rows with a null in any column, so titles missing
# `type`, `episodes`, `name` or a rating are excluded.
anime_agg_df = (
    combined_df
    .group_by('anime_id', 'name', 'type', 'episodes')
    .agg(avg_rating=pl.col('rating_right').mean())
    .drop_nulls()
)

anime_agg_df

In [None]:
# Scatter of episode count (log x-axis) against mean rating, coloured by type.
# NOTE(review): assumes `episodes` was inferred as a numeric column — confirm
# in the schema, since a log axis is only meaningful for numbers.
scatter_labels = {'episodes': 'Episodes (log scale)', 'avg_rating': 'Average Rating'}

fig = px.scatter(
    anime_agg_df.to_pandas(),
    x='episodes',
    y='avg_rating',
    color='type',
    hover_data=['name'],
    log_x=True,
    title='Episode Count vs. Average Rating by Type (Log Scale)',
    labels=scatter_labels,
)
fig.show()

In [None]:
# Distinct (anime_id, genre) pairs: the title-to-genre mapping without the rating rows.
genre_df = combined_df.select('anime_id', 'genre').unique()

genre_df

In [None]:
# Attach each title's genres to its aggregated row (inner join, the Polars default).
episode_genre_df = anime_agg_df.join(genre_df, on='anime_id', how='inner')
episode_genre_df


In [None]:
# Per genre: mean of the title-level average ratings and the number of
# (title, genre) rows, largest genres first.
(
    episode_genre_df
    .group_by('genre')
    .agg(
        avg_rating_per_genre=pl.col('avg_rating').mean(),
        anime_count=pl.len(),
    )
    .sort('anime_count', descending=True)
)

# Advanced Statistical Analysis