In [1]:
import pandas as pd
import numpy as np

In [2]:
import joblib
# Load the serialized model artifact from an absolute /content path.
# NOTE(review): `model` is not referenced by any other cell visible here (the
# predictions use `early_theta`) — confirm whether this load is still needed.
# joblib.load unpickles arbitrary Python objects, so only load trusted files.
model = joblib.load('/content/model_lr_0.0025_lambda_5.pkl')

In [5]:
# Load the feature table. It is later passed to process_single_show, where its
# columns define the layout that a newly fetched show is aligned to.
df = pd.read_csv('/content/anime_final.csv')

In [30]:
# Remove the 'Unnamed: 0' column (presumably a row index saved with the CSV).
# Reassigning with errors='ignore' keeps this cell re-runnable: an in-place
# drop raises KeyError the second time the cell is executed.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [16]:
import joblib
# Load the scaler used to standardize the numeric columns. Only its
# .transform() is called on the prediction path (preprocess_custom with
# fit=False), so it is presumably already fitted — confirm against training.
scaler = joblib.load('/content/scaler.pkl')

In [24]:
# Parameter vector that process_single_show multiplies with the bias-augmented
# feature matrix (custom.dot(theta)) to produce the predicted score.
early_theta = joblib.load('/content/theta.pkl')

In [17]:
import requests
import json

def fetch_anime_info(anime_id, timeout=10):
    """Fetch the full Jikan v4 record for a single anime.

    Parameters
    ----------
    anime_id : int or str
        Identifier interpolated into the ``/v4/anime/{anime_id}/full`` URL.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without it a stalled connection would block the notebook forever.

    Returns
    -------
    dict or None
        The ``data`` member of the JSON body when the server answers with
        HTTP 200. ``None`` for any other status code or when the request or
        the decoding fails; a diagnostic message is printed in both cases.
    """
    url = f'https://api.jikan.moe/v4/anime/{anime_id}/full'
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.json()['data']
        print(f"Failed to fetch data for Anime ID: {anime_id}, Status Code: {response.status_code}")
        return None
    except Exception as e:
        # Deliberately best-effort: network errors, timeouts and malformed
        # payloads are reported and turned into None instead of propagating.
        print(f"Error fetching data for Anime ID: {anime_id}: {e}")
        return None

def format_new_anime_data(anime_json):
    """Flatten one anime record into a single-level dict.

    Scalar fields are copied as-is and must be present in ``anime_json``
    (a missing one raises ``KeyError``). List-valued fields are looked up
    with ``.get`` and collapsed to a ``", "``-joined string of each item's
    ``'name'``; a missing or empty list becomes the literal string "None".

    Parameters
    ----------
    anime_json : dict
        Record as returned by fetch_anime_info.

    Returns
    -------
    dict
        Flat mapping with the scalar keys first, then the joined list keys.
    """
    scalar_keys = (
        'title', 'type', 'source', 'episodes', 'rating', 'score',
        'rank', 'popularity', 'favorites', 'season', 'year',
    )
    list_keys = (
        'producers', 'licensors', 'studios', 'genres',
        'explicit_genres', 'themes', 'demographics',
    )

    flattened = {}
    for key in scalar_keys:
        flattened[key] = anime_json[key]

    for key in list_keys:
        entries = anime_json.get(key)
        if entries:
            flattened[key] = ", ".join(entry['name'] for entry in entries)
        else:
            flattened[key] = "None"

    return flattened

In [18]:
# Function to estimate rank using closest scores
def estimate_rank(row, non_nan_rows, k=4):
    """Estimate a rank from the k reference rows with the closest score.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must expose ``row['score']``.
    non_nan_rows : pandas.DataFrame
        Reference rows providing ``'score'`` and ``'rank'`` columns. The
        frame passed in is left untouched; a copy carries the helper column.
    k : int, optional
        Number of nearest-score neighbours to average over.

    Returns
    -------
    int
        Mean rank of the k nearest neighbours, rounded to the nearest integer.
    """
    # assign() returns a new frame, so the caller's data is not modified
    distance = (non_nan_rows['score'] - row['score']).abs()
    candidates = non_nan_rows.assign(score_diff=distance)
    nearest = candidates.nsmallest(k, 'score_diff')
    mean_rank = nearest['rank'].mean()
    return round(mean_rank)

In [19]:
def preprocess_anime_data(df, formatted_anime_df, split_and_encode, preprocess_custom, qt):
    """Encode newly fetched anime rows into the column layout of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Reference feature table. Its columns define the output layout and
        its rows with a non-null ``'rank'`` are used to estimate missing ranks.
    formatted_anime_df : pandas.DataFrame
        Rows built from format_new_anime_data. Not modified by this function.
    split_and_encode : callable
        ``split_and_encode(frame, column_names)`` returning the multi-hot
        encoding of comma-separated columns.
    preprocess_custom : callable
        ``preprocess_custom(frame, scaler, target_column, columns)`` returning
        ``(feature_matrix, y)``.
    qt : object
        The scaler/transformer forwarded to ``preprocess_custom``.

    Returns
    -------
    tuple
        ``(df_features, custom, titles)``: the aligned feature frame, the
        matrix returned by ``preprocess_custom`` and the ``'title'`` column
        of the encoded input rows.
    """
    # Work on a private copy so the caller's frame keeps its original columns
    anime_df = formatted_anime_df.copy()

    # Split and encode the comma-separated categorical features
    columns_to_encode = ['producers', 'genres', 'explicit_genres', 'themes']
    encoded_columns = split_and_encode(anime_df, columns_to_encode)
    encoded_columns = encoded_columns.rename(columns={"explicit_genres_None": "explicit_genres_nan"})

    # Swap the raw text columns for their encoded counterparts
    anime_df = pd.concat([anime_df.drop(columns=columns_to_encode), encoded_columns], axis=1)

    # One-hot encode the single-valued categorical features
    categorical_features = ['year', 'type', 'source', 'rating', 'season', 'licensors', 'studios', 'demographics']
    anime_df = pd.get_dummies(anime_df, columns=categorical_features, dtype=int)

    # Year dummies get a ".0" suffix to line up with the reference columns.
    # Names that already end in ".0" (year parsed as float) are left alone so
    # they do not turn into "year_2009.0.0" and silently miss every column.
    year_renames = {
        col: f"{col}.0"
        for col in anime_df.columns
        if col.startswith('year_') and not col.endswith('.0')
    }
    anime_df = anime_df.rename(columns=year_renames)

    # Keep only columns known to the reference table and lay them out in its
    # order; reference columns absent from the new rows are created as NaN.
    common_columns = df.columns.intersection(anime_df.columns)
    df_features = (
        anime_df[common_columns]
        .reindex(columns=df.columns)
        .reset_index(drop=True)
    )

    # Estimate ranks for rows whose rank is missing
    missing_rank = df_features['rank'].isna()
    if missing_rank.any():
        non_nan_rows = df.dropna(subset=['rank'])
        for idx, row in df_features[missing_rank].iterrows():
            df_features.loc[idx, 'rank'] = estimate_rank(row, non_nan_rows)

    # Everything still missing (unseen categories, absent columns) becomes 0
    df_features = df_features.fillna(0)

    columns_to_standardize = ['episodes', 'score', 'rank', 'popularity', 'favorites']

    # Use the scaler handed in by the caller rather than a notebook global,
    # so the function also works on a fresh kernel and with other scalers.
    custom, _ = preprocess_custom(df_features, qt, 'my_score', columns_to_standardize)

    return df_features, custom, anime_df['title']

In [31]:
def process_single_show(anime_id, df, split_and_encode, preprocess_custom, scaler, theta):
    """Fetch one anime, encode it, and predict its score.

    Parameters
    ----------
    anime_id : int or str
        Identifier handed to fetch_anime_info.
    df : pandas.DataFrame
        Reference feature table forwarded to preprocess_anime_data.
    split_and_encode, preprocess_custom : callable
        Encoding helpers forwarded to preprocess_anime_data.
    scaler : object
        Scaler forwarded to preprocess_anime_data.
    theta : array-like
        Parameter vector multiplied with the preprocessed feature matrix.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Title'`` and ``'Predicted Score'``; empty (columns only)
        when no data could be fetched for ``anime_id``.
    """
    raw_anime_data = fetch_anime_info(anime_id)

    # Guard clause: nothing fetched means nothing to predict
    if not raw_anime_data:
        print("No data found for the given Anime ID.")
        return pd.DataFrame(columns=['Title', 'Predicted Score'])

    single_row_df = pd.DataFrame([format_new_anime_data(raw_anime_data)])
    _, design_matrix, title = preprocess_anime_data(
        df, single_row_df, split_and_encode, preprocess_custom, scaler
    )

    predicted = design_matrix.dot(theta)
    return pd.DataFrame({'Title': title, 'Predicted Score': predicted})

In [21]:
# Function to split and encode multiple columns
def split_and_encode(df, column_names):
    """Multi-hot encode columns that hold ``", "``-separated values.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is only read; its columns are not converted or
        otherwise modified.
    column_names : iterable of str
        Columns to encode. Values are converted to ``str`` before splitting.

    Returns
    -------
    pandas.DataFrame
        One 0/1 indicator column per distinct value, named
        ``"{column}_{value}"``, indexed like ``df``. With no columns to
        encode the result is an empty frame that still carries ``df.index``.
    """
    encoded_dfs = []
    for column_name in column_names:
        # Cast on the fly rather than writing the string version back into df
        encoded_df = df[column_name].astype(str).str.get_dummies(sep=', ')
        # Prefix with the source column so equal values in different columns
        # do not collide
        encoded_df.columns = [f"{column_name}_{col}" for col in encoded_df.columns]
        encoded_dfs.append(encoded_df)

    if not encoded_dfs:
        # pd.concat raises on an empty list; return a column-less frame instead
        return pd.DataFrame(index=df.index)

    return pd.concat(encoded_dfs, axis=1)

In [22]:
def preprocess_custom(df, scaler, target_column, columns_to_standardize, fit=False):
    """Build the bias-augmented feature matrix from a feature frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'series_animedb_id'``, ``'title'`` and
        ``target_column``; all three are removed from the features.
    scaler : object
        Provides ``fit_transform`` and ``transform``.
    target_column : str
        Column dropped from the features. Its values are not returned.
    columns_to_standardize : list of str
        Feature columns replaced by the scaler's output.
    fit : bool, optional
        When true the scaler is fitted on these rows (``fit_transform``);
        otherwise the existing fit is applied (``transform``).

    Returns
    -------
    tuple
        ``(X_bias, y)`` where ``X_bias`` is the feature array with a leading
        column of ones and ``y`` is always the constant 0.
    """
    # Strip identifiers first, then the target, leaving only features
    features = (
        df.drop(columns=['series_animedb_id', 'title'])
          .drop(columns=target_column)
    )

    # Pick the scaler entry point once, then apply it to the selected columns
    rescale = scaler.fit_transform if fit else scaler.transform
    features[columns_to_standardize] = rescale(features[columns_to_standardize])

    # Leading column of ones acts as the bias/intercept term
    n_rows = features.shape[0]
    with_bias = np.c_[np.ones(n_rows), features]

    placeholder_target = 0
    return with_bias, placeholder_target

In [32]:
# ID interpolated into the Jikan URL by fetch_anime_info; the recorded output
# of this cell shows it resolving to "Hametsu no Mars".
anime_id = 413
# Returns a one-row frame with 'Title' and 'Predicted Score', or an empty
# frame with the same columns when the fetch fails.
single_show_df = process_single_show(anime_id, df, split_and_encode, preprocess_custom, scaler, early_theta)

# Display result
display(single_show_df)

Unnamed: 0,Title,Predicted Score
0,Hametsu no Mars,4.393878


In [34]:
# ID interpolated into the Jikan URL by fetch_anime_info; the recorded output
# of this cell shows it resolving to "Fullmetal Alchemist: Brotherhood".
anime_id = 5114
# NOTE(review): same call as the previous prediction cell with a different ID;
# `single_show_df` is overwritten here.
single_show_df = process_single_show(anime_id, df, split_and_encode, preprocess_custom, scaler, early_theta)

# Display result
display(single_show_df)

Unnamed: 0,Title,Predicted Score
0,Fullmetal Alchemist: Brotherhood,9.350652
