In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [2]:
from gensim.models import Word2Vec

In [4]:
# Load the data into a pandas DataFrame and drop every row with a missing value
df = pd.read_csv("Anime_data.csv")
df = df.dropna()


def _embed_list_column(raw, size=5):
    """Turn a list-like string column into one scalar embedding feature per row.

    Each cell has its first and last character stripped and is split on ", "
    into tokens. A Word2Vec model is trained on those token lists, every row
    is represented by the mean of its token vectors, and the first component
    of that mean is kept as the feature.

    Only the first component is kept on purpose: it is what the recorded
    outputs of this notebook contain, and it keeps the feature list used by
    the training and prediction cells unchanged. Assigning the whole
    (n_rows, size) matrix to a single column raises ValueError on recent
    pandas versions, so the selection is spelled out here.

    NOTE: `size=` is the gensim 3.x keyword (gensim 4 renamed it to
    `vector_size`).

    Parameters
    ----------
    raw : pandas.Series of str
        Raw column values.
    size : int
        Dimensionality of the Word2Vec vectors.

    Returns
    -------
    (Word2Vec, numpy.ndarray)
        The trained model and a 1-D array with one value per row of `raw`.
    """
    tokens = raw.apply(lambda text: text[1:-1].split(", "))
    w2v = Word2Vec(tokens, size=size, window=5, min_count=1)
    means = tokens.apply(lambda toks: np.mean([w2v.wv[t] for t in toks], axis=0))
    return w2v, np.vstack(means.to_numpy())[:, 0]


# Vectorize the genre, producer and studio columns (models are reused later
# to embed unseen rows)
model_genre, df["Genre"] = _embed_list_column(df["Genre"])
model_producer, df["Producer"] = _embed_list_column(df["Producer"])
model_studio, df["Studio"] = _embed_list_column(df["Studio"])

# Convert the text data into numerical representations: the number of
# distinct vocabulary terms present in each document.
# Title and Synopsis each get their own vectorizer so that neither fitted
# vocabulary is overwritten by the other.
title_vectorizer = TfidfVectorizer()
title_vectors = title_vectorizer.fit_transform(df["Title"])
df["Title"] = title_vectors.getnnz(axis=1)

synopsis_vectorizer = TfidfVectorizer()
synopsis_vectors = synopsis_vectorizer.fit_transform(df["Synopsis"])
df["Synopsis"] = synopsis_vectors.getnnz(axis=1)

# Backward-compatible alias: later cells call `vectorizer.transform(...)`,
# and the vectorizer they used to see was the one fitted on Synopsis.
vectorizer = synopsis_vectorizer

# Manage the Type column, which is a categorical feature
le = LabelEncoder()
df["Type"] = le.fit_transform(df["Type"])

# Bin the target variable into 5 categorical bins (pd.cut already returns a
# categorical column, so no extra cast is needed)
df['Rating'] = pd.cut(df['Rating'], bins=[0, 2, 4, 6, 8, 10], labels=[1, 2, 3, 4, 5], include_lowest=True)




In [6]:
# Preview the first five preprocessed rows (5 is the default for head)
df.head()

Unnamed: 0,Anime_id,Title,Genre,Synopsis,Type,Producer,Studio,Rating,ScoredBy,Popularity,Members,Episodes,Source,Aired,Link
0,1,2,-0.158387,132,5,-0.008276,0.04554,5,363889.0,39.0,704490.0,26.0,Original,"Apr 3, 1998 to Apr 24, 1999",https://myanimelist.net/anime/1/Cowboy_Bebop
1,5,5,-0.197001,110,0,0.018593,0.072957,5,111187.0,475.0,179899.0,1.0,Original,"Sep 1, 2001",https://myanimelist.net/anime/5/Cowboy_Bebop__...
2,6,1,-0.171961,124,5,0.086295,-0.063883,5,197451.0,158.0,372709.0,26.0,Manga,"Apr 1, 1998 to Sep 30, 1998",https://myanimelist.net/anime/6/Trigun
3,7,3,-0.211592,59,5,-0.008276,0.04554,4,31875.0,1278.0,74889.0,26.0,Original,"Jul 2, 2002 to Dec 24, 2002",https://myanimelist.net/anime/7/Witch_Hunter_R...
5,15,2,-0.165929,113,5,0.057731,-0.017057,5,48765.0,888.0,106468.0,145.0,Manga,"Apr 6, 2005 to Mar 19, 2008",https://myanimelist.net/anime/15/Eyeshield_21


In [85]:

# Split the data into training and testing sets (80/20, seeded)
X = df[["Title", "Genre", "Synopsis", "Type", "Producer", "Studio"]]
y = df["Rating"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the random forest classifier on the training set.
# The forest is seeded like the split: without random_state the bootstrap
# samples and feature choices differ on every run, so the accuracy printed
# below would not be reproducible.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model on the testing set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8188976377952756


In [86]:
# Reload the CSV untouched (no dropna, no feature encoding) and show the
# dtype of every original column.
df3 = pd.read_csv("Anime_data.csv")
df3.dtypes

Anime_id        int64
Title          object
Genre          object
Synopsis       object
Type           object
Producer       object
Studio         object
Rating        float64
ScoredBy      float64
Popularity    float64
Members       float64
Episodes      float64
Source         object
Aired          object
Link           object
dtype: object

In [254]:
import ast

# Three hand-written rows to score with the trained model.
# Genre / Producer / Studio tokens carry their own single quotes (e.g.
# "'Action'") so that they can be looked up in the Word2Vec models as-is.
new_data = dict(
    Title=[
        "My Hero Academia Season 4",
        "Attack on Titan Season 4",
        "FullMetal Alchemist: Brotherhood",
    ],
    Genre=[
        ["'Action'", "'Adventure'"],
        ["'Adventure'"],
        ["'Drama'"],
    ],
    Synopsis=[
        "The story follows a young boy named Izuku Midoriya who dreams of becoming a hero in a world where most people possess powers known as Quirks. Despite being born without a Quirk, he is scouted by the world's greatest hero and enrolls in a school for professional heroes.",
        "Several hundred years ago, humans were nearly exterminated by Titans. Titans are typically several stories tall, and seem to have no intelligence, devouring human beings and other creatures on sight. A small percentage of humanity survived by walling themselves in a city protected by extremely high walls, even taller than the biggest of Titans.",
        "The story is set in a fictional universe in which alchemy is one of the most advanced scientific techniques known to man. It follows two alchemist brothers named Edward and Alphonse Elric who, after a failed attempt to bring their deceased mother back to life using alchemy, set out on a journey to find the Philosopher's Stone, hoping to restore their bodies to their original forms.",
    ],
    Type=["TV", "TV", "TV"],
    Producer=[
        ["'Dentsu'"],
        ["'Wit Studio'", "'Production I.G'"],
        ["'Aniplex'", "'Square Enix'", "'Mainichi Broadcasting System'"],
    ],
    Studio=[
        ["'Bones'"],
        ["'Wit Studio'", "'Production I.G'"],
        ["'Bones'"],
    ],
)
new_df = pd.DataFrame(data=new_data)


In [255]:
# Show the three hand-written rows before any encoding
new_df.head(n=3)

Unnamed: 0,Title,Genre,Synopsis,Type,Producer,Studio
0,My Hero Academia Season 4,"['Action', 'Adventure']",The story follows a young boy named Izuku Mido...,TV,['Dentsu'],['Bones']
1,Attack on Titan Season 4,['Adventure'],"Several hundred years ago, humans were nearly ...",TV,"['Wit Studio', 'Production I.G']","['Wit Studio', 'Production I.G']"
2,FullMetal Alchemist: Brotherhood,['Drama'],The story is set in a fictional universe in wh...,TV,"['Aniplex', 'Square Enix', 'Mainichi Broadcast...",['Bones']


In [256]:
# Column dtypes of the hand-written rows before encoding
new_df.dtypes

Title       object
Genre       object
Synopsis    object
Type        object
Producer    object
Studio      object
dtype: object

In [257]:
def _mean_embedding(tokens, w2v):
    """Average the Word2Vec vectors of the tokens that `w2v` knows.

    Tokens that were never seen during training cannot be looked up (a plain
    `w2v.wv[token]` raises KeyError), so they are skipped and reported with a
    warning instead of aborting the whole prediction.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one row, spelled exactly as in the training vocabulary.
    w2v : gensim Word2Vec
        A previously trained model.

    Returns
    -------
    numpy.ndarray
        Mean vector of the known tokens, or an all-zero vector of the model's
        dimensionality when none of the tokens is known.
    """
    import warnings

    tokens = list(tokens)
    known = [w2v.wv[tok] for tok in tokens if tok in w2v.wv]
    if len(known) < len(tokens):
        unknown = [tok for tok in tokens if tok not in w2v.wv]
        warnings.warn(f"Ignoring tokens missing from the Word2Vec vocabulary: {unknown}")
    if not known:
        return np.zeros(w2v.wv.vector_size, dtype=np.float32)
    return np.mean(known, axis=0)


# Get the vector representations of the genres, producers, and studios using the previously trained Word2Vec models
new_df["Genre"] = new_df["Genre"].apply(lambda toks: _mean_embedding(toks, model_genre))
new_df["Producer"] = new_df["Producer"].apply(lambda toks: _mean_embedding(toks, model_producer))
new_df["Studio"] = new_df["Studio"].apply(lambda toks: _mean_embedding(toks, model_studio))

In [258]:
# Show the rows now that Genre / Producer / Studio hold mean embedding vectors
new_df.head(n=3)

Unnamed: 0,Title,Genre,Synopsis,Type,Producer,Studio
0,My Hero Academia Season 4,"[-0.16943127, 0.34769958, 1.201236, -0.4529987...",The story follows a young boy named Izuku Mido...,TV,"[0.06853441, 0.1947563, 0.13894895, 0.03537272...","[0.072956584, -0.048882958, -0.00915616, -0.07..."
1,Attack on Titan Season 4,"[-0.19130664, 0.37928888, 1.114392, -0.4426684...","Several hundred years ago, humans were nearly ...",TV,"[0.0044625206, 0.022601847, 0.08155207, 0.0237...","[-0.0047950465, 0.0001430707, 0.054405253, 0.0..."
2,FullMetal Alchemist: Brotherhood,"[-0.23548013, 0.38037443, 1.3347614, -0.430265...",The story is set in a fictional universe in wh...,TV,"[-0.0028374132, 0.14565973, 0.13988759, 0.0450...","[0.072956584, -0.048882958, -0.00915616, -0.07..."


In [259]:
# Stack each column's per-row vectors into a 2-D array and keep its first
# component as the scalar feature. This is the value the training columns
# hold (compare the displayed vectors with the final scalar columns).
# Assigning the whole 2-D array to a single column only worked by silently
# dropping the other components and raises ValueError on recent pandas, so
# the selection is made explicit.
new_df["Genre"] = np.vstack(new_df["Genre"].to_numpy())[:, 0]
new_df["Producer"] = np.vstack(new_df["Producer"].to_numpy())[:, 0]
new_df["Studio"] = np.vstack(new_df["Studio"].to_numpy())[:, 0]

In [260]:
# Convert the text data into numerical representations.
# `vectorizer` was fitted on Title and then re-fitted on Synopsis, so by now
# it only knows the synopsis vocabulary. The classifier's Title feature was
# computed against the title vocabulary, so that vocabulary is rebuilt here
# from the raw titles (same file and same dropna as the training data;
# df["Title"] itself has already been overwritten with counts).
title_vocab_vectorizer = TfidfVectorizer().fit(pd.read_csv("Anime_data.csv").dropna()["Title"])
title_vectors = title_vocab_vectorizer.transform(new_df["Title"])
new_df["Title"] = title_vectors.getnnz(axis=1)

# Synopsis counts use `vectorizer`, which is fitted on the training synopses
synopsis_vectors = vectorizer.transform(new_df["Synopsis"])
new_df["Synopsis"] = synopsis_vectors.getnnz(axis=1)

# Manage the Type column with the label encoder fitted on the training data
new_df["Type"] = le.transform(new_df["Type"])


In [261]:
# Assemble the feature matrix with the same six columns, in the same order,
# that the classifier was fitted on
feature_columns = ["Title", "Genre", "Synopsis", "Type", "Producer", "Studio"]
new_X = new_df[feature_columns]

# Score the new rows with the trained random forest
new_y_pred = model.predict(new_X)


In [262]:
# Predicted rating bin (labels 1-5) for each of the new rows
new_y_pred

array([4, 4, 4])

In [263]:
# Show the fully encoded feature rows that were fed to the model
new_df.head()

Unnamed: 0,Title,Genre,Synopsis,Type,Producer,Studio
0,4,-0.169431,37,0,0.068534,0.072957
1,4,-0.191307,43,0,0.004463,-0.004795
2,3,-0.23548,50,0,-0.002837,0.072957
