In [129]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
import re
from sklearn.preprocessing import MinMaxScaler

In [130]:


df = pd.read_csv("Anime_data.csv",)
df = df.dropna()
df = df[["Title", "Genre", "Synopsis", "Type", "Producer", "Studio","Rating"]]
def clean_text(text):
    # Remove special characters, excluding commas
    text = re.sub(r'[^\w\s,]', '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove trailing and leading whitespaces
    text = text.strip()

    #value of list 
    text = text.split(",")
    
    return text

df['Genre'] = df['Genre'].astype(str).apply(lambda x: clean_text(x))
df['Producer'] = df['Producer'].astype(str).apply(lambda x: clean_text(x))
df['Studio'] = df['Studio'].astype(str).apply(lambda x: clean_text(x))
df.head()



Unnamed: 0,Title,Genre,Synopsis,Type,Producer,Studio,Rating
0,Cowboy Bebop,"[action, adventure, comedy, drama, scifi, ...","In the year 2071, humanity has colonized sever...",TV,[bandai visual],[sunrise],8.81
1,Cowboy Bebop: Tengoku no Tobira,"[action, space, drama, mystery, scifi]","Another day, another bounty—such is the life o...",Movie,"[sunrise, bandai visual]",[bones],8.41
2,Trigun,"[action, scifi, adventure, comedy, drama, ...","Vash the Stampede is the man with a $$60,000,0...",TV,[victor entertainment],[madhouse],8.31
3,Witch Hunter Robin,"[action, magic, police, supernatural, dram...",Witches are individuals with special powers li...,TV,[bandai visual],[sunrise],7.34
5,Eyeshield 21,"[action, sports, comedy, shounen]",Sena is like any other shy kid starting high s...,TV,"[tv tokyo, nihon ad systems, tv tokyo music,...",[studio gallop],8.05


In [131]:
# Get the unique values in the 'Genre' column
unique_genres = set([genre for row in df['Genre'] for genre in row])
unique_Producer = set([Producer for row in df['Producer'] for Producer in row])
unique_Studio = set([Studio for row in df['Studio'] for Studio in row])

# Print the unique genres
print(len(unique_genres))
print(len(unique_Producer))
print(len(unique_Studio))

80
1140
499


In [132]:

# Define the MultiLabelBinarizer object
mlb = MultiLabelBinarizer()

# Apply the MultiLabelBinarizer to the Genre, Studio, and Producer columns of the dataframe
new_df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['Genre']), columns=mlb.classes_, index=df.index)], axis=1)
new_df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['Studio']), columns=mlb.classes_, index=df.index)], axis=1)
new_df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['Producer']), columns=mlb.classes_, index=df.index)], axis=1)

# one-hot encode the Type column
type_df = pd.get_dummies(df['Type'])


# Drop the original Genre, Studio, and Producer columns from the dataframe
new_df = new_df.drop(['Genre', 'Studio', 'Producer', 'Type'], axis=1)




In [133]:
new_df.columns

Index(['Title', 'Synopsis', 'Rating', ' 12 diary holders', ' 1st place',
       ' 3xcube', ' 4kids entertainment', ' 501st joint fighter wing', ' 5pb',
       ' 81 produce',
       ...
       'yaoqi', 'yellow film', 'yomiko advertising', 'yomiuri advertising',
       'yomiuri telecasting', 'yoon039s color', 'youmex', 'ytv',
       'ziz entertainment ziz', 'zyc'],
      dtype='object', length=1143)

In [134]:
# Convert the text data into numerical representations
vectorizer = TfidfVectorizer()
title_vectors = vectorizer.fit_transform(new_df["Title"])
new_df["Title"] = [str (item) for item in new_df["Title"]]
new_df["Title"] = title_vectors.getnnz(axis=1)

synopsis_vectors = vectorizer.fit_transform(new_df["Synopsis"].apply(lambda x: np.str_(x)))
new_df["Synopsis"] = synopsis_vectors.getnnz(axis=1)

In [135]:
# Bin the target variable into 5 categorical bins
new_df['Rating'] = pd.cut(new_df['Rating'], bins=[0, 2, 4, 6, 8, 10], labels=[1, 2, 3, 4, 5], include_lowest=True)
# Convert the binned target variable into categorical data
df['Rating'] = new_df['Rating'].astype('category')

In [136]:
X= new_df.drop(columns="Rating")
y= new_df['Rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Train the random forest classifier on the training set
model = RandomForestClassifier()
model.fit(X_train, y_train)



# Evaluate the model on the testing set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.7712053571428571


In [137]:
X.head()

Unnamed: 0,Title,Synopsis,12 diary holders,1st place,3xcube,4kids entertainment,501st joint fighter wing,5pb,81 produce,abc animation,...,yaoqi,yellow film,yomiko advertising,yomiuri advertising,yomiuri telecasting,yoon039s color,youmex,ytv,ziz entertainment ziz,zyc
0,2,132,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,110,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,124,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,59,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,2,113,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [169]:
new_data = {
    'Title': ["My Hero Academia Season 4", "Attack on Titan Season 4", "FullMetal Alchemist: Brotherhood"],
    'Genre': ["Action, Adventure, Superhero", "Action, Drama, Fantasy", "Adventure, Drama, Fantasy, Science fiction"],
    'Synopsis': ["The story follows a young boy named Izuku Midoriya who dreams of becoming a hero in a world where most people possess powers known as Quirks. Despite being born without a Quirk, he is scouted by the world's greatest hero and enrolls in a school for professional heroes.",                "Several hundred years ago, humans were nearly exterminated by Titans. Titans are typically several stories tall, and seem to have no intelligence, devouring human beings and other creatures on sight. A small percentage of humanity survived by walling themselves in a city protected by extremely high walls, even taller than the biggest of Titans.",                "The story is set in a fictional universe in which alchemy is one of the most advanced scientific techniques known to man. It follows two alchemist brothers named Edward and Alphonse Elric who, after a failed attempt to bring their deceased mother back to life using alchemy, set out on a journey to find the Philosopher's Stone, hoping to restore their bodies to their original forms."],
    'Type': ["TV", "TV", "TV"],
    'Producer': ["Funimation, MBS, Dentsu", "Wit Studio, Hajime Isayama, Production I.G", "Aniplex, Square Enix, Mainichi Broadcasting System"],
    'Studio': ["Bones, MBS", "Wit Studio, Production I.G", "Aniplex, Bones"],
}

test = pd.DataFrame(new_data)

In [170]:
test.head()

Unnamed: 0,Title,Genre,Synopsis,Type,Producer,Studio
0,My Hero Academia Season 4,"Action, Adventure, Superhero",The story follows a young boy named Izuku Mido...,TV,"Funimation, MBS, Dentsu","Bones, MBS"
1,Attack on Titan Season 4,"Action, Drama, Fantasy","Several hundred years ago, humans were nearly ...",TV,"Wit Studio, Hajime Isayama, Production I.G","Wit Studio, Production I.G"
2,FullMetal Alchemist: Brotherhood,"Adventure, Drama, Fantasy, Science fiction",The story is set in a fictional universe in wh...,TV,"Aniplex, Square Enix, Mainichi Broadcasting Sy...","Aniplex, Bones"


In [171]:
test['Genre'] = test['Genre'].astype(str).apply(lambda x: clean_text(x))
test['Producer'] = test['Producer'].astype(str).apply(lambda x: clean_text(x))
test['Studio'] = test['Studio'].astype(str).apply(lambda x: clean_text(x))
test.head()

Unnamed: 0,Title,Genre,Synopsis,Type,Producer,Studio
0,My Hero Academia Season 4,"[action, adventure, superhero]",The story follows a young boy named Izuku Mido...,TV,"[funimation, mbs, dentsu]","[bones, mbs]"
1,Attack on Titan Season 4,"[action, drama, fantasy]","Several hundred years ago, humans were nearly ...",TV,"[wit studio, hajime isayama, production ig]","[wit studio, production ig]"
2,FullMetal Alchemist: Brotherhood,"[adventure, drama, fantasy, science fiction]",The story is set in a fictional universe in wh...,TV,"[aniplex, square enix, mainichi broadcasting...","[aniplex, bones]"


In [172]:
# Define the MultiLabelBinarizer object
mlb = MultiLabelBinarizer()

# Apply the MultiLabelBinarizer to the Genre, Studio, and Producer columns of the dataframe
new_df2 = pd.concat([test, pd.DataFrame(mlb.fit_transform(test['Genre']), columns=mlb.classes_, index=test.index)], axis=1)
new_df2 = pd.concat([test, pd.DataFrame(mlb.fit_transform(test['Studio']), columns=mlb.classes_, index=test.index)], axis=1)
new_df2 = pd.concat([test, pd.DataFrame(mlb.fit_transform(test['Producer']), columns=mlb.classes_, index=test.index)], axis=1)

# one-hot encode the Type column
type_df = pd.get_dummies(df['Type'])


# Drop the original Genre, Studio, and Producer columns from the dataframe
new_df2 = new_df2.drop(['Genre', 'Studio', 'Producer', 'Type'], axis=1)

# Convert the text data into numerical representations
vectorizer = TfidfVectorizer(lowercase = False)
#new_df2 = [str (item) for item in new_df2]

title_vectors = vectorizer.fit_transform(new_df2["Title"])
new_df2["Title"] = title_vectors.getnnz(axis=1)

synopsis_vectors = vectorizer.fit_transform(new_df2["Synopsis"].apply(lambda x: np.str_(x)))
new_df2["Synopsis"] = synopsis_vectors.getnnz(axis=1)


In [173]:
len(new_df2.columns)

11

In [174]:
new_df2.columns

# Create a new DataFrame with 1142 columns and all rows set to 0
new_X_test = pd.DataFrame(0, index=new_df2.index, columns=np.arange(1142))

# Copy the 11 features from X_test to the first 11 columns of new_X_test
new_X_test.iloc[:, :len(new_df2.columns)] = new_df2.values

# Pass the new_X_test DataFrame to the predict method of your model
y_pred = model.predict(new_X_test)



In [175]:
y_pred

array([4, 4, 4])