In [1]:
import numpy as np
import sklearn 
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [2]:
animeData = pd.read_csv('./animes.csv')
print(animeData)

         uid                                        title  \
0      28891                      Haikyuu!! Second Season   
1      23273                      Shigatsu wa Kimi no Uso   
2      34599                                Made in Abyss   
3       5114             Fullmetal Alchemist: Brotherhood   
4      31758             Kizumonogatari III: Reiketsu-hen   
...      ...                                          ...   
19306  32979                                Flip Flappers   
19307    123                                Fushigi Yuugi   
19308   1281                             Gakkou no Kaidan   
19309    450  InuYasha Movie 2: Kagami no Naka no Mugenjo   
19310     87     Mobile Suit Gundam: Char's Counterattack   

                                                synopsis  \
0      Following their participation at the Inter-Hig...   
1      Music accompanies the path of the human metron...   
2      The Abyss—a gaping chasm stretching down into ...   
3      "In order for someth

In [3]:
import re

uniqueGenres = []
cleanedGenres = []
for genre in animeData['genre']:
    currentGenre = genre.split(',')
    currentGenreCleaned = re.sub(r'[\[\]\(\)\"\,]', '', genre)  # Remove non-alphanumeric characters
    # print(currentGenreCleaned)
    cleanedGenres.append(currentGenreCleaned)
    finalGenreSplit = currentGenreCleaned.split("' '")  # Split into individual genres
    for individualGenre in finalGenreSplit:
        individualGenre = re.sub(r'[\']', '', individualGenre)
        if individualGenre not in uniqueGenres and individualGenre != '':
            uniqueGenres.append(individualGenre)

print (uniqueGenres)



['Comedy', 'Sports', 'Drama', 'School', 'Shounen', 'Music', 'Romance', 'Sci-Fi', 'Adventure', 'Mystery', 'Fantasy', 'Action', 'Military', 'Magic', 'Supernatural', 'Vampire', 'Slice of Life', 'Demons', 'Historical', 'Super Power', 'Mecha', 'Parody', 'Samurai', 'Seinen', 'Police', 'Psychological', 'Josei', 'Space', 'Kids', 'Shoujo Ai', 'Ecchi', 'Shoujo', 'Horror', 'Shounen Ai', 'Cars', 'Martial Arts', 'Game', 'Thriller', 'Dementia', 'Harem', 'Hentai', 'Yaoi', 'Yuri']


In [4]:
cleanedGenres
animeData['text'] = animeData['title'] + " " + animeData['synopsis']
print(cleanedGenres[:3])
print(animeData['text'][:3])

["'Comedy' 'Sports' 'Drama' 'School' 'Shounen'", "'Drama' 'Music' 'Romance' 'School' 'Shounen'", "'Sci-Fi' 'Adventure' 'Mystery' 'Drama' 'Fantasy'"]
0    Haikyuu!! Second Season Following their partic...
1    Shigatsu wa Kimi no Uso Music accompanies the ...
2    Made in Abyss The Abyss—a gaping chasm stretch...
Name: text, dtype: object


In [5]:
import re

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)  # remove punctuation
    text = re.sub(r'\s+', ' ', text)         # remove extra spaces
    text = text.lower()
    return text

animeData['text'] = animeData['text'].apply(clean_text)
print(animeData['text'][:3])

0    haikyuu second season following their particip...
1    shigatsu wa kimi no uso music accompanies the ...
2    made in abyss the abyssa gaping chasm stretchi...
Name: text, dtype: object


In [6]:
from sklearn.preprocessing import MultiLabelBinarizer

# Convert string to list
formattedGenres = []
for genre in animeData['genre']:
    currentGenre = genre.split(',')
    currentGenreCleaned = re.sub(r'[\[\]\(\)\"\,]', '', genre)  # Remove non-alphanumeric characters
    finalGenreSplit = currentGenreCleaned.split("' '")  # Split into individual genres
    tempList = []
    for individualGenre in finalGenreSplit: 
        if individualGenre != '':
            individualGenre = re.sub(r'[\']', '', individualGenre)
            tempList.append(individualGenre.strip())
    formattedGenres.append(tempList)

print(formattedGenres[:3])
print(animeData['genre'][:3])
print(animeData['title'][:3])

# for i, genres in enumerate(formattedGenres):
#     animeData['genre_list'][i] = genres
        
print(formattedGenres[0][0])

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(formattedGenres)

print(mlb.classes_)  # All genre labels

[['Comedy', 'Sports', 'Drama', 'School', 'Shounen'], ['Drama', 'Music', 'Romance', 'School', 'Shounen'], ['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'Fantasy']]
0    ['Comedy', 'Sports', 'Drama', 'School', 'Shoun...
1    ['Drama', 'Music', 'Romance', 'School', 'Shoun...
2    ['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F...
Name: genre, dtype: object
0    Haikyuu!! Second Season
1    Shigatsu wa Kimi no Uso
2              Made in Abyss
Name: title, dtype: object
Comedy
['Action' 'Adventure' 'Cars' 'Comedy' 'Dementia' 'Demons' 'Drama' 'Ecchi'
 'Fantasy' 'Game' 'Harem' 'Hentai' 'Historical' 'Horror' 'Josei' 'Kids'
 'Magic' 'Martial Arts' 'Mecha' 'Military' 'Music' 'Mystery' 'Parody'
 'Police' 'Psychological' 'Romance' 'Samurai' 'School' 'Sci-Fi' 'Seinen'
 'Shoujo' 'Shoujo Ai' 'Shounen' 'Shounen Ai' 'Slice of Life' 'Space'
 'Sports' 'Super Power' 'Supernatural' 'Thriller' 'Vampire' 'Yaoi' 'Yuri']


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fill missing values in 'text' column with empty strings for processing (Might remove them from the list of anime to train on later)
animeData['text'] = animeData['text'].fillna('')

# Create TF-IDF features from the 'text' column as vectorized input for the model (Limited to top 5000 words, removing English stop words)
tfidf = TfidfVectorizer(
    max_features=5000, 
    stop_words='english',
    ngram_range=(1,2)  # unigrams + bigrams
) # 

X = tfidf.fit_transform(animeData['text'])

In [8]:
from sklearn.model_selection import train_test_split

# First split: train + temp
X_train, X_temp, y_train, y_temp, idx_train, idx_temp = train_test_split(
    X, y, animeData.index, test_size=0.3, random_state=42
)

# Second split: validation + test
X_val, X_test, y_val, y_test, idx_val, idx_test = train_test_split(
    X_temp, y_temp, idx_temp, test_size=0.5, random_state=42
)

print(X_train.shape, X_val.shape, X_test.shape)

(13517, 5000) (2897, 5000) (2897, 5000)


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

model = OneVsRestClassifier(LogisticRegression(max_iter=2000))
model.fit(X_train, y_train)

0,1,2
,estimator,LogisticRegre...max_iter=2000)
,n_jobs,
,verbose,0

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,2000


In [None]:
from sklearn.metrics import classification_report, f1_score

y_val_probs = model.predict_proba(X_val)
thresholds = [0.2, 0.3, 0.4, 0.5]
best_f1 = 0
best_threshold = 0.5

for t in thresholds:
    y_val_pred = (y_val_probs >= t).astype(int)
    f1 = f1_score(y_val, y_val_pred, average='micro')
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t

print("Best threshold:", best_threshold, "F1-micro on validation:", best_f1)

# --- 4️⃣ Predict on test set using best threshold ---
y_test_probs = model.predict_proba(X_test)
y_test_pred = (y_test_probs >= best_threshold).astype(int)

# Print classification report
print(classification_report(y_test, y_test_pred, target_names=mlb.classes_))


# Convert to readable genre names
true_labels = mlb.inverse_transform(y_test)
pred_labels = mlb.inverse_transform(y_test_pred)

# Compare first 100 samples
for i in range(100):
    print("TEXT:", animeData.loc[idx_test[i], 'title'])
    print("ACTUAL:", true_labels[i])
    print("PREDICTED:", pred_labels[i])
    print("-----")

Best threshold: 0.2 F1-micro on validation: 0.5744342775738316
               precision    recall  f1-score   support

       Action       0.52      0.81      0.64       638
    Adventure       0.50      0.74      0.59       481
         Cars       1.00      0.29      0.44        28
       Comedy       0.45      0.94      0.61       959
     Dementia       0.56      0.28      0.38        85
       Demons       0.55      0.14      0.23        76
        Drama       0.41      0.60      0.49       470
        Ecchi       0.58      0.14      0.23        98
      Fantasy       0.46      0.74      0.57       544
         Game       0.88      0.26      0.40        58
        Harem       0.50      0.01      0.03        69
       Hentai       0.58      0.89      0.71       372
   Historical       0.66      0.36      0.47       182
       Horror       0.71      0.12      0.21        82
        Josei       0.00      0.00      0.00        13
         Kids       0.49      0.72      0.58       381
 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [11]:
for i, genre in enumerate(mlb.classes_):
    top10 = np.argsort(model.estimators_[i].coef_[0])[-10:]
    print(f"\nTop words for {genre}:")
    print([tfidf.get_feature_names_out()[j] for j in top10])


Top words for Action:
['strike', 'hunter', 'iii', 'arc', 'lupin', 'precure', 'fighting', 'gintama', 'fight', 'battle']

Top words for Adventure:
['saiyuuki', 'travel', 'land', 'conan', 'lupin', 'world', 'bouken', 'adventures', 'journey', 'adventure']

Top words for Cars:
['takumi', 'trilogy', 'driver', 'cars', 'race', 'bus', 'stage', 'racing', 'initial', 'car']

Top words for Comedy:
['gintama', 'doraemon', 'source ann', 'pokemon', 'funny', 'parody', 'lupin', 'shorts', 'gag', 'comedy']

Top words for Dementia:
['taku', 'work', 'experimental', 'takashi', 'keiichi', 'collaborative', 'short', 'animation', 'tanaami', 'film']

Top words for Demons:
['berserk', 'choujin', 'devil', 'natsume', 'inuyasha', 'maou', 'oni', 'youkai', 'demons', 'demon']

Top words for Drama:
['lives', 'earthquake', 'yuuki', 'mother', 'drawn', 'kyojin', 'hi', 'natsume', 'educational film', 'educational']

Top words for Ecchi:
['queens blade', 'witches', 'perverted', 'magical', 'ova', 'girls', 'academy', 'specials',