In [1]:
import os.path
#Input file is animes.xlsx
fname = os.path.join('..', 'resource', 'asnlib', 'publicdata', 'animes.xlsx')

# Getting all the unique genres present in the data

import pandas as pd

# Load the Excel file with the specified sheet name
df = pd.read_excel(fname, sheet_name="Extracted", engine='openpyxl')

# Concatenate all entries in the column and split by comma
all_words = ','.join(df['genres']).split(',')
# Remove leading/trailing whitespaces and convert to lowercase
all_words_cleaned = [word.strip().lower() for word in all_words]
# Get unique words
unique_words = set(all_words_cleaned)
# Print unique words
print(f'Found {len(unique_words)} unique genres: {", ".join(sorted(unique_words))}')


Found 44 unique genres: action, adventure, cars, comedy, dementia, demons, drama, ecchi, fantasy, game, harem, hentai, historical, horror, josei, kids, magic, martial arts, mecha, military, music, mystery, parody, police, psychological, romance, samurai, school, sci-fi, seinen, shoujo, shoujo ai, shoujo", shounen, shounen ai, slice of life, space, sports, super power, supernatural, thriller, vampire, yaoi, yuri


In [2]:
# Getting the top 10 genres present in the data

from collections import Counter

# Count occurrences of each word
word_counts = Counter(all_words_cleaned)
# Find top 10 most occurring words
top_10_words = word_counts.most_common(10)

# Print top 10 most occurring words
print("Top 10 most occurring words:")
for word, count in top_10_words:
    print(f"{word}: {count}")


Top 10 most occurring words:
comedy: 1202
action: 995
fantasy: 778
slice of life: 550
school: 545
music: 507
drama: 492
supernatural: 437
adventure: 428
romance: 421


In [3]:
# Creating a filtered dataset in which we only have the rows that have only those top 10 genres

# Find top 10 most occurring words
top_10_words = [word for word, _ in word_counts.most_common(10)]

# Function to filter rows based on top 10 words
def filter_rows(row):
    words_in_row = [word.strip().lower() for word in row['genres'].split(',')]
    return any(word in top_10_words for word in words_in_row)

# Filter rows
filtered_df = df[df.apply(filter_rows, axis=1)].copy()

# Remove other columns
#filtered_df = filtered_df[['genres']]  # Adjust column name as needed

# Remove other words from entries
filtered_df['genres'] = filtered_df['genres'].apply(lambda x: ','.join(word for word in x.split(',') if word.strip().lower() in top_10_words))

# Reset index
filtered_df.reset_index(drop=True, inplace=True)

# Display the filtered DataFrame
print(f'{len(filtered_df)} of {len(df)} remaining')

#filtered_df.to_excel("anime_filtered_data.xlsx", index=False)

3051 of 3479 remaining


In [4]:
# Creating a one hot encoding for the genres column and generating that data file

# Concatenate all entries in the column and split by comma
all_words = ','.join(filtered_df['genres']).split(',')

# Remove leading/trailing whitespaces and convert to lowercase
all_words_cleaned = [word.strip().lower() for word in all_words]

# Get unique words
unique_words = set(all_words_cleaned)

# Count occurrences of each word
word_counts = Counter(all_words_cleaned)

# Find top 10 most occurring words
top_10_words = [word for word, _ in word_counts.most_common(10)]

# Function to filter rows based on top 10 words
def filter_rows(row):
    words_in_row = [word.strip().lower() for word in row['genres'].split(',')]
    return any(word in top_10_words for word in words_in_row)

# Filter rows based on the presence of top 10 genres and operate on a copy to avoid SettingWithCopyWarning
filtered_df = filtered_df[filtered_df.apply(filter_rows, axis=1)].copy()

# Now we can safely apply transformations to 'genres' without causing SettingWithCopyWarning
filtered_df['genres'] = filtered_df['genres'].apply(lambda x: ','.join(word for word in x.split(',') if word.strip().lower() in top_10_words))

# One-hot encode the column with top 10 words
one_hot_encoded_df = filtered_df['genres'].str.get_dummies(sep=',')

# Reset index
one_hot_encoded_df.reset_index(drop=True, inplace=True)

# Display the one-hot encoded DataFrame
#print(one_hot_encoded_df)

# Combine one-hot encoded columns with original DataFrame
combined_df = pd.concat([filtered_df, one_hot_encoded_df], axis=1)
print(f'{len(combined_df)} of {len(df)} remaining')
#combined_df.to_excel("filtered_data.xlsx", index=False)


3051 of 3479 remaining


In [5]:
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd

# Load your dataset
df = combined_df

# Convert 'start_date' to a datetime, if it's not already
df['start_date'] = pd.to_datetime(df['start_date'], errors='coerce')

# Extract numerical features from 'start_date'
df['year'] = df['start_date'].dt.year
df['month'] = df['start_date'].dt.month
df['day'] = df['start_date'].dt.day
# Drop the original 'start_date' column
df = df.drop('start_date', axis=1)

# Select features and target 
X = df[['media', 'episodes', 'members', 'year', 'month',
        'source', 'season_cleaned', 'Action', 'Adventure', 'Comedy', 'Drama',
        'Fantasy', 'Music', 'Romance', 'School', 'Slice of Life', 'Supernatural']]
y = df['rating']

# Convert categorical columns to numeric using One-Hot Encoding if not already done
X = pd.get_dummies(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler to the training data and transform it
X_train_scaled = scaler.fit_transform(X_train)

# Transform the test data
X_test_scaled = scaler.transform(X_test)


In [33]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import pandas as pd

def simple_nn():
    return MLPRegressor(hidden_layer_sizes=(10,), max_iter=200, random_state=42)

# You can either replace the RHS below with your own function, or define a function "solution_nn" directly
#solution_nn = simple_nn

###
### YOUR CODE HERE
###
def solution_nn():
    
   
    genres_df = df['genres'].str.get_dummies(sep=',')
    
    
    le = LabelEncoder()
    df['source_encoded'] = le.fit_transform(df['source'])
    
   
    X = pd.concat([genres_df, df[['members', 'source_encoded']]], axis=1)
    y = df['rating']
    
    
    model = MLPRegressor(hidden_layer_sizes=(50, 50), activation='logistic', alpha=0.0001, 
                         learning_rate='adaptive', random_state=42)
    

    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('regressor', model)
    ])
    

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    

    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print("MSE:", mse)
    
    return pipeline.named_steps['regressor']



In [34]:
import time

mlp = solution_nn()
assert isinstance(mlp, MLPRegressor), 'Solution must be an MLPRegressor'
start = time.time()
mlp.fit(X_train_scaled, y_train)
duration = time.time()-start
print(f'Model trained in {int(duration)} seconds')
predictions = mlp.predict(X_test_scaled)
mse = mean_squared_error(y_test, predictions)
print(f'MSE={mse}')

MSE: 0.47626766757925215
Model trained in 6 seconds
MSE=0.39867133402827115




In [35]:
assert mse < 0.4, 'The network\'s error was too high for full credit'
print('The error was low enough for full credit on accuracy')

The error was low enough for full credit on accuracy


In [36]:
assert mse < 0.5, 'The network\'s error was too high for partial credit'
print('The error was low enough for partial credit')

The error was low enough for partial credit


In [37]:
# Testing that you can achieve high accuracy in a short amount of time
assert duration < 30, 'Your MLP training took too long'
assert mse < 0.4, 'Ineligible for points for speed due to low accuracy'
print('Your MLP receives full credit for accuracy and speed')

Your MLP receives full credit for accuracy and speed
