In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, BaggingRegressor, GradientBoostingRegressor
from sklearn.datasets import make_regression
from sklearn.linear_model import  TheilSenRegressor, RANSACRegressor, HuberRegressor, PassiveAggressiveRegressor, SGDRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.svm import SVR, NuSVR
from sklearn.neighbors import KNeighborsRegressor
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.cluster import KMeans

- This is the Top 100 anime rating dataset 

In [2]:
df = pd.read_csv('../input/top-100-anime-dataset-ratings/final_anime_dataset.csv')
df.head()

Unnamed: 0,Serial No.,Name,Links,Genre and Theme,No. of episodes,TV show,Genre: magic,Genre: adventure,Genre: psychological,Genre: comedy,Genre: drama,Genre: romance,Genre: mystery,Genre: action,Genre: fantasy,Rating
0,1,Fullmetal Alchemist: Brotherhood (TV),https://www.animenewsnetwork.com//encyclopedia...,"adventure,comedy,drama,fantasy,thriller,alchem...",64,1,0,1,0,1,1,0,0,0,1,9.09
1,2,Steins;Gate (TV),https://www.animenewsnetwork.com//encyclopedia...,"adventure,comedy,drama,mystery,psychological,r...",24,1,0,1,1,1,1,1,1,0,0,9.04
2,3,Clannad After Story (TV),https://www.animenewsnetwork.com//encyclopedia...,"comedy,drama,psychological,romance,supernatura...",25,1,0,0,1,1,1,1,0,0,0,9.04
3,4,your name. (movie),https://www.animenewsnetwork.com//encyclopedia...,"comedy,drama,romance,supernatural,amnesia,body...",1,0,0,0,0,1,1,1,0,0,0,9.02
4,5,Rurouni Kenshin: Trust &amp; Betrayal (OAV),https://www.animenewsnetwork.com//encyclopedia...,"action,drama,romance,historical,revenge,samura...",4,1,0,0,0,0,1,1,0,1,0,8.97


 I have used ```No. of episodes```, ```TV show``` and the rest of the Genre categories as the input features, with ```Rating``` as the output variable.

 Here I have created a function ```reg_func``` to make it easier to implement the machine learning algorithms

In [3]:
X = df[['TV show','Genre: magic', 'Genre: adventure', 'Genre: psychological',
       'Genre: comedy', 'Genre: drama', 'Genre: romance', 'Genre: mystery',
       'Genre: action', 'Genre: fantasy']]
Y = df['Rating']
x_train, x_test,y_train,y_test = train_test_split(X,Y,test_size =0.2)
def reg_func(model):
       model.fit(x_train,y_train)
       y_pred=model.predict(x_test)
       score=mean_squared_error(y_test,y_pred)
       print('Mean Squared Error: '+str(score))


 ## Linear Regression

In [4]:

model = LinearRegression()
reg_func(model)


Mean Squared Error: 0.031068867559503905


 ## TheilSen Regressor

In [5]:
model = TheilSenRegressor()
reg_func(model)

Mean Squared Error: 0.035694138446864684


 ## RANSAC Regressor

In [6]:
model = RANSACRegressor()
reg_func(model)

Mean Squared Error: 0.04463276119848668


 ## Huber Regressor

In [7]:
model = HuberRegressor()
reg_func(model)

Mean Squared Error: 0.033546241402565784


 ## Passive Aggressive Regressor

In [8]:
model=PassiveAggressiveRegressor()
reg_func(model)

Mean Squared Error: 0.03978556004764151


 ## Gaussian Process Regressor

In [9]:
model = GaussianProcessRegressor(normalize_y=True)
reg_func(model)

Mean Squared Error: 0.03658629577489439


 ## Support Vector Machine

In [10]:
model = SVR()
reg_func(model)

Mean Squared Error: 0.03104432923131579


 ## NU Support Vector Regression

In [11]:
model = NuSVR()
reg_func(model)

Mean Squared Error: 0.025143414110868723


 ## KNNeighbours as Regressor

In [12]:
model= KNeighborsRegressor(n_neighbors=5)
reg_func(model)

Mean Squared Error: 0.034004399999999914


 ## Stochastic Gradient Descent

In [13]:
model= SGDRegressor(n_iter_no_change=750)
reg_func(model)

Mean Squared Error: 0.03349339888197998


 ## Kernal Ridge Regression

In [14]:
model= KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
reg_func(model)

Mean Squared Error: 0.03099610902633882


 ## Decision Tree

In [15]:
model=DecisionTreeRegressor(random_state=0)
reg_func(model)

Mean Squared Error: 0.03857571111111104


 ## Random Forest

In [16]:

regressor = RandomForestRegressor(n_estimators = 20, random_state = 0)
reg_func(regressor)


Mean Squared Error: 0.035206520909287865


 ## Extra Trees

In [17]:
model= ExtraTreesRegressor(n_estimators=20,random_state=0)
reg_func(model)

Mean Squared Error: 0.03857571111111129


 ## Bagging Regressor

In [18]:
model= BaggingRegressor(n_estimators=20,random_state=0)
reg_func(model)

Mean Squared Error: 0.034309921357204565


## Boosting Techniques

 ## ADABoost Regressor

In [19]:
model=AdaBoostRegressor(n_estimators=20,random_state=0)
reg_func(model)

Mean Squared Error: 0.035618624394682


 ## XGBoost

In [20]:
xgb_r = xgb.XGBRegressor(n_estimators = 20, random_state= 0,gamma=1,subsample=0.1)
reg_func(xgb_r)


Mean Squared Error: 0.03353634964561554


 ## Gradient Boosting

In [21]:
model=GradientBoostingRegressor(n_estimators = 20, random_state= 0,learning_rate=0.1)
reg_func(model)

Mean Squared Error: 0.033326103822843275


 ## Evaluating Algorithms

 - As shown above ``` Mean Squared Error``` has one of the lowest values in the Boosting techniques and Linear regression models. Hence they are the best working model for this dataset.
 - The highest loss is mainly witnessed in bagging trees since the input data for prediction contains 95% categorical features this results in sparse data reducing the efficiency of these tree based models.